In [6]:
import pandas as pd
import numpy as np

In [7]:
ue_data = pd.read_csv('CleanDataUnemployment.csv')
ed_data = pd.read_csv('CleanDataEducation.csv')

In [8]:
ue_data = ue_data[['State', 'Area', 'Year', 'UnemploymentRate', 'Region']] 
# Convert state to upper case (Colorado was 'Co')
ue_data['State'] = map(str.upper, ue_data['State'])   
ue_data.head()

Unnamed: 0,State,Area,Year,UnemploymentRate,Region
0,AL,Alabama,2007,4.0,South
1,AL,Alabama,2008,5.7,South
2,AL,Alabama,2009,11.0,South
3,AL,Alabama,2010,10.5,South
4,AL,Alabama,2011,9.6,South


In [9]:
ed_data.head()
   
# Replace value that was misspelled
ed_data.Area[1138]='Louisiana'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [10]:
ue_data.shape

(35057, 5)

In [11]:
ed_data.shape

(3203, 7)

In [12]:
# Merge the dataframes
clean_df = pd.merge(ue_data, ed_data, on=['State', 'Area']).drop('Unnamed: 0', axis=1)
clean_df.head()

Unnamed: 0,State,Area,Year,UnemploymentRate,Region,LHS,HS,SC,BD
0,AL,Alabama,2007,4.0,South,15.2,31.0,29.8,24.0
1,AL,Alabama,2008,5.7,South,15.2,31.0,29.8,24.0
2,AL,Alabama,2009,11.0,South,15.2,31.0,29.8,24.0
3,AL,Alabama,2010,10.5,South,15.2,31.0,29.8,24.0
4,AL,Alabama,2011,9.6,South,15.2,31.0,29.8,24.0


In [13]:
# Education data is only for 2012 - 2016. Insert nan for all other years
years = ['2012', '2013', '2014', '2015', '2016']
lhs = []
hs = []
sc = []
bd = []
for i in clean_df.index:
    if str(clean_df.Year[i]) in years:
        lhs.append(clean_df.LHS[i])
        hs.append(clean_df.HS[i])
        sc.append(clean_df.SC[i])
        bd.append(clean_df.BD[i])
    else:
        lhs.append(np.nan)
        hs.append(np.nan)
        sc.append(np.nan)
        bd.append(np.nan)


In [14]:
clean_df.LHS = pd.Series(lhs)
clean_df.HS = pd.Series(hs)
clean_df.SC = pd.Series(sc)
clean_df.BD = pd.Series(bd)

In [15]:
# Calculate the change in unemployment rate for each year, and add a column with this data
rate_change = []
for i in clean_df.index:
    if str(clean_df.Year[i]) == '2007':
        rate_change.append(np.nan)
    elif str(clean_df.Year[i]) != '2007':
        change = clean_df.UnemploymentRate[i] - clean_df.UnemploymentRate[i-1]
        rate_change.append(change)

rate_change = pd.Series(rate_change)

clean_df['Change'] = rate_change.values

In [16]:
# Change the order of the columns
clean_df = clean_df[['State', 'Area', 'Year', 'UnemploymentRate', 'Change', 'Region', 'LHS', 'HS', 'SC', 'BD']]

In [17]:
print clean_df.shape
clean_df.head()

(34947, 10)


Unnamed: 0,State,Area,Year,UnemploymentRate,Change,Region,LHS,HS,SC,BD
0,AL,Alabama,2007,4.0,,South,,,,
1,AL,Alabama,2008,5.7,1.7,South,,,,
2,AL,Alabama,2009,11.0,5.3,South,,,,
3,AL,Alabama,2010,10.5,-0.5,South,,,,
4,AL,Alabama,2011,9.6,-0.9,South,,,,


In [18]:
clean_df.to_csv('CleanData.csv')

**Read in the file with the population and migration data and merge it with the existing dataframe.**

In [19]:
pop_data = pd.read_csv('Pop_Mig_Data.csv').drop('Unnamed: 0', axis=1)
print pop_data.shape
pop_data.head()

(22393, 6)


Unnamed: 0,State,Area,Year,Pop_Est,Int_Mig_Rate,Dom_Mig_Rate
0,AL,Alabama,2011,4798649,0.9,-0.5
1,AL,Alabama,2012,4813946,0.9,0.0
2,AL,Alabama,2013,4827660,0.8,0.5
3,AL,Alabama,2014,4840037,0.8,-0.2
4,AL,Alabama,2015,4850858,1.0,-0.3


In [20]:
clean_data2 = pd.merge(clean_df, pop_data, on=['State', 'Area', 'Year'], how='left')
clean_data2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35143 entries, 0 to 35142
Data columns (total 13 columns):
State               35143 non-null object
Area                35143 non-null object
Year                35143 non-null int64
UnemploymentRate    35143 non-null float64
Change              31966 non-null float64
Region              35132 non-null object
LHS                 16025 non-null float64
HS                  16025 non-null float64
SC                  16025 non-null float64
BD                  16025 non-null float64
Pop_Est             22421 non-null float64
Int_Mig_Rate        22421 non-null float64
Dom_Mig_Rate        22421 non-null float64
dtypes: float64(9), int64(1), object(3)
memory usage: 3.8+ MB


In [21]:
income_data = pd.read_csv('StateIncomeData.csv').drop('Unnamed: 0', axis=1).dropna()
income_data.Year = income_data.Year.astype(int)

In [26]:
print income_data.shape
print clean_data2.shape
income_data.info()

for i in income_data.index:
    if income_data.Area[i]=='D.C.':
        income_data['Area'][i]='District of Columbia'

(561, 3)
(35143, 13)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 561 entries, 0 to 560
Data columns (total 3 columns):
Area            561 non-null object
Year            561 non-null int64
MedianIncome    561 non-null int64
dtypes: int64(2), object(1)
memory usage: 37.5+ KB


In [27]:
data2 = clean_data2.merge(income_data, on=['Area', 'Year'], how='left').reset_index()
data2.head()

Unnamed: 0,index,State,Area,Year,UnemploymentRate,Change,Region,LHS,HS,SC,BD,Pop_Est,Int_Mig_Rate,Dom_Mig_Rate,MedianIncome
0,0,AL,Alabama,2007,4.0,,South,,,,,,,,42212.0
1,1,AL,Alabama,2008,5.7,1.7,South,,,,,,,,44476.0
2,2,AL,Alabama,2009,11.0,5.3,South,,,,,,,,39980.0
3,3,AL,Alabama,2010,10.5,-0.5,South,,,,,,,,40933.0
4,4,AL,Alabama,2011,9.6,-0.9,South,,,,,4798649.0,0.9,-0.5,42590.0


In [28]:
data2.to_csv('CleanData2.csv')

In [29]:
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35143 entries, 0 to 35142
Data columns (total 15 columns):
index               35143 non-null int64
State               35143 non-null object
Area                35143 non-null object
Year                35143 non-null int64
UnemploymentRate    35143 non-null float64
Change              31966 non-null float64
Region              35132 non-null object
LHS                 16025 non-null float64
HS                  16025 non-null float64
SC                  16025 non-null float64
BD                  16025 non-null float64
Pop_Est             22421 non-null float64
Int_Mig_Rate        22421 non-null float64
Dom_Mig_Rate        22421 non-null float64
MedianIncome        790 non-null float64
dtypes: float64(10), int64(2), object(3)
memory usage: 4.0+ MB


In [36]:
wage_data = pd.read_csv('Clean_Min_Wage_Data.csv').drop('Unnamed: 0', axis=1)
wage_data.head()

Unnamed: 0,State,Year,Min_Wage,CPI
0,Alabama,2007,5.15,207.342417
1,Alaska,2007,7.15,207.342417
2,Arizona,2007,6.75,207.342417
3,Arkansas,2007,6.25,207.342417
4,California,2007,7.5,207.342417


In [82]:
states = data2.State.unique()
areas = wage_data.State.unique()
s_df = pd.DataFrame()
s_df['State'] = states
s_df['Area'] = areas

state_dict = {}
for i in s_df.index:
    state_dict[s_df.Area[i]] = s_df.State[i] 

s_list = []
for i in wage_data.index:
    s = wage_data.State[i]
    s_list.append(state_dict[s])

wage_data['State'] = s_list
wage_data.head()

Unnamed: 0,State,Year,Min_Wage,CPI
0,AL,2007,5.15,207.342417
1,AK,2007,7.15,207.342417
2,AZ,2007,6.75,207.342417
3,AR,2007,6.25,207.342417
4,CA,2007,7.5,207.342417


In [76]:
previous_data = pd.read_csv('FinalData.csv').drop('Unnamed: 0', axis=1)
previous_data.head()

Unnamed: 0,State,Area,Year,UnemploymentRate,Change,Region,LHS,HS,SC,BD,Pop_Est,MedianIncome,RegionCode,StateCode,AreaCode
0,AL,Alabama,2007,4.0,0.0,South,15.2,31.0,29.8,24.0,4737752,42212.0,2,1,9
1,AL,Alabama,2008,5.7,1.7,South,15.2,31.0,29.8,24.0,4752904,44476.0,2,1,9
2,AL,Alabama,2009,11.0,5.3,South,15.2,31.0,29.8,24.0,4768103,39980.0,2,1,9
3,AL,Alabama,2010,10.5,-0.5,South,15.2,31.0,29.8,24.0,4783352,40933.0,2,1,9
4,AL,Alabama,2011,9.6,-0.9,South,15.2,31.0,29.8,24.0,4798649,42590.0,2,1,9


In [81]:
new_data = previous_data.merge(wage_data, on=['State', 'Year'], how='left')
new_data.head()

Unnamed: 0,State,Area,Year,UnemploymentRate,Change,Region,LHS,HS,SC,BD,Pop_Est,MedianIncome,RegionCode,StateCode,AreaCode,Min_Wage,CPI
0,AL,Alabama,2007,4.0,0.0,South,15.2,31.0,29.8,24.0,4737752,42212.0,2,1,9,5.15,207.342417
1,AL,Alabama,2008,5.7,1.7,South,15.2,31.0,29.8,24.0,4752904,44476.0,2,1,9,5.85,215.3025
2,AL,Alabama,2009,11.0,5.3,South,15.2,31.0,29.8,24.0,4768103,39980.0,2,1,9,6.55,214.537
3,AL,Alabama,2010,10.5,-0.5,South,15.2,31.0,29.8,24.0,4783352,40933.0,2,1,9,7.25,218.0555
4,AL,Alabama,2011,9.6,-0.9,South,15.2,31.0,29.8,24.0,4798649,42590.0,2,1,9,7.25,224.939167


In [83]:
new_data.to_csv('FinalData2.csv')