In [405]:
import pandas as pd
import numpy as np
import re

In [406]:
# Load the raw minimum-wage table from disk and preview the first rows.
raw_csv_path = 'Min_Wage_Data.csv'
data = pd.read_csv(raw_csv_path)
data.head()

Unnamed: 0,Year,State,Table_Data,Footnote,High.Value,Low.Value,CPI.Average,High.2018,Low.2018
0,1968,Alabama,...,,0.0,0.0,34.783333,0.0,0.0
1,1968,Alaska,2.10,,2.1,2.1,34.783333,15.12,15.12
2,1968,Arizona,18.72 - 26.40/wk(b),(b),0.66,0.468,34.783333,4.75,3.37
3,1968,Arkansas,1.25/day(b),(b),0.15625,0.15625,34.783333,1.12,1.12
4,1968,California,1.65(b),(b),1.65,1.65,34.783333,11.88,11.88


In [407]:
# Narrow the frame to the four fields used by the rest of the notebook.
wanted_columns = ['Year', 'State', 'High.Value', 'CPI.Average']
data = data[wanted_columns]
data.head()

Unnamed: 0,Year,State,High.Value,CPI.Average
0,1968,Alabama,0.0,34.783333
1,1968,Alaska,2.1,34.783333
2,1968,Arizona,0.66,34.783333
3,1968,Arkansas,0.15625,34.783333
4,1968,California,1.65,34.783333


In [408]:
# List the distinct values of the State column (states, territories and the
# federal row). print() with a single argument runs on both Python 2 and 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(data.State.unique())

['Alabama' 'Alaska' 'Arizona' 'Arkansas' 'California' 'Colorado'
 'Connecticut' 'Delaware' 'District of Columbia' 'Federal (FLSA)' 'Florida'
 'Georgia' 'Guam' 'Hawaii' 'Idaho' 'Illinois' 'Indiana' 'Iowa' 'Kansas'
 'Kentucky' 'Louisiana' 'Maine' 'Maryland' 'Massachusetts' 'Michigan'
 'Minnesota' 'Mississippi' 'Missouri' 'Montana' 'Nebraska' 'Nevada'
 'New Hampshire' 'New Jersey' 'New Mexico' 'New York' 'North Carolina'
 'North Dakota' 'Ohio' 'Oklahoma' 'Oregon' 'Pennsylvania' 'Puerto Rico'
 'Rhode Island' 'South Carolina' 'South Dakota' 'Tennessee' 'Texas'
 'U.S. Virgin Islands' 'Utah' 'Vermont' 'Virginia' 'Washington'
 'West Virginia' 'Wisconsin' 'Wyoming']


In [409]:
# Keep only the rows for 2007 through 2017 (inclusive) and give the value
# columns shorter names.
years = list(range(2007, 2018))

# Vectorized membership test instead of a per-row Python loop; `rows` still
# holds the index labels of the matching rows as a plain list.
rows = data.index[data['Year'].isin(years)].tolist()

# reset_index(drop=True) renumbers the rows from 0 without materializing the
# old index as a column; the rename is by name so it cannot mislabel columns
# if their order ever changes.
wage_data = (
    data.loc[rows]
    .reset_index(drop=True)
    .rename(columns={'High.Value': 'Min_Wage', 'CPI.Average': 'CPI'})
)
wage_data.head()

Unnamed: 0,Year,State,Min_Wage,CPI
0,2007,Alabama,0.0,207.342417
1,2007,Alaska,7.15,207.342417
2,2007,Arizona,6.75,207.342417
3,2007,Arkansas,6.25,207.342417
4,2007,California,7.5,207.342417


In [410]:
# Build a {year: federal minimum wage} lookup from the 'Federal (FLSA)' rows.
# It is used to fill in minimum-wage values that are recorded as zero.
fed_data = wage_data.loc[wage_data['State'] == 'Federal (FLSA)', ['Year', 'Min_Wage']]

# Pair each year with its wage directly instead of indexing row by row.
fed = dict(zip(fed_data['Year'], fed_data['Min_Wage']))

fed

{2007: 5.1500000000000004,
 2008: 5.8499999999999996,
 2009: 6.5499999999999998,
 2010: 7.25,
 2011: 7.25,
 2012: 7.25,
 2013: 7.25,
 2014: 7.25,
 2015: 7.25,
 2016: 7.25,
 2017: 7.25}

In [416]:
# For any minimum wage values that are zero, fill in with the federal value
# for the same year.
#
# The update is one masked .loc assignment. Writing through
# `wage_data.Min_Wage[i] = ...` is chained assignment: pandas may apply it to a
# temporary copy (SettingWithCopyWarning), and it does not update the frame at
# all when Copy-on-Write is enabled.
is_zero = wage_data['Min_Wage'] == 0

if is_zero.any():
    zero_years = wage_data.loc[is_zero, 'Year']

    # Fail loudly, as a direct fed[year] lookup would, rather than letting
    # .map() silently produce NaN for a year that has no federal value.
    unknown_years = sorted(set(zero_years) - set(fed))
    if unknown_years:
        raise KeyError('No federal minimum wage for year(s): %s' % unknown_years)

    wage_data.loc[is_zero, 'Min_Wage'] = zero_years.map(fed)

wage_data.head()

Unnamed: 0,State,Year,Min_Wage,CPI
0,Alabama,2007,5.15,207.342417
1,Alaska,2007,7.15,207.342417
2,Arizona,2007,6.75,207.342417
3,Arkansas,2007,6.25,207.342417
4,California,2007,7.5,207.342417


In [417]:
# Show the (rows, columns) size of wage_data at this point in the notebook.
# NOTE(review): the saved output (561, 4) equals 51 x 11, which looks like the
# size after the territory/federal rows are dropped (55 entities x 11 years
# would be 605 if every entity has one row per year). The cells were probably
# run out of order - confirm with Restart & Run All.
wage_data.shape

(561, 4)

In [418]:
# Drop the data for U.S. territories, and the federal data.
# Filtering with a boolean mask (instead of dropping index labels) keeps this
# cell re-runnable: dropping labels raises KeyError once the rows are gone.
excluded = ['Guam', 'Puerto Rico', 'U.S. Virgin Islands', 'Federal (FLSA)']

# Put 'State' first so the saved column order stays State, Year, Min_Wage, CPI.
state_first = ['State'] + [col for col in wage_data.columns if col != 'State']

wage_data = (
    wage_data.loc[~wage_data['State'].isin(excluded), state_first]
    .reset_index(drop=True)
)

In [420]:
# Preview the first five rows of wage_data (head() defaults to 5 rows).
wage_data.head()

Unnamed: 0,State,Year,Min_Wage,CPI
0,Alabama,2007,5.15,207.342417
1,Alaska,2007,7.15,207.342417
2,Arizona,2007,6.75,207.342417
3,Arkansas,2007,6.25,207.342417
4,California,2007,7.5,207.342417


In [424]:
# Write the cleaned table to 'Clean_Min_Wage_Data.csv' in the working directory.
# NOTE(review): to_csv writes the row index by default, so the file gains an
# unnamed first column; pass index=False if no consumer relies on it - confirm
# against whatever reads this file before changing.
wage_data.to_csv('Clean_Min_Wage_Data.csv')