In [1]:
#Education.xls file is the first data set
import pandas as pd
file_name='All excel sheets\Education.xls'
xl = pd.ExcelFile(file_name)
print(xl.sheet_names)

['Education 1970 to 2016']


In [3]:
# The first three lines are comments
df= xl.parse(0,header=4)
#Columns 1, 2, and 5 are state, area name(US, State, County), and the rural-urban continuum codes, respectively
df1=df.iloc[:,[1,2,5]]
# The last four columns are the percentages of adults with less than high school diploma, high school diploma, some college
# or associate's degree, and Bachelor's or higher degrees
df2=df.iloc[:,-4:]

df_concat=pd.concat([df1,df2],axis='columns')
df_concat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3283 entries, 0 to 3282
Data columns (total 7 columns):
State                                                                         3283 non-null object
Area name                                                                     3283 non-null object
2013 Rural-urban Continuum Code                                               3221 non-null float64
Percent of adults with less than a high school diploma, 2012-2016             3273 non-null float64
Percent of adults with a high school diploma only, 2012-2016                  3273 non-null float64
Percent of adults completing some college or associate's degree, 2012-2016    3273 non-null float64
Percent of adults with a bachelor's degree or higher, 2012-2016               3273 non-null float64
dtypes: float64(5), object(2)
memory usage: 205.2+ KB


In [3]:
# The info above shows only "2013 Rural-urban Continuum Code" has null objects. I check on the rows with null values
df_concat.loc[df_concat.isnull().any(axis=1),['State','Area name']]

Unnamed: 0,State,Area name
0,US,United States
1,AL,Alabama
69,AK,Alaska
70,AK,Aleutian Islands
86,AK,Kuskokwim Division
94,AK,Prince of Wales-Outer Ketchikan Census Area
97,AK,Skagway-Yakutat-Angoon Census Area
98,AK,Skagway-Hoonah-Angoon Census Area
100,AK,Upper Yukon Division
103,AK,Wrangell-Petersburg Census Area


In [4]:
# As it can be seen, missing data is from either US and States, or some counties from Alaska and Puerto Rico. I was going to
# remove data from US and States anyway since they are useless for my project. In addition, eliminating around 10 counties from 
# more than 3200 counties means loosing around %0.3 of data set which seems fine to me
df_education=df_concat.dropna()
# Columns are renamed for convention
df_education=df_education.rename(columns={'State':'state','Area name':'county','2013 Rural-urban Continuum Code':'rural_urban_continuum_code',
                             'Percent of adults with less than a high school diploma, 2012-2016':'less_than_high_school_percentage_adults_only',
                            'Percent of adults with a high school diploma only, 2012-2016':'high_school_diploma_percentage_adults_only',
                            'Percent of adults completing some college or associate\'s degree, 2012-2016':'some_college_or_associate_degree_percentage',
                            'Percent of adults with a bachelor\'s degree or higher, 2012-2016':'bachelors_or_higher_percentage'})

df_education.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3220 entries, 2 to 3282
Data columns (total 7 columns):
state                                           3220 non-null object
county                                          3220 non-null object
rural_urban_continuum_code                      3220 non-null float64
less_than_high_school_percentage_adults_only    3220 non-null float64
high_school_diploma_percentage_adults_only      3220 non-null float64
some_college_or_associate_degree_percentage     3220 non-null float64
bachelors_or_higher_percentage                  3220 non-null float64
dtypes: float64(5), object(2)
memory usage: 201.2+ KB


In [5]:
# check if there is any rural-urban continuum code except integers from 1-9
print('Rural-urban Continuum Codes')
print(df_education.rural_urban_continuum_code.unique())
# check if the sum of the percentages is not close to 100 for any row
total=df_education.iloc[:,-4:].sum(axis=1)
print('\nRows with Total Percentage Not Close to 100')
print(total[(total<99.5) | (total>100.5)])
# check if there is more than one row for each county
state_county=df_education.state+'-'+df_education.county
print('\nTop Five Counties with Most Counts')
print(state_county.value_counts().head())
# If the top ones have only one count, it means there is no duplicate row for any county

Rural-urban Continuum Codes
[2. 3. 6. 1. 9. 7. 8. 4. 5.]

Rows with Total Percentage Not Close to 100
Series([], dtype: float64)

Top Five Counties with Most Counts
TX-Sterling County     1
MI-Menominee County    1
CO-Mesa County         1
OH-Williams County     1
CO-Montezuma County    1
dtype: int64


In [6]:
# read the poverty file
file_name='All excel sheets\PovertyEstimates.xls'
xl = pd.ExcelFile(file_name)
print(xl.sheet_names)

['Poverty Data 2016', 'Variable Descriptions']


In [7]:
# read the first sheet, collect three columns and rename them
df_poverty=xl.parse(0,header=2)
df_poverty=df_poverty.loc[:,['State','Area_Name','PCTPOVALL_2016']].rename(columns={'State':'state','Area_Name':'county',
                                                                                    'PCTPOVALL_2016':'poverty_percentage'})
df_poverty.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3194 entries, 0 to 3193
Data columns (total 3 columns):
state                 3194 non-null object
county                3194 non-null object
poverty_percentage    3193 non-null float64
dtypes: float64(1), object(2)
memory usage: 99.8+ KB


In [8]:
# There is only one null cell, so we could just remove it
df_poverty.dropna(inplace=True)
# check if there is any percentage above 100
print('Rows with Percentage more than 100')
print(df_poverty[df_poverty.poverty_percentage>100])
# check if there are duplicate rows for any county
state_county=df_poverty.state+'-'+df_poverty.county
print('\nTop Five Counties with Most Counts')
print(state_county.value_counts().head())
print(type(state_county))

Rows with Percentage more than 100
Empty DataFrame
Columns: [state, county, poverty_percentage]
Index: []

Top Five Counties with Most Counts
DC-District of Columbia    2
TX-Sterling County         1
CO-Montezuma County        1
TN-Washington County       1
DE-Delaware                1
dtype: int64
<class 'pandas.core.series.Series'>


In [9]:
# It shows there are two rows for District of Columbia
print('Duplicate Rows')
print(df_poverty[df_poverty.county=='District of Columbia'])
# drop the duplicate
df_poverty.drop_duplicates(inplace=True)

Duplicate Rows
    state                county  poverty_percentage
328    DC  District of Columbia                19.0
329    DC  District of Columbia                19.0


In [10]:
# Merge education and poverty dataframes
df_merge_education_poverty=df_education.merge(df_poverty,on=['state','county'])
df_merge_education_poverty.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3131 entries, 0 to 3130
Data columns (total 8 columns):
state                                           3131 non-null object
county                                          3131 non-null object
rural_urban_continuum_code                      3131 non-null float64
less_than_high_school_percentage_adults_only    3131 non-null float64
high_school_diploma_percentage_adults_only      3131 non-null float64
some_college_or_associate_degree_percentage     3131 non-null float64
bachelors_or_higher_percentage                  3131 non-null float64
poverty_percentage                              3131 non-null float64
dtypes: float64(6), object(2)
memory usage: 220.1+ KB


In [11]:
#read the population file
file_name='All excel sheets\PopulationEstimates.xls'
xl=pd.ExcelFile(file_name)
print(xl.sheet_names)

['Population Estimates 2010-2016', 'Variable Descriptions']


In [12]:
# read the first sheet, collect three columns and rename them
df_population=xl.parse(0)
df_population=df_population[['State','Area_Name','POP_ESTIMATE_2016']].rename(columns={'State':'state','Area_Name':'county',
                                                                                       'POP_ESTIMATE_2016'
                                                                                      :'population_estimate'})
print(df_population.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3273 entries, 0 to 3272
Data columns (total 3 columns):
state                  3273 non-null object
county                 3273 non-null object
population_estimate    3273 non-null int64
dtypes: int64(1), object(2)
memory usage: 76.8+ KB
None


In [13]:
# There is no null value in the population table. 
# check if there is any duplicate
state_county=df_population.state+'-'+df_population.county
print('\nTop Five Counties with Most Counts')
print(state_county.value_counts().head())


Top Five Counties with Most Counts
DC-District of Columbia    2
AR-Ouachita County         1
PA-Wyoming County          1
VA-Carroll County          1
FL-Florida                 1
dtype: int64


In [14]:
# Again it shows there are two rows for District of Columbia
print('Duplicate Rows')
print(df_population[df_population.county=='District of Columbia'])
# drop the duplicate
df_population.drop_duplicates(inplace=True)

Duplicate Rows
    state                county  population_estimate
328    DC  District of Columbia               681170
329    DC  District of Columbia               681170


In [15]:
#Merge all tables together
df_merge_education_poverty_population=df_merge_education_poverty.merge(df_population,on=['state','county'])
df_merge_education_poverty_population.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3128 entries, 0 to 3127
Data columns (total 9 columns):
state                                           3128 non-null object
county                                          3128 non-null object
rural_urban_continuum_code                      3128 non-null float64
less_than_high_school_percentage_adults_only    3128 non-null float64
high_school_diploma_percentage_adults_only      3128 non-null float64
some_college_or_associate_degree_percentage     3128 non-null float64
bachelors_or_higher_percentage                  3128 non-null float64
poverty_percentage                              3128 non-null float64
population_estimate                             3128 non-null int64
dtypes: float64(6), int64(1), object(2)
memory usage: 244.4+ KB


In [39]:
# read the Unemployment sheet
file_name=r'All excel sheets\Unemployment.xls'
xl=pd.ExcelFile(file_name)
print(xl.sheet_names)

['Unemployment Med HH Inc', 'Variable Descriptions']


In [40]:
df_unemployment=xl.parse(0)
df_unemployment=df_unemployment[['State','Area_name','Unemployment_rate_2016']].rename(columns={'State':'state','Area_name':'county',
                                                                                       'Unemployment_rate_2016'
                                                                                      :'unemployment_rate'})
print(df_unemployment.info())
df_unemployment.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3274 entries, 0 to 3273
Data columns (total 3 columns):
state                3274 non-null object
county               3274 non-null object
unemployment_rate    3271 non-null float64
dtypes: float64(1), object(2)
memory usage: 76.8+ KB
None


Unnamed: 0,state,county,unemployment_rate
0,AL,Alabama,6.0
1,AL,"Autauga County, AL",5.3
2,AL,"Baldwin County, AL",5.4
3,AL,"Barbour County, AL",8.6
4,AL,"Bibb County, AL",6.6


In [41]:
# There are two problems: there three null unemployment rates, and also the county names include the state name
df_unemployment.dropna(inplace=True)
#Remove the state name from county name
df_unemployment.county=df_unemployment.county.map(lambda x: x[:len(x)-4])
#Check if there is any duplicate
state_county=df_unemployment.state+'-'+df_unemployment.county
print('\nTop Five Counties with Most Counts')
print(state_county.value_counts().head())


Top Five Counties with Most Counts
DC-District of Colu     2
NJ-Essex County         1
OH-Morrow County        1
NC-Warren County        1
NY-Montgomery County    1
dtype: int64


In [42]:
#Remove the duplicate and rename the District of Columbia
df_unemployment.drop_duplicates(inplace=True)
df_unemployment.loc[df_unemployment['county']=='District of Colu','county']='District of Columbia'

In [47]:
#Merge all tables together
df_merge_all=df_merge_education_poverty_population.merge(df_unemployment,on=['state','county'])
df_merge_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3121 entries, 0 to 3120
Data columns (total 10 columns):
state                                           3121 non-null object
county                                          3121 non-null object
rural_urban_continuum_code                      3121 non-null float64
less_than_high_school_percentage_adults_only    3121 non-null float64
high_school_diploma_percentage_adults_only      3121 non-null float64
some_college_or_associate_degree_percentage     3121 non-null float64
bachelors_or_higher_percentage                  3121 non-null float64
poverty_percentage                              3121 non-null float64
population_estimate                             3121 non-null int64
unemployment_rate                               3121 non-null float64
dtypes: float64(7), int64(1), object(2)
memory usage: 268.2+ KB


In [50]:
df_merge_all.to_csv('wrangled_data.csv',index=False)