In [141]:
# The libraries emit many warnings about changes in future releases; they are silenced here.
# NOTE(review): this blanket filter suppresses every warning raised by any later cell as well.
import warnings
warnings.filterwarnings("ignore")

# Education.xls is the first data set
import pandas as pd
# Raw string literal: '\E' is not a valid escape sequence, so a plain literal triggers an
# invalid-escape warning on recent Python versions. The resulting path is unchanged.
file_name = r'All excel sheets\Education.xls'
xl = pd.ExcelFile(file_name)
print(xl.sheet_names)

['Education 1970 to 2016']


In [142]:
# Parse the first sheet. header=4 takes the column labels from the row at 0-based position 4,
# so the descriptive rows above it are skipped.
df = xl.parse(0, header=4)

# Positional columns 1, 2 and 5 hold the state, the area name (US, state or county) and the
# rural-urban continuum code, respectively
df1 = df.iloc[:, [1, 2, 5]]

# The last four columns hold the percentages of adults with less than a high school diploma, a high school
# diploma only, some college or an associate's degree, and a bachelor's degree or higher
df2 = df.iloc[:, -4:]

# Place the two column groups side by side
df_concat = pd.concat([df1, df2], axis=1)

# Puerto Rico is excluded: this project examines the 50 states and the District of Columbia
df_concat = df_concat[df_concat['State'] != 'PR']

# Display the first five rows
df_concat.head()

Unnamed: 0,State,Area name,2013 Rural-urban Continuum Code,"Percent of adults with less than a high school diploma, 2012-2016","Percent of adults with a high school diploma only, 2012-2016","Percent of adults completing some college or associate's degree, 2012-2016","Percent of adults with a bachelor's degree or higher, 2012-2016"
0,US,United States,,13.021,27.531,29.133,30.315
1,AL,Alabama,,15.209,30.956,29.825,24.01
2,AL,Autauga County,2.0,12.417,34.331,28.66,24.593
3,AL,Baldwin County,3.0,9.972,28.692,31.788,29.547
4,AL,Barbour County,6.0,26.236,34.927,25.969,12.868


In [143]:
# The project focuses on counties, so the rows describing the whole United States and each of the 50 states
# must be removed. Start by building the list of area names to drop.
file_name = r'All excel sheets\us_states.csv'
states_df = pd.read_csv(file_name, header=None)

# The file is read without a header row, so the second column carries the integer label 1
states = states_df[1].tolist()
print('number of states are ', len(states))  # sanity check: the list should contain all 50 states

remove_list = ['United States', *states]
remove_list[:5]

number of states are  50


['United States', 'Alabama', 'Alaska', 'Arizona', 'Arkansas']

In [144]:
# Keep only the rows whose area name is neither 'United States' nor one of the 50 state names
is_aggregate_row = df_concat['Area name'].isin(remove_list)
df_concat = df_concat[~is_aggregate_row]
df_concat.head()

Unnamed: 0,State,Area name,2013 Rural-urban Continuum Code,"Percent of adults with less than a high school diploma, 2012-2016","Percent of adults with a high school diploma only, 2012-2016","Percent of adults completing some college or associate's degree, 2012-2016","Percent of adults with a bachelor's degree or higher, 2012-2016"
2,AL,Autauga County,2.0,12.417,34.331,28.66,24.593
3,AL,Baldwin County,3.0,9.972,28.692,31.788,29.547
4,AL,Barbour County,6.0,26.236,34.927,25.969,12.868
5,AL,Bibb County,1.0,19.302,41.816,26.883,12.0
6,AL,Blount County,1.0,19.969,32.942,34.039,13.05


In [145]:
# Print the dataframe summary (row count, dtypes and non-null count per column) to spot missing values
df_concat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3154 entries, 2 to 3203
Data columns (total 7 columns):
State                                                                         3154 non-null object
Area name                                                                     3154 non-null object
2013 Rural-urban Continuum Code                                               3143 non-null float64
Percent of adults with less than a high school diploma, 2012-2016             3144 non-null float64
Percent of adults with a high school diploma only, 2012-2016                  3144 non-null float64
Percent of adults completing some college or associate's degree, 2012-2016    3144 non-null float64
Percent of adults with a bachelor's degree or higher, 2012-2016               3144 non-null float64
dtypes: float64(5), object(2)
memory usage: 197.1+ KB


In [146]:
# Display every row that has at least one missing value so that each case can be investigated
has_missing_value = df_concat.isnull().any(axis=1)
df_concat[has_missing_value]

Unnamed: 0,State,Area name,2013 Rural-urban Continuum Code,"Percent of adults with less than a high school diploma, 2012-2016","Percent of adults with a high school diploma only, 2012-2016","Percent of adults completing some college or associate's degree, 2012-2016","Percent of adults with a bachelor's degree or higher, 2012-2016"
70,AK,Aleutian Islands,,,,,
86,AK,Kuskokwim Division,,,,,
94,AK,Prince of Wales-Outer Ketchikan Census Area,,,,,
97,AK,Skagway-Yakutat-Angoon Census Area,,,,,
98,AK,Skagway-Hoonah-Angoon Census Area,,,,,
100,AK,Upper Yukon Division,,,,,
103,AK,Wrangell-Petersburg Census Area,,,,,
335,DC,District of Columbia,,10.045,18.006,16.588,55.36
1139,LA,Lousiana,,16.184,33.875,26.983,22.958
1689,MT,Yellowstone National Park,,,,,


In [147]:
# Findings from the rows with missing values:
# - 'Lousiana' is the state-level row for Louisiana. It was not removed when the states were filtered out
#   because of the misspelling, so it must be removed.
# - Eight rows (seven in Alaska plus Yellowstone National Park in Montana) have no education data at all.
#   They must be removed since they are of no use in the project.
# - District of Columbia is missing only the rural-urban continuum code. Since District of Columbia is both
#   a district and a county, check whether there is more than one row for it.
# The mask is built from df_concat itself rather than from the unfiltered df, so that the mask and the frame
# it filters share the same index.
df_concat[df_concat['State'] == 'DC']

Unnamed: 0,State,Area name,2013 Rural-urban Continuum Code,"Percent of adults with less than a high school diploma, 2012-2016","Percent of adults with a high school diploma only, 2012-2016","Percent of adults completing some college or associate's degree, 2012-2016","Percent of adults with a bachelor's degree or higher, 2012-2016"
335,DC,District of Columbia,,10.045,18.006,16.588,55.36
336,DC,District of Columbia,1.0,10.045,18.006,16.588,55.36


In [148]:
# District of Columbia has two rows, so the one without a rural-urban continuum code can go. It is dropped here
# together with every other row that has at least one missing value.
df_education = df_concat.dropna(axis=0, how='any')

# The summary shows how many rows remain
df_education.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3142 entries, 2 to 3203
Data columns (total 7 columns):
State                                                                         3142 non-null object
Area name                                                                     3142 non-null object
2013 Rural-urban Continuum Code                                               3142 non-null float64
Percent of adults with less than a high school diploma, 2012-2016             3142 non-null float64
Percent of adults with a high school diploma only, 2012-2016                  3142 non-null float64
Percent of adults completing some college or associate's degree, 2012-2016    3142 non-null float64
Percent of adults with a bachelor's degree or higher, 2012-2016               3142 non-null float64
dtypes: float64(5), object(2)
memory usage: 196.4+ KB


In [149]:
# Map the long source headers to short, code-friendly column names
education_column_names = {
    'State': 'state',
    'Area name': 'county',
    '2013 Rural-urban Continuum Code': 'RUCC',
    'Percent of adults with less than a high school diploma, 2012-2016': 'less_than_high_school',
    'Percent of adults with a high school diploma only, 2012-2016': 'high_school_diploma',
    "Percent of adults completing some college or associate's degree, 2012-2016": 'college/associate_degree',
    "Percent of adults with a bachelor's degree or higher, 2012-2016": 'bachelors/higher',
}
df_education = df_education.rename(columns=education_column_names)

# Display the first five rows
df_education.head()

Unnamed: 0,state,county,RUCC,less_than_high_school,high_school_diploma,college/associate_degree,bachelors/higher
2,AL,Autauga County,2.0,12.417,34.331,28.66,24.593
3,AL,Baldwin County,3.0,9.972,28.692,31.788,29.547
4,AL,Barbour County,6.0,26.236,34.927,25.969,12.868
5,AL,Bibb County,1.0,19.302,41.816,26.883,12.0
6,AL,Blount County,1.0,19.969,32.942,34.039,13.05


In [150]:
# The rural-urban continuum codes (RUCC) must be integers from 1-9. All the unique values are printed to make sure
# they meet the expectation
print(' All rural-urban continuum code values are\n', df_education.RUCC.unique())

# The last four columns are the education percentages
education_pcts = df_education.iloc[:, -4:]

# Count the ROWS that have at least one percentage outside [0, 100]. Reducing with any(axis=1) counts each
# row once, even when several of its columns are out of range.
out_of_range = (education_pcts > 100) | (education_pcts < 0)
print('\nNumber of rows with at least one education percentage less than 0 or more than 100 is ',
      out_of_range.any(axis=1).sum())

# Check that the four percentages add up to (nearly) 100 in every row
# NOTE(review): only the lower bound is checked; sums noticeably above 100 would go unreported.
total = education_pcts.sum(axis=1)
print('\nNumber of rows with the sum of education percentages less than 99.9 is ',
      total[total < 99.9].shape[0])

# Check whether any county appears in more than one row
state_county = df_education.state + '-' + df_education.county
print('\nTop five counties with the most counts')
print(state_county.value_counts().head())
# If the top entries have a count of one, no county has a duplicate row

 All rural-urban continuum code values are
 [ 2.  3.  6.  1.  9.  7.  8.  4.  5.]

Number of rows with at least one education percentage less than 0 or more than 100 is  0

Number of rows with the sum of education percentages less than 99.9 is  0

Top five counties with the most counts
FL-Gadsden County    1
IN-Clark County      1
WI-Vilas County      1
TN-Marion County     1
MT-Wibaux County     1
dtype: int64


In [151]:
# Open the poverty workbook and list its sheets.
# Raw string literal: '\P' is not a valid escape sequence, so a plain literal triggers an invalid-escape
# warning on recent Python versions. The resulting path is unchanged.
file_name = r'All excel sheets\PovertyEstimates.xls'
xl = pd.ExcelFile(file_name)
print(xl.sheet_names)

['Poverty Data 2016', 'Variable Descriptions']


In [152]:
# Parse the first sheet (column labels are on the row at 0-based position 2), keep the three columns of
# interest and give them the same short names used for the education data
poverty_source_columns = ['State', 'Area_Name', 'PCTPOVALL_2016']
df_poverty = xl.parse(0, header=2)[poverty_source_columns]
df_poverty = df_poverty.rename(columns={'State': 'state',
                                        'Area_Name': 'county',
                                        'PCTPOVALL_2016': 'poverty'})

# Display the first five rows
df_poverty.head()

Unnamed: 0,state,county,poverty
0,US,United States,14.0
1,AL,Alabama,17.0
2,AL,Autauga County,14.0
3,AL,Baldwin County,12.0
4,AL,Barbour County,30.0


In [153]:
# Drop the rows for the United States and the 50 states, and the rows that belong to Puerto Rico
poverty_is_aggregate = df_poverty['county'].isin(remove_list)
poverty_is_puerto_rico = df_poverty['state'] == 'PR'
df_poverty = df_poverty[~poverty_is_aggregate & ~poverty_is_puerto_rico]

# Display the first five rows
df_poverty.head()

Unnamed: 0,state,county,poverty
2,AL,Autauga County,14.0
3,AL,Baldwin County,12.0
4,AL,Barbour County,30.0
5,AL,Bibb County,20.0
6,AL,Blount County,14.0


In [154]:
# Print the dataframe summary (row count, dtypes and non-null count per column) to spot missing values
df_poverty.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3143 entries, 2 to 3193
Data columns (total 3 columns):
state      3143 non-null object
county     3143 non-null object
poverty    3142 non-null float64
dtypes: float64(1), object(2)
memory usage: 98.2+ KB


In [155]:
# Display the row(s) in which the poverty rate (or any other field) is missing
poverty_has_missing = df_poverty.isnull().any(axis=1)
df_poverty[poverty_has_missing]

Unnamed: 0,state,county,poverty
561,HI,Kalawao County,


In [156]:
# There is only one null cell in df_poverty, so that row is simply removed.
# The result is reassigned instead of using inplace=True: df_poverty was produced by boolean filtering, and
# mutating such a slice in place makes pandas raise a SettingWithCopyWarning.
df_poverty = df_poverty.dropna()

# Check whether any poverty percentage is less than 0 or more than 100
poverty_out_of_range = (df_poverty['poverty'] > 100) | (df_poverty['poverty'] < 0)
print('Number of rows with the poverty percentage less than 0 or more than 100 is ',
      df_poverty[poverty_out_of_range].shape[0])

# Check whether any county appears in more than one row
state_county = df_poverty.state + '-' + df_poverty.county
print('\nTop five counties with the most counts')
print(state_county.value_counts().head())

Number of rows with the poverty percentage less than 0 or more than 100 is  0

Top five counties with the most counts
DC-District of Columbia    2
FL-Gadsden County          1
KY-Garrard County          1
TN-Marion County           1
MT-Wibaux County           1
dtype: int64


In [157]:
# Display the duplicate rows (District of Columbia)
df_poverty[df_poverty['state'] == 'DC']

Unnamed: 0,state,county,poverty
328,DC,District of Columbia,19.0
329,DC,District of Columbia,19.0


In [158]:
# Remove rows that are exact duplicates of an earlier row (same state, county and poverty rate).
# The result is reassigned rather than mutated in place, so df_poverty is never modified through a slice.
df_poverty = df_poverty.drop_duplicates()

# Print the summary of df_poverty again to confirm the new row count
df_poverty.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3141 entries, 2 to 3193
Data columns (total 3 columns):
state      3141 non-null object
county     3141 non-null object
poverty    3141 non-null float64
dtypes: float64(1), object(2)
memory usage: 98.2+ KB


In [159]:
# District of Columbia appeared twice in the poverty data. Dropping exact duplicate rows here guarantees the
# merge keys are unique; it is a no-op if the duplicates were already removed.
# The result is reassigned rather than mutated in place.
df_poverty = df_poverty.drop_duplicates()

# Merge the education and poverty dataframes on state and county (inner join: only matching keys are kept)
df_merge_education_poverty = df_education.merge(df_poverty, on=['state', 'county'], how='inner')

# Print the summary of the merged dataframe
df_merge_education_poverty.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3131 entries, 0 to 3130
Data columns (total 8 columns):
state                       3131 non-null object
county                      3131 non-null object
RUCC                        3131 non-null float64
less_than_high_school       3131 non-null float64
high_school_diploma         3131 non-null float64
college/associate_degree    3131 non-null float64
bachelors/higher            3131 non-null float64
poverty                     3131 non-null float64
dtypes: float64(6), object(2)
memory usage: 220.1+ KB


In [160]:
# df_poverty has 3141 rows and df_education has 3142 rows, yet the merge lost 10 rows. Compare the keys of
# the two dataframes to find out whether the loss comes from misspellings or stray spaces.

# Build the 'state-county' key of every row in each dataframe
education = (df_education['state'] + '-' + df_education['county']).tolist()
poverty = (df_poverty['state'] + '-' + df_poverty['county']).tolist()
education_keys, poverty_keys = set(education), set(poverty)

# Keys present in the education dataframe but absent from the poverty dataframe
print('Rows which exist in education dataframe but do not exist in poverty dataframe: \n')
print(education_keys - poverty_keys)

# Keys present in the poverty dataframe but absent from the education dataframe
print('\nRows which exist in poverty dataframe but do not exist in education dataframe: \n')
print(poverty_keys - education_keys)

Rows which exist in education dataframe but do not exist in poverty dataframe: 

{'AK-Anchorage Municipality', 'NM-De Baca County', 'IN-LaGrange County', 'HI-Kalawao County', 'AK-Juneau City and Borough', 'AK-Sitka City and Borough', 'AK-Yakutat City and Borough', 'IN-DeKalb County', 'PA-McKean County', 'AK-Petersburg Census Area', 'IN-LaPorte County'}

Rows which exist in poverty dataframe but do not exist in education dataframe: 

{'IN-Lagrange County', 'AK-Anchorage Borough', 'AK-Sitka Borough', 'IN-De Kalb County', 'AK-Yakutat Borough', 'AK-Petersburg Borough', 'NM-DeBaca County', 'IN-La Porte County', 'AK-Juneau Borough', 'PA-Mc Kean County'}


In [161]:
# The only real difference between the two key sets is 'HI-Kalawao County' (its poverty rate was missing).
# The other keys refer to the same counties and differ only in spelling or spacing. The names are fixed
# here and df_merge_education_poverty is rebuilt.

# McKean, DeKalb, LaPorte, LaGrange and De Baca follow no common pattern (for example, the education
# dataframe has no space in McKean, DeKalb and LaPorte, while the poverty dataframe has no space in DeBaca),
# so these five counties are renamed explicitly in the poverty dataframe.
# Each entry is (state, name in the poverty dataframe, name in the education dataframe).
county_renames = [
    ('PA', 'Mc Kean County', 'McKean County'),
    ('IN', 'De Kalb County', 'DeKalb County'),
    ('IN', 'La Porte County', 'LaPorte County'),
    ('IN', 'Lagrange County', 'LaGrange County'),
    ('NM', 'DeBaca County', 'De Baca County'),
]
for state_code, poverty_name, education_name in county_renames:
    is_target = (df_poverty['state'] == state_code) & (df_poverty['county'] == poverty_name)
    df_poverty.loc[is_target, 'county'] = education_name

# The five mismatching Alaska names do follow a pattern: in the poverty dataframe they all end with 'Borough',
# while in the education dataframe they end with something else. For every Alaska row of the education
# dataframe whose key is not found in the poverty dataframe, keep the first word and append 'Borough'.
# The keys are computed locally (no temporary column, no dependence on variables from other cells).
poverty_keys = set(df_poverty['state'] + '-' + df_poverty['county'])
education_keys = df_education['state'] + '-' + df_education['county']
unmatched_alaska = (~education_keys.isin(poverty_keys)) & (df_education['state'] == 'AK')
# split(' ', 1)[0] yields the first word; a name without any space is kept whole instead of being discarded
df_education.loc[unmatched_alaska, 'county'] = (
    df_education.loc[unmatched_alaska, 'county'].map(lambda name: name.split(' ', 1)[0] + ' Borough'))

# Merge the education and poverty dataframes again
df_merge_education_poverty = df_education.merge(df_poverty, on=['state', 'county'])

# Print the summary of the new dataframe
df_merge_education_poverty.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3141 entries, 0 to 3140
Data columns (total 8 columns):
state                       3141 non-null object
county                      3141 non-null object
RUCC                        3141 non-null float64
less_than_high_school       3141 non-null float64
high_school_diploma         3141 non-null float64
college/associate_degree    3141 non-null float64
bachelors/higher            3141 non-null float64
poverty                     3141 non-null float64
dtypes: float64(6), object(2)
memory usage: 220.9+ KB


In [162]:
# Open the unemployment workbook and print the names of its sheets
file_name=r'All excel sheets\Unemployment.xls'
xl=pd.ExcelFile(file_name)
print(xl.sheet_names)

['Unemployment Med HH Inc', 'Variable Descriptions']


In [163]:
# Parse the first sheet, keep the three columns of interest and give them the short names used elsewhere
unemployment_source_columns = ['State', 'Area_name', 'Unemployment_rate_2016']
df_unemployment = xl.parse(0)[unemployment_source_columns]
df_unemployment = df_unemployment.rename(columns={'State': 'state',
                                                  'Area_name': 'county',
                                                  'Unemployment_rate_2016': 'unemployment'})

# Display the first five rows
df_unemployment.head()

Unnamed: 0,state,county,unemployment
0,AL,Alabama,6.0
1,AL,"Autauga County, AL",5.3
2,AL,"Baldwin County, AL",5.4
3,AL,"Barbour County, AL",8.6
4,AL,"Bibb County, AL",6.6


In [164]:
def strip_state_suffix(area_name):
    """Return ``area_name`` without its trailing ', XX' state abbreviation.

    A name is shortened only when its last four characters are a comma, a space and two more
    characters (e.g. 'Autauga County, AL' -> 'Autauga County'). Any other name, such as
    'District of Columbia', is returned unchanged.
    """
    if area_name[-4:-2] == ', ':
        return area_name[:-4]
    return area_name


# Remove the rows for the United States and the 50 states
df_unemployment = df_unemployment[~df_unemployment['county'].isin(remove_list)]

# Remove Puerto Rico from the data set
df_unemployment = df_unemployment[df_unemployment['state'] != 'PR']

# Remove the state abbreviation from the county names. Only names that really carry the ', XX' suffix are
# shortened, so a name without it (District of Columbia) stays intact and needs no manual repair afterwards.
df_unemployment = df_unemployment.assign(county=df_unemployment['county'].map(strip_state_suffix))

# Display the first five rows
df_unemployment.head()

Unnamed: 0,state,county,unemployment
1,AL,Autauga County,5.3
2,AL,Baldwin County,5.4
3,AL,Barbour County,8.6
4,AL,Bibb County,6.6
5,AL,Blount County,5.5


In [165]:
# Print the dataframe summary (row count, dtypes and non-null count per column) to spot missing values
df_unemployment.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3145 entries, 1 to 3194
Data columns (total 3 columns):
state           3145 non-null object
county          3145 non-null object
unemployment    3142 non-null float64
dtypes: float64(1), object(2)
memory usage: 98.3+ KB


In [166]:
# Display every row that has at least one missing value
unemployment_has_missing = df_unemployment.isnull().any(axis=1)
df_unemployment[unemployment_has_missing]

Unnamed: 0,state,county,unemployment
91,AK,Prince of Wales-Outer Ketchikan Census Area,
94,AK,Skagway-Hoonah-Angoon Census Area,
98,AK,Wrangell-Petersburg Census Area,


In [167]:
# The three rows have no unemployment rate and can be removed.
# The result is reassigned instead of using inplace=True: df_unemployment was produced by boolean filtering,
# and mutating such a slice in place makes pandas raise a SettingWithCopyWarning.
df_unemployment = df_unemployment.dropna()

# Check whether any county appears in more than one row
state_county = df_unemployment.state + '-' + df_unemployment.county
print('\nTop five counties with most counts')
print(state_county.value_counts().head())


Top five counties with most counts
DC-District of Columbia    2
FL-Gadsden County          1
GA-Thomas County           1
MT-Wibaux County           1
MI-Ogemaw County           1
dtype: int64


In [168]:
# Remove rows that are exact duplicates of an earlier row (same state, county and unemployment rate).
# The result is reassigned rather than mutated in place.
df_unemployment = df_unemployment.drop_duplicates()

# Print the summary of df_unemployment to confirm the new row count
df_unemployment.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3141 entries, 1 to 3194
Data columns (total 3 columns):
state           3141 non-null object
county          3141 non-null object
unemployment    3141 non-null float64
dtypes: float64(1), object(2)
memory usage: 98.2+ KB


In [169]:
# Join the unemployment rates onto the education-poverty dataframe (inner join on state and county)
df_merge_three = df_merge_education_poverty.merge(df_unemployment, how='inner', on=['state', 'county'])

# The summary reveals whether any rows were lost in the merge
df_merge_three.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3128 entries, 0 to 3127
Data columns (total 9 columns):
state                       3128 non-null object
county                      3128 non-null object
RUCC                        3128 non-null float64
less_than_high_school       3128 non-null float64
high_school_diploma         3128 non-null float64
college/associate_degree    3128 non-null float64
bachelors/higher            3128 non-null float64
poverty                     3128 non-null float64
unemployment                3128 non-null float64
dtypes: float64(7), object(2)
memory usage: 244.4+ KB


In [170]:
# df_merge_education_poverty and df_unemployment both have 3141 rows, yet the merge lost 13 rows. Compare the
# keys of the two dataframes to find out whether the loss comes from misspellings or stray spaces.
merge_1 = (df_merge_education_poverty['state'] + '-' + df_merge_education_poverty['county']).tolist()
unemployment = (df_unemployment['state'] + '-' + df_unemployment['county']).tolist()
merged_keys, unemployment_keys = set(merge_1), set(unemployment)

# Keys present in the education-poverty merged dataframe but absent from the unemployment dataframe
print('Rows which exist in the education-poverty merged dataframe but do not exist in unemployment dataframe: \n')
print(merged_keys - unemployment_keys)

# Keys present in the unemployment dataframe but absent from the education-poverty merged dataframe
print('\nRows which exist in unemployment dataframe but do not exist in education-poverty merged dataframe: \n')
print(unemployment_keys - merged_keys)

Rows which exist in the education-poverty merged dataframe but do not exist in unemployment dataframe: 

{'AK-Anchorage Borough', 'CO-Denver County', 'IL-La Salle County', 'AK-Sitka Borough', 'LA-La Salle Parish', 'CO-Broomfield County', 'PA-Philadelphia County', 'AK-Yakutat Borough', 'MA-Nantucket County', 'HI-Honolulu County', 'AK-Wrangell City and Borough', 'AK-Juneau Borough', 'CA-San Francisco County'}

Rows which exist in unemployment dataframe but do not exist in education-poverty merged dataframe: 

{'LA-LaSalle Parish', 'HI-Honolulu County/city', 'PA-Philadelphia County/city', 'AK-Sitka Borough/city', 'AK-Juneau Borough/city', 'AK-Anchorage Borough/municipality', 'CO-Denver County/city', 'CO-Broomfield County/city', 'MA-Nantucket County/town', 'AK-Wrangell Borough/city', 'AK-Yakutat Borough/city', 'IL-LaSalle County', 'CA-San Francisco County/city'}


In [171]:
# First fix the three names that follow no pattern: LaSalle Parish (LA) and LaSalle County (IL) in the
# unemployment dataframe, and Wrangell City and Borough (AK) in the education-poverty dataframe
is_la_salle_parish = (df_unemployment['state'] == 'LA') & (df_unemployment['county'] == 'LaSalle Parish')
df_unemployment.loc[is_la_salle_parish, 'county'] = 'La Salle Parish'

is_la_salle_county = (df_unemployment['state'] == 'IL') & (df_unemployment['county'] == 'LaSalle County')
df_unemployment.loc[is_la_salle_county, 'county'] = 'La Salle County'

is_wrangell = ((df_merge_education_poverty['state'] == 'AK')
               & (df_merge_education_poverty['county'] == 'Wrangell City and Borough'))
df_merge_education_poverty.loc[is_wrangell, 'county'] = 'Wrangell Borough'

# The remaining mismatches carry a '/city', '/town' or '/municipality' suffix in the unemployment dataframe.
# For every unemployment row whose key is still not found in the merged dataframe, drop '/' and whatever
# follows it. The keys are computed locally (no temporary column, no dependence on other cells' variables).
merged_keys = set(df_merge_education_poverty['state'] + '-' + df_merge_education_poverty['county'])
unemployment_keys = df_unemployment['state'] + '-' + df_unemployment['county']
still_unmatched = ~unemployment_keys.isin(merged_keys)
# split('/', 1)[0] leaves a name without '/' untouched, whereas slicing at find('/') == -1 would silently
# cut off its last character
df_unemployment.loc[still_unmatched, 'county'] = (
    df_unemployment.loc[still_unmatched, 'county'].map(lambda name: name.split('/', 1)[0]))

# Merge all three tables together again
df_merge_three = df_merge_education_poverty.merge(df_unemployment, on=['state', 'county'])

# Print the summary of the dataframe to make sure no data is missing
df_merge_three.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3141 entries, 0 to 3140
Data columns (total 9 columns):
state                       3141 non-null object
county                      3141 non-null object
RUCC                        3141 non-null float64
less_than_high_school       3141 non-null float64
high_school_diploma         3141 non-null float64
college/associate_degree    3141 non-null float64
bachelors/higher            3141 non-null float64
poverty                     3141 non-null float64
unemployment                3141 non-null float64
dtypes: float64(7), object(2)
memory usage: 245.4+ KB


In [172]:
# Read the CSV file that lists each state with its state code, census region and division
file_name=r'All excel sheets\us census bureau regions and divisions.csv'
df_regions=pd.read_csv(file_name)

# Display the first five rows
df_regions.head()

Unnamed: 0,State,State Code,Region,Division
0,Alaska,AK,West,Pacific
1,Alabama,AL,South,East South Central
2,Arkansas,AR,South,West South Central
3,Arizona,AZ,West,Mountain
4,California,CA,West,Pacific


In [173]:
# Rename 'State Code' to 'state' (the merge key used by the other dataframes) and 'Region' to 'region'.
# The result is reassigned rather than mutated in place.
df_regions = df_regions.rename(columns={'State Code': 'state', 'Region': 'region'})

# Add the region column to df_merge_three. The join key is stated explicitly instead of relying on
# pandas to pick the columns the two frames happen to share.
df_merge_all = df_merge_three.merge(df_regions[['state', 'region']], on='state')

# Rearrange the columns so that region (appended last by the merge) comes right after the county name.
# cols[2:-1] is every column between county and region, whatever their number.
cols = df_merge_all.columns.tolist()
cols = cols[:2] + cols[-1:] + cols[2:-1]
df_merge_all = df_merge_all[cols]

# Display the first five rows
df_merge_all.head()

Unnamed: 0,state,county,region,RUCC,less_than_high_school,high_school_diploma,college/associate_degree,bachelors/higher,poverty,unemployment
0,AL,Autauga County,South,2.0,12.417,34.331,28.66,24.593,14.0,5.3
1,AL,Baldwin County,South,3.0,9.972,28.692,31.788,29.547,12.0,5.4
2,AL,Barbour County,South,6.0,26.236,34.927,25.969,12.868,30.0,8.6
3,AL,Bibb County,South,1.0,19.302,41.816,26.883,12.0,20.0,6.6
4,AL,Blount County,South,1.0,19.969,32.942,34.039,13.05,14.0,5.5


In [174]:
# Print the summary of df_merge_all to make sure everything looks fine (row count and no missing values)
df_merge_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3141 entries, 0 to 3140
Data columns (total 10 columns):
state                       3141 non-null object
county                      3141 non-null object
region                      3141 non-null object
RUCC                        3141 non-null float64
less_than_high_school       3141 non-null float64
high_school_diploma         3141 non-null float64
college/associate_degree    3141 non-null float64
bachelors/higher            3141 non-null float64
poverty                     3141 non-null float64
unemployment                3141 non-null float64
dtypes: float64(7), object(3)
memory usage: 269.9+ KB


In [175]:
# Save the wrangled data to a csv file; index=False leaves the dataframe index out of the file
df_merge_all.to_csv('wrangled_data.csv',index=False)