## Data Cleaning

#### Import modules

In [1]:
import os
import pandas as pd

#### Define paths and load data

In [2]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [3]:
# Relative path of the directory the raw input files are read from.
PATH = '../raw_data'

In [90]:
# Relative path of the directory the cleaned per-year CSV files are written to.
OUTPUT_PATH = '../outputs'

### Election results

#### Load data

In [5]:
# Load the county-level presidential results and keep only the two major parties.
# .copy() makes `results` an independent frame rather than a slice of the raw
# table, so adding columns to it afterwards does not raise pandas'
# SettingWithCopyWarning.
results_raw = pd.read_csv(os.path.join(PATH, 'countypres_2000-2020.csv'))
results = results_raw[results_raw['party'].isin(['DEMOCRAT', 'REPUBLICAN'])].copy()

In [6]:
# Add each candidate's share of the county's total votes, and set `office` to a
# single constant label (it is used as one of the pivot index columns).
# .assign returns a new frame instead of writing columns into a filtered slice,
# which avoids SettingWithCopyWarning and keeps the cell safe to re-run.
results = results.assign(
    pctvotes=results['candidatevotes'] / results['totalvotes'],
    office='PRESIDENT',
)

#### Pivot table

In [51]:
# Columns that identify one county in one election; used as the pivot index.
index_cols = [
    'year',
    'state',
    'state_po',
    'county_name',
    'county_fips',
    'office',
]

In [56]:
# Reshape to one row per county/year with one column per (measure, party).
#
# Rows are aggregated with a SUM. pd.pivot_table's default aggregation is the
# mean, which silently averages vote counts whenever a county/party appears on
# more than one row (e.g. when votes are reported separately per `mode`).
# For counties with a single row per party the sum equals the old result.
# min_count=1 keeps a group whose values are all missing as NaN instead of 0.
# NOTE(review): summing `pctvotes` assumes `totalvotes` is the county-wide
# total on every row -- confirm for rows that are split by `mode`.
pivoted_results = (
    results
    .groupby(index_cols + ['party'])[['candidatevotes', 'pctvotes']]
    .sum(min_count=1)
    .unstack('party')
    .reset_index()
)

# Flatten the two-level column labels: ('pctvotes', 'DEMOCRAT') becomes
# 'pctvotes_democrat'; index columns (empty second level) keep their name.
pivoted_results.columns = [
    f'{measure}_{party}'.lower() if party != '' else measure
    for measure, party in pivoted_results.columns.values
]

# Label the winner by raw vote count. Ties and missing counts compare as False
# and are therefore labelled 'republican', as before.
democrat_leads = (pivoted_results['candidatevotes_democrat']
                  > pivoted_results['candidatevotes_republican'])
pivoted_results['winner'] = democrat_leads.map({True: 'democrat', False: 'republican'})

# Keep only the columns that are exported.
pivoted_results = pivoted_results[['year', 'state', 'county_name', 'county_fips',
                                   'candidatevotes_democrat', 'candidatevotes_republican',
                                   'pctvotes_democrat', 'pctvotes_republican', 'winner']]

In [57]:
# Preview the first rows of the pivoted per-county results.
pivoted_results.head()

Unnamed: 0,year,state,county_name,county_fips,candidatevotes_democrat,candidatevotes_republican,pctvotes_democrat,pctvotes_republican,winner
0,2000,ALABAMA,AUTAUGA,1001.0,4942.0,11993.0,0.287192,0.696943,republican
1,2000,ALABAMA,BALDWIN,1003.0,13997.0,40872.0,0.247822,0.723654,republican
2,2000,ALABAMA,BARBOUR,1005.0,5188.0,5096.0,0.499086,0.490236,democrat
3,2000,ALABAMA,BIBB,1007.0,2710.0,4273.0,0.381636,0.601746,republican
4,2000,ALABAMA,BLOUNT,1009.0,4977.0,12667.0,0.276915,0.704779,republican


In [89]:
# (rows, columns) of the pivoted table, as a quick size check.
pivoted_results.shape

(18920, 9)

In [125]:
# Spot-check the two-party rows of a single county (FIPS 13123) across elections.
results.loc[results['county_fips'].eq(13123)]

Unnamed: 0,year,state,state_po,county_name,county_fips,office,candidate,party,candidatevotes,totalvotes,version,mode,pctvotes
1668,2000,GEORGIA,GA,GILMER,13123.0,PRESIDENT,AL GORE,DEMOCRAT,2230.0,7370.0,20191203,TOTAL,0.302578
1669,2000,GEORGIA,GA,GILMER,13123.0,PRESIDENT,GEORGE W. BUSH,REPUBLICAN,4941.0,7370.0,20191203,TOTAL,0.670421
13718,2004,GEORGIA,GA,GILMER,13123.0,PRESIDENT,JOHN KERRY,DEMOCRAT,2510.0,9998.0,20191203,TOTAL,0.25105
13719,2004,GEORGIA,GA,GILMER,13123.0,PRESIDENT,GEORGE W. BUSH,REPUBLICAN,7414.0,9998.0,20191203,TOTAL,0.741548
23069,2008,GEORGIA,GA,GILMER,13123.0,PRESIDENT,BARACK OBAMA,DEMOCRAT,2614.0,11149.0,20191203,TOTAL,0.23446
23070,2008,GEORGIA,GA,GILMER,13123.0,PRESIDENT,JOHN MCCAIN,REPUBLICAN,8408.0,11149.0,20191203,TOTAL,0.754148
32420,2012,GEORGIA,GA,GILMER,13123.0,PRESIDENT,BARACK OBAMA,DEMOCRAT,1958.0,11018.0,20191203,TOTAL,0.177709
32421,2012,GEORGIA,GA,GILMER,13123.0,PRESIDENT,MITT ROMNEY,REPUBLICAN,8926.0,11018.0,20191203,TOTAL,0.810129
41771,2016,GEORGIA,GA,GILMER,13123.0,PRESIDENT,HILLARY CLINTON,DEMOCRAT,1965.0,12773.0,20191203,TOTAL,0.15384
41772,2016,GEORGIA,GA,GILMER,13123.0,PRESIDENT,DONALD TRUMP,REPUBLICAN,10477.0,12773.0,20191203,TOTAL,0.820246


#### Export election results

In [91]:
# Write one CSV per election year: results_<year>.csv.
# The output directory is created first so the export also works when it does
# not exist yet (to_csv does not create missing directories).
os.makedirs(OUTPUT_PATH, exist_ok=True)

for year, year_results in pivoted_results.groupby('year'):
    year_results.to_csv(os.path.join(OUTPUT_PATH, f'results_{year}.csv'), index=False)

### Unemployment data

In [58]:
# Read the unemployment workbook; header=4 takes the column names from
# row 4 (0-indexed) of the sheet, skipping the rows above it.
unemployment_file = os.path.join(PATH, 'Unemployment.xlsx')
unemployment = pd.read_excel(unemployment_file, header=4)

In [59]:
# Unemployment-rate columns for the election years 2000, 2004, ..., 2020.
rates_cols = [f'Unemployment_rate_{election_year}'
              for election_year in range(2000, 2021, 4)]

In [60]:
# Columns of the unemployment table that identify the area of each row.
area_cols = ['State', 'Area_name', 'FIPS_Code']

In [61]:
# Keep only the area identifiers and the election-year unemployment rates.
election_year_cols = [*area_cols, *rates_cols]
unemployment_ey = unemployment[election_year_cols]

In [62]:
# Show one randomly chosen row as a quick look at the selected columns
# (no random_state is set, so the row differs between runs).
unemployment_ey.sample()

Unnamed: 0,State,Area_name,FIPS_Code,Unemployment_rate_2000,Unemployment_rate_2004,Unemployment_rate_2008,Unemployment_rate_2012,Unemployment_rate_2016,Unemployment_rate_2020
872,IA,"Mills County, IA",19129,2.0,4.2,3.6,4.6,3.6,3.9


In [63]:
# Spot-check the unemployment rates of a single county (FIPS 1007).
unemployment_ey.loc[unemployment_ey['FIPS_Code'].eq(1007)]

Unnamed: 0,State,Area_name,FIPS_Code,Unemployment_rate_2000,Unemployment_rate_2004,Unemployment_rate_2008,Unemployment_rate_2012,Unemployment_rate_2016,Unemployment_rate_2020
5,AL,"Bibb County, AL",1007,5.4,5.4,6.0,8.8,6.5,6.6


In [87]:
# Show two randomly chosen rows of the full unemployment table, with all of its
# columns (no random_state is set, so the rows differ between runs).
unemployment.sample(2)

Unnamed: 0,FIPS_Code,State,Area_name,Rural_urban_continuum_code_2013,Urban_influence_code_2013,Metro_2013,Civilian_labor_force_2000,Employed_2000,Unemployed_2000,Unemployment_rate_2000,Civilian_labor_force_2001,Employed_2001,Unemployed_2001,Unemployment_rate_2001,Civilian_labor_force_2002,Employed_2002,Unemployed_2002,Unemployment_rate_2002,Civilian_labor_force_2003,Employed_2003,Unemployed_2003,Unemployment_rate_2003,Civilian_labor_force_2004,Employed_2004,Unemployed_2004,Unemployment_rate_2004,Civilian_labor_force_2005,Employed_2005,Unemployed_2005,Unemployment_rate_2005,Civilian_labor_force_2006,Employed_2006,Unemployed_2006,Unemployment_rate_2006,Civilian_labor_force_2007,Employed_2007,Unemployed_2007,Unemployment_rate_2007,Civilian_labor_force_2008,Employed_2008,Unemployed_2008,Unemployment_rate_2008,Civilian_labor_force_2009,Employed_2009,Unemployed_2009,Unemployment_rate_2009,Civilian_labor_force_2010,Employed_2010,Unemployed_2010,Unemployment_rate_2010,Civilian_labor_force_2011,Employed_2011,Unemployed_2011,Unemployment_rate_2011,Civilian_labor_force_2012,Employed_2012,Unemployed_2012,Unemployment_rate_2012,Civilian_labor_force_2013,Employed_2013,Unemployed_2013,Unemployment_rate_2013,Civilian_labor_force_2014,Employed_2014,Unemployed_2014,Unemployment_rate_2014,Civilian_labor_force_2015,Employed_2015,Unemployed_2015,Unemployment_rate_2015,Civilian_labor_force_2016,Employed_2016,Unemployed_2016,Unemployment_rate_2016,Civilian_labor_force_2017,Employed_2017,Unemployed_2017,Unemployment_rate_2017,Civilian_labor_force_2018,Employed_2018,Unemployed_2018,Unemployment_rate_2018,Civilian_labor_force_2019,Employed_2019,Unemployed_2019,Unemployment_rate_2019,Civilian_labor_force_2020,Employed_2020,Unemployed_2020,Unemployment_rate_2020,Median_Household_Income_2019,Med_HH_Income_Percent_of_State_Total_2019
2465,46119,SD,"Sully County, SD",9.0,8.0,0.0,913.0,892.0,21.0,2.3,907.0,885.0,22.0,2.4,975.0,955.0,20.0,2.1,1018.0,996.0,22.0,2.2,1018.0,994.0,24.0,2.4,1016.0,988.0,28.0,2.8,1013.0,990.0,23.0,2.3,978.0,956.0,22.0,2.2,985.0,963.0,22.0,2.2,992.0,964.0,28.0,2.8,891.0,859.0,32.0,3.6,884.0,851.0,33.0,3.7,890.0,864.0,26.0,2.9,888.0,863.0,25.0,2.8,891.0,867.0,24.0,2.7,876.0,854.0,22.0,2.5,863.0,843.0,20.0,2.3,836.0,811.0,25.0,3.0,806.0,786.0,20.0,2.5,799.0,778.0,21.0,2.6,789.0,767.0,22.0,2.8,61791.0,102.279274
638,17053,IL,"Ford County, IL",3.0,2.0,1.0,7342.0,7079.0,263.0,3.6,7342.0,7017.0,325.0,4.4,7223.0,6894.0,329.0,4.6,7158.0,6787.0,371.0,5.2,7207.0,6840.0,367.0,5.1,7306.0,6961.0,345.0,4.7,7378.0,7058.0,320.0,4.3,7321.0,6954.0,367.0,5.0,7170.0,6725.0,445.0,6.2,7080.0,6366.0,714.0,10.1,7279.0,6585.0,694.0,9.5,7129.0,6519.0,610.0,8.6,6957.0,6437.0,520.0,7.5,6726.0,6232.0,494.0,7.3,6640.0,6246.0,394.0,5.9,6662.0,6309.0,353.0,5.3,6683.0,6319.0,364.0,5.4,6472.0,6161.0,311.0,4.8,6486.0,6184.0,302.0,4.7,6391.0,6128.0,263.0,4.1,6236.0,5834.0,402.0,6.4,55369.0,79.99913


In [72]:
# Wide -> long: one row per (area, year) with the rate in `unemployment`.
# The year is the third '_'-separated token of the source column name
# ('Unemployment_rate_2000' -> 2000) and is stored as an integer.
unemployment_ey_melted = (
    unemployment_ey
    .melt(id_vars=area_cols,
          value_vars=rates_cols,
          var_name='year',
          value_name='unemployment')
    .assign(year=lambda long_df: long_df['year'].str.split('_').str[2].astype(int))
    .rename(columns={'FIPS_Code': 'county_fips'})
    .loc[:, ['county_fips', 'year', 'unemployment']]
)

In [73]:
# Show one randomly chosen row of the long-format unemployment table.
unemployment_ey_melted.sample()

Unnamed: 0,county_fips,year,unemployment
15544,46077,2016,2.7


In [92]:
# Write one CSV per election year: unemployment_<year>.csv.
# The output directory is created first so the export also works when it does
# not exist yet (to_csv does not create missing directories).
os.makedirs(OUTPUT_PATH, exist_ok=True)

for year, year_unemployment in unemployment_ey_melted.groupby('year'):
    year_unemployment.to_csv(os.path.join(OUTPUT_PATH, f'unemployment_{year}.csv'), index=False)

### Population

In [85]:
# Read the population-estimates workbook; header=2 takes the column names from
# row 2 (0-indexed) of the sheet, skipping the rows above it.
population_file = os.path.join(PATH, 'PopulationEstimates.xls')
population = pd.read_excel(population_file, header=2)

In [106]:
# Show one randomly chosen row of the population table, with all of its columns.
population.sample()

Unnamed: 0,FIPStxt,State,Area_Name,Rural-urban_Continuum Code_2003,Rural-urban_Continuum Code_2013,Urban_Influence_Code_2003,Urban_Influence_Code_2013,Economic_typology_2015,CENSUS_2010_POP,ESTIMATES_BASE_2010,POP_ESTIMATE_2010,POP_ESTIMATE_2011,POP_ESTIMATE_2012,POP_ESTIMATE_2013,POP_ESTIMATE_2014,POP_ESTIMATE_2015,POP_ESTIMATE_2016,POP_ESTIMATE_2017,POP_ESTIMATE_2018,POP_ESTIMATE_2019,N_POP_CHG_2010,N_POP_CHG_2011,N_POP_CHG_2012,N_POP_CHG_2013,N_POP_CHG_2014,N_POP_CHG_2015,N_POP_CHG_2016,N_POP_CHG_2017,N_POP_CHG_2018,N_POP_CHG_2019,Births_2010,Births_2011,Births_2012,Births_2013,Births_2014,Births_2015,Births_2016,Births_2017,Births_2018,Births_2019,Deaths_2010,Deaths_2011,Deaths_2012,Deaths_2013,Deaths_2014,Deaths_2015,Deaths_2016,Deaths_2017,Deaths_2018,Deaths_2019,NATURAL_INC_2010,NATURAL_INC_2011,NATURAL_INC_2012,NATURAL_INC_2013,NATURAL_INC_2014,NATURAL_INC_2015,NATURAL_INC_2016,NATURAL_INC_2017,NATURAL_INC_2018,NATURAL_INC_2019,INTERNATIONAL_MIG_2010,INTERNATIONAL_MIG_2011,INTERNATIONAL_MIG_2012,INTERNATIONAL_MIG_2013,INTERNATIONAL_MIG_2014,INTERNATIONAL_MIG_2015,INTERNATIONAL_MIG_2016,INTERNATIONAL_MIG_2017,INTERNATIONAL_MIG_2018,INTERNATIONAL_MIG_2019,DOMESTIC_MIG_2010,DOMESTIC_MIG_2011,DOMESTIC_MIG_2012,DOMESTIC_MIG_2013,DOMESTIC_MIG_2014,DOMESTIC_MIG_2015,DOMESTIC_MIG_2016,DOMESTIC_MIG_2017,DOMESTIC_MIG_2018,DOMESTIC_MIG_2019,NET_MIG_2010,NET_MIG_2011,NET_MIG_2012,NET_MIG_2013,NET_MIG_2014,NET_MIG_2015,NET_MIG_2016,NET_MIG_2017,NET_MIG_2018,NET_MIG_2019,RESIDUAL_2010,RESIDUAL_2011,RESIDUAL_2012,RESIDUAL_2013,RESIDUAL_2014,RESIDUAL_2015,RESIDUAL_2016,RESIDUAL_2017,RESIDUAL_2018,RESIDUAL_2019,GQ_ESTIMATES_BASE_2010,GQ_ESTIMATES_2010,GQ_ESTIMATES_2011,GQ_ESTIMATES_2012,GQ_ESTIMATES_2013,GQ_ESTIMATES_2014,GQ_ESTIMATES_2015,GQ_ESTIMATES_2016,GQ_ESTIMATES_2017,GQ_ESTIMATES_2018,GQ_ESTIMATES_2019,R_birth_2011,R_birth_2012,R_birth_2013,R_birth_2014,R_birth_2015,R_birth_2016,R_birth_2017,R_birth_2018,R_birth_2019,R_death_2011,R_death_2012,R_death_2013,R_death_201
4,R_death_2015,R_death_2016,R_death_2017,R_death_2018,R_death_2019,R_NATURAL_INC_2011,R_NATURAL_INC_2012,R_NATURAL_INC_2013,R_NATURAL_INC_2014,R_NATURAL_INC_2015,R_NATURAL_INC_2016,R_NATURAL_INC_2017,R_NATURAL_INC_2018,R_NATURAL_INC_2019,R_INTERNATIONAL_MIG_2011,R_INTERNATIONAL_MIG_2012,R_INTERNATIONAL_MIG_2013,R_INTERNATIONAL_MIG_2014,R_INTERNATIONAL_MIG_2015,R_INTERNATIONAL_MIG_2016,R_INTERNATIONAL_MIG_2017,R_INTERNATIONAL_MIG_2018,R_INTERNATIONAL_MIG_2019,R_DOMESTIC_MIG_2011,R_DOMESTIC_MIG_2012,R_DOMESTIC_MIG_2013,R_DOMESTIC_MIG_2014,R_DOMESTIC_MIG_2015,R_DOMESTIC_MIG_2016,R_DOMESTIC_MIG_2017,R_DOMESTIC_MIG_2018,R_DOMESTIC_MIG_2019,R_NET_MIG_2011,R_NET_MIG_2012,R_NET_MIG_2013,R_NET_MIG_2014,R_NET_MIG_2015,R_NET_MIG_2016,R_NET_MIG_2017,R_NET_MIG_2018,R_NET_MIG_2019
703,17187,IL,Warren County,7.0,6.0,8.0,6.0,3.0,17707,17704,17713,17836,17749,17662,17712,17455,17294,17144,16992,16844,9.0,123.0,-87.0,-87.0,50.0,-257.0,-161.0,-150.0,-152.0,-148.0,54.0,194.0,224.0,211.0,218.0,214.0,231.0,207.0,213.0,204.0,30.0,180.0,192.0,181.0,188.0,190.0,189.0,165.0,189.0,178.0,24.0,14.0,32.0,30.0,30.0,24.0,42.0,42.0,24.0,26.0,3.0,28.0,49.0,45.0,49.0,47.0,42.0,42.0,36.0,37.0,-16.0,80.0,-168.0,-168.0,-25.0,-331.0,-246.0,-234.0,-212.0,-212.0,-13.0,108.0,-119.0,-123.0,24.0,-284.0,-204.0,-192.0,-176.0,-175.0,-2.0,1.0,0.0,6.0,-4.0,3.0,1.0,0.0,0.0,1.0,1116.0,1116.0,1116.0,1053.0,1010.0,1033.0,1053.0,973.0,975.0,977.0,977.0,10.914512,12.589574,11.917201,12.325437,12.170501,13.295347,12.021604,12.479494,12.058163,10.126867,10.791064,10.222812,10.629276,10.805585,10.878011,9.582438,11.073354,10.521338,0.787645,1.798511,1.694389,1.696161,1.364916,2.417336,2.439166,1.40614,1.536825,1.57529,2.753969,2.541583,2.770396,2.67296,2.417336,2.439166,2.10921,2.18702,4.50083,-9.442181,-9.488577,-1.413468,-18.824466,-14.158681,-13.589639,-12.420905,-12.531032,6.07612,-6.688211,-6.946994,1.356929,-16.151506,-11.741345,-11.150473,-10.311694,-10.344012


We extract the rates of natural increase in population, international migration, domestic migration, and net migration for the years 2012 and 2016. The population file has no 2020 estimates, so the 2019 values are used as a substitute for 2020.

In [103]:
# Population-estimate years of interest, kept as strings.
# 2019 is relabelled as 2020 when the data is reshaped further down.
years_population = ['2012', '2016', '2019']

In [105]:
# Build the per-year column names of each rate family, e.g. 'R_NET_MIG_2012'.
# The years come from `years_population`; the name `years` used here before is
# not defined anywhere in the visible notebook and raised NameError on a fresh kernel.
rate_nat_inc_cols = [f'R_NATURAL_INC_{year}' for year in years_population]
rate_int_mig_cols = [f'R_INTERNATIONAL_MIG_{year}' for year in years_population]
rate_dom_mig_cols = [f'R_DOMESTIC_MIG_{year}' for year in years_population]
rate_net_mig_cols = [f'R_NET_MIG_{year}' for year in years_population]

In [108]:
# Scratch subset: the FIPS code plus the natural-increase rate columns.
# NOTE(review): `test` is not referenced again in the visible part of the notebook.
test = population[['FIPStxt'] + rate_nat_inc_cols]

In [117]:
# Maps each output value-column name to the list of source rate columns
# (one per year) that it is built from.
population_dict = {'rate_natural_increase_population': rate_nat_inc_cols,
                   'rate_international_migration': rate_int_mig_cols,
                   'rate_domestic_migration': rate_dom_mig_cols,
                   'rate_net_migration': rate_net_mig_cols}

In [None]:
# Export every population rate measure as one CSV per election year, named
# '<measure>_<year>.csv'.
# Writing all of them to 'unemployment_<year>.csv' would clobber the
# unemployment exports and let each measure overwrite the previous one.
os.makedirs(OUTPUT_PATH, exist_ok=True)

for measure, cols in population_dict.items():

    # Wide -> long: one row per (FIPStxt, year), value column named after the measure.
    measure_long = pd.melt(population[['FIPStxt'] + cols],
                           id_vars=['FIPStxt'],
                           value_vars=cols,
                           value_name=measure,
                           var_name='year')
    # The year is the last '_'-separated token of the source column name.
    measure_long['year'] = measure_long['year'].str.split('_').str[-1]
    # The 2019 estimates stand in for 2020.
    measure_long['year'] = measure_long['year'].replace({'2019': '2020'})

    # One file per distinct year (iterating the column itself would repeat
    # the same write once per row).
    for year in measure_long['year'].unique():
        year_rows = measure_long[measure_long['year'] == year]
        year_rows.to_csv(os.path.join(OUTPUT_PATH, f'{measure}_{year}.csv'), index=False)
        

In [115]:
# Long-format natural-increase rates: one row per (FIPStxt, year).
# The year is the fourth '_'-separated token of the source column name
# ('R_NATURAL_INC_2012' -> '2012'); '2019' is relabelled as '2020'.
natural_inc_year = (
    population[['FIPStxt'] + rate_nat_inc_cols]
    .melt(id_vars=['FIPStxt'],
          value_vars=rate_nat_inc_cols,
          var_name='year',
          value_name='natural_increase_in_pop')
    .assign(year=lambda long_df: long_df['year'].str.split('_').str[3].replace({'2019': '2020'}))
)

In [116]:
# Display the long-format natural-increase table (pandas truncates the middle rows).
natural_inc_year

Unnamed: 0,FIPStxt,year,natural_increase_in_pop
0,0,2012,
1,1000,2012,2.225976
2,1001,2012,0.998357
3,1003,2012,1.104318
4,1005,2012,0.513667
...,...,...,...
9814,72145,2020,
9815,72147,2020,
9816,72149,2020,
9817,72151,2020,


#### Results and unemployment merged