## Merging ACS and Voting data and cleaning

In [51]:
import pandas as pd
import numpy as np
import math
import re
from datetime import datetime
import time
from sklearn.preprocessing import StandardScaler

#### Model with one-year-lagged ACS features

In [25]:
# Load the ACS feature table from CSV into `acs`.
acs = pd.read_csv('ACS_Features_1218_FIXED.csv')

In [26]:
# Store the survey year as text so it can be compared against string year
# labels when the merge year is assigned.
acs['year'] = acs['year'].astype(str)

In [27]:
# Pair each listed ACS survey year with the election year that follows it.
# Any survey year not in the table keeps the placeholder 0.
# The keys are strings because `year` holds strings at this point.
acs_year_to_election_year = {
    '2011': 2012,
    '2013': 2014,
    '2015': 2016,
    '2017': 2018,
}
acs['year_merge'] = (
    acs['year']
    .map(acs_year_to_election_year)
    .fillna(0)          # unmapped years -> placeholder 0
    .astype('int64')    # same integer dtype as a column initialised with 0
)


In [29]:
# Turn the merge year into text so it can be concatenated into the join key.
acs['year_merge'] = acs['year_merge'].astype(str)

In [32]:
# Build the join key "<election year>-<district id>" (e.g. "2012-AK-00").
# Vectorised concatenation replaces the row-by-row apply/join, which ran one
# Python call per row. Both columns must already hold strings.
# Rows whose year_merge is the placeholder get keys starting with "0-".
acs['yr_district_id'] = acs['year_merge'].str.cat(acs['district_id'], sep='-')

In [44]:
# Preview the first 50 rows to eyeball year_merge and the lag columns.
# NOTE(review): in the rows displayed for this cell, AL-01/2010 has L2_* values
# equal to AK-00/2017's L1_* values (e.g. median gross rent 0.640565). This
# suggests the lag columns in the input file were shifted across district
# boundaries -- confirm in the upstream feature-building step.
acs.head(50)

Unnamed: 0,district_id,year,total_male,male_5_to_9_years,male_10_to_14_years,male_15_to_17_years,male_18_and_19_years,male_20_years,male_21_years,male_22_to_24_years,...,diff2_income_top_5_percent_scaled,L1_median_gross_rent_scaled,L2_median_gross_rent_scaled,diff1_median_gross_rent_scaled,diff2_median_gross_rent_scaled,L1_median_monthly_owner_costs_scaled,L2_median_monthly_owner_costs_scaled,diff1_median_monthly_owner_costs_scaled,diff2_median_monthly_owner_costs_scaled,year_merge
0,AK-00,2010,372436,26347,28085,16477,10063,6832,7412,15550,...,,,,,,,,,,0
1,AK-00,2011,372916,26470,26684,15011,10360,7715,7243,17620,...,,0.474385,,0.202538,,0.244528,,-0.00084,,2012
2,AK-00,2012,381094,27033,26425,15899,12008,5837,7258,19766,...,-0.05711,0.676923,0.474385,0.200075,0.402612,0.243687,0.244528,0.112811,0.111971,0
3,AK-00,2013,386404,26989,27265,15762,13616,7275,5951,20382,...,-0.085546,0.876998,0.676923,-0.141715,0.05836,0.356498,0.243687,-0.014218,0.098593,2014
4,AK-00,2014,385582,27517,25859,14719,11840,6167,8413,22074,...,-0.08829,0.735283,0.876998,0.094239,-0.047475,0.34228,0.356498,0.093562,0.079344,0
5,AK-00,2015,388695,26777,25914,15522,11287,7908,7159,21909,...,0.327192,0.829523,0.735283,-0.205865,-0.111625,0.435842,0.34228,-0.059753,0.033809,2016
6,AK-00,2016,390573,26386,29741,15986,10746,5080,6496,21626,...,-0.221718,0.623658,0.829523,0.016906,-0.188958,0.376089,0.435842,-0.022919,-0.082672,0
7,AK-00,2017,385776,27390,25234,15373,10731,5167,7294,17918,...,-0.484065,0.640565,0.623658,-0.171456,-0.154549,0.35317,0.376089,-0.024959,-0.047878,2018
8,AL-01,2010,338130,21512,27052,16930,10180,4320,5079,11654,...,-0.178022,0.469109,0.640565,-1.171429,-1.342885,0.328212,0.35317,-1.190241,-1.2152,0
9,AL-01,2011,334016,25040,22132,13880,10077,3200,4561,14014,...,-0.395903,-0.70232,0.469109,0.133987,-1.037442,-0.862029,0.328212,0.026307,-1.163934,2012


In [37]:
# Persist the ACS frame, now carrying year_merge and yr_district_id.
# NOTE(review): with the default index=True the row index is written as an
# extra unnamed first column; pass index=False if readers of this file should
# not see it -- confirm with downstream consumers before changing.
acs.to_csv('ACS_final_L1years_0814.csv')

#### Read in election history data, merge with the lagged ACS features (`acs`)

In [97]:
# Load the House election feature table, print its shape, and preview the
# first rows.
election_data = pd.read_csv('HouseVotesFeatures_12_18_0814PM.csv')
print(election_data.shape)
election_data.head()

(1740, 59)


Unnamed: 0.1,Unnamed: 0,yr_district_id,winner_votes,totalvotes,year,state,winner,winner_party,writein,district_id,...,dem_L2_voteshare,target,top_dem_votes,top_dem_candidate,top_rep_votes,top_rep_candidate,dem_incumbent_in_race,rep_incumbent_in_race,flipped,dmargin_45_55
0,7813,2012-AK-00,185296,289804,2012,Alaska,DonYoung,republican,False,AK-00,...,0.37744,0,82927.0,SharonMCissna,185296.0,DonYoung,0,1,0,0
1,8248,2014-AK-00,142572,279741,2014,Alaska,DonYoung,republican,False,AK-00,...,0.295641,0,114602.0,ForrestDunbar,142572.0,DonYoung,0,1,0,1
2,8683,2016-AK-00,155088,308198,2016,Alaska,DonYoung,republican,False,AK-00,...,0.34791,0,111019.0,SteveLindbeck,155088.0,DonYoung,0,1,0,1
3,9118,2018-AK-00,149779,282166,2018,Alaska,DonYoung,republican,False,AK-00,...,0.384946,0,131199.0,AlyseSGalvin,149779.0,DonYoung,0,1,0,1
4,7814,2012-AL-01,196374,200676,2012,Alabama,JoBonner,republican,False,AL-01,...,0.0,0,,,196374.0,JoBonner,0,1,0,0


In [98]:
# Remove the leftover 'Unnamed: 0' column read from the CSV.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: re-running it no longer raises KeyError once the column is
# already gone.
election_data = election_data.drop(columns=['Unnamed: 0'], errors='ignore')
election_data.shape

(1740, 58)

In [99]:
# Rows/columns of the ACS frame just before it is merged.
acs.shape

(3488, 624)

In [100]:
# Left-join the ACS features onto the election rows via the shared
# "<election year>-<district id>" key. Every election row is kept; election
# rows without an ACS match get NaN in the ACS columns.
# Columns present in both frames receive pandas' default _x/_y suffixes.
acs_election_merged = election_data.merge(
    acs,
    how='left',
    on='yr_district_id',
)

# The key is not unique in `acs` (placeholder keys such as '0-AK-00' repeat),
# so check that no election row matched more than one ACS row -- that would
# silently duplicate observations.
assert len(acs_election_merged) == len(election_data), (
    'merge changed the number of election rows; '
    'check yr_district_id for duplicates in acs'
)
acs_election_merged.shape

(1740, 681)

In [101]:
# Preview the first rows of the merged election + ACS frame.
acs_election_merged.head()

Unnamed: 0,yr_district_id,winner_votes,totalvotes,year_x,state,winner,winner_party,writein,district_id_x,winner_voteshare,...,diff2_income_top_5_percent_scaled,L1_median_gross_rent_scaled,L2_median_gross_rent_scaled,diff1_median_gross_rent_scaled,diff2_median_gross_rent_scaled,L1_median_monthly_owner_costs_scaled,L2_median_monthly_owner_costs_scaled,diff1_median_monthly_owner_costs_scaled,diff2_median_monthly_owner_costs_scaled,year_merge
0,2012-AK-00,185296,289804,2012,Alaska,DonYoung,republican,False,AK-00,0.639384,...,,0.474385,,0.202538,,0.244528,,-0.00084,,2012
1,2014-AK-00,142572,279741,2014,Alaska,DonYoung,republican,False,AK-00,0.509657,...,-0.085546,0.876998,0.676923,-0.141715,0.05836,0.356498,0.243687,-0.014218,0.098593,2014
2,2016-AK-00,155088,308198,2016,Alaska,DonYoung,republican,False,AK-00,0.503209,...,0.327192,0.829523,0.735283,-0.205865,-0.111625,0.435842,0.34228,-0.059753,0.033809,2016
3,2018-AK-00,149779,282166,2018,Alaska,DonYoung,republican,False,AK-00,0.530819,...,-0.484065,0.640565,0.623658,-0.171456,-0.154549,0.35317,0.376089,-0.024959,-0.047878,2018
4,2012-AL-01,196374,200676,2012,Alabama,JoBonner,republican,False,AL-01,0.978562,...,-0.395903,-0.70232,0.469109,0.133987,-1.037442,-0.862029,0.328212,0.026307,-1.163934,2012


In [102]:
# Save the merged frame with all raw variables still present (before any
# columns are dropped).
# NOTE(review): the default index=True also writes the row index as an unnamed
# first column; consider index=False if that column is unwanted downstream.
acs_election_merged.to_csv('ACS_election_merged_w_raw_vars_0815.csv')

In [103]:
# Show every column name of the merged frame as a plain Python list.
acs_election_merged.columns.tolist()

['yr_district_id',
 'winner_votes',
 'totalvotes',
 'year_x',
 'state',
 'winner',
 'winner_party',
 'writein',
 'district_id_x',
 'winner_voteshare',
 'total_dem_votes',
 'total_rep_votes',
 'total_dem_vote_share',
 'total_rep_vote_share',
 'L1_winner',
 'L2_winner',
 'L3_winner',
 'L4_winner',
 'L5_winner',
 'dL1_winner',
 'dL2_winner',
 'dL3_winner',
 'dL4_winner',
 'dL5_winner',
 'incumbent_party',
 'incumbent_L5_races',
 'incumbent_L4_races',
 'incumbent_L3_races',
 'incumbent_L2_races',
 'rep_L1_wins',
 'rep_L5_wins',
 'rep_L4_wins',
 'rep_L3_wins',
 'rep_L2_wins',
 'dem_L1_wins',
 'dem_L5_wins',
 'dem_L4_wins',
 'dem_L3_wins',
 'dem_L2_wins',
 'rep_L1_voteshare',
 'rep_L5_voteshare',
 'rep_L4_voteshare',
 'rep_L3_voteshare',
 'rep_L2_voteshare',
 'dem_L1_voteshare',
 'dem_L5_voteshare',
 'dem_L4_voteshare',
 'dem_L3_voteshare',
 'dem_L2_voteshare',
 'target',
 'top_dem_votes',
 'top_dem_candidate',
 'top_rep_votes',
 'top_rep_candidate',
 'dem_incumbent_in_race',
 'rep_incumbent

In [104]:
def _build_dropvars_model():
    """Return the ordered list of column names considered for dropping.

    The list is assembled in this order: election-outcome columns, the
    right-hand ("_y") duplicates left by the merge, raw ACS columns, the
    l1/l2/diff1/diff2 variant of every share feature, the L2/diff1/diff2
    variant of every scaled feature, and finally 'year_merge'.
    """
    names = [
        # Election outcome / candidate columns.
        'winner_votes', 'totalvotes', 'winner', 'winner_party', 'writein',
        'total_dem_votes', 'total_rep_votes',
        'L1_winner', 'L2_winner', 'L3_winner', 'L4_winner', 'L5_winner',
        'incumbent_party',
        'top_dem_votes', 'top_dem_candidate',
        'top_rep_votes', 'top_rep_candidate',
        # Right-hand copies of columns that exist in both merged frames.
        'district_id_y', 'year_y',
    ]

    # Sex-by-age counts: a total followed by the same age bands for each sex.
    age_bands = (
        '5_to_9_years', '10_to_14_years', '15_to_17_years', '18_and_19_years',
        '20_years', '21_years', '22_to_24_years', '25_to_29_years',
        '30_to_34_years', '35_to_39_years', '40_to_44_years', '45_to_49_years',
        '50_to_54_years', '55_to_59_years', '60_and_61_years', '62_to_64_years',
        '65_and_66_years', '67_to_69_years', '70_to_74_years', '75_to_79_years',
        '80_to_84_years', '85_years_and_over',
    )
    for sex in ('male', 'female'):
        names.append('total_' + sex)
        names.extend(sex + '_' + band for band in age_bands)

    # Population, race/ethnicity, nativity, education, mobility, commuting,
    # enrollment and degree-by-sex columns.
    names += [
        'total_population',
        'white_alone', 'black_or_african_american_alone',
        'american_indian_and_alaska_native', 'asian_alone',
        'native_hawaiian_and_other_pacific', 'some_other_race_alone',
        'two_or_more_races', 'not_hispanic_or_latino', 'hispanic_or_latino',
        'native', 'native_born_in_state_of_residence',
        'native_born_in_other_state', 'native_born_outside_the_us',
        'foreign_born',
        'population_25_and_over', 'less_than_high_school_graduate',
        'high_school_graduate', 'some_college_or_associate_degree',
        'bachelors_degree', 'graduate_or_professional_degree',
        'same_house_1_year_ago', 'moved_within_same_county',
        'moved_from_different_county', 'moved_from_different_state',
        'moved_from_abroad',
        'workers_16_and_over', 'public_transportation', 'walked',
        'worked_at_home',
        'enrolled_in_school', 'enrolled_in_college',
        'enrolled_in_school_graduate_professional',
        'population_25_and_over.1',
        'male_bachelors', 'male_master', 'male_professional_school',
        'male_doctorate',
        'female_bachelor', 'female_master', 'female_professional_school',
        'female_doctorate',
    ]

    # Income columns: poverty, household-income brackets, median and quantiles.
    names += ['income_below_poverty', 'total_household_income']
    names += ['income_' + bracket for bracket in (
        'less_than_10000', '10000_to_14999', '15000_to_19999',
        '20000_to_24999', '25000_to_29999', '30000_to_34999',
        '35000_to_39999', '40000_to_44999', '45000_to_49999',
        '50000_to_59999', '60000_to_74999', '75000_to_99999',
        '100000_to_124999', '125000_to_149999', '150000_to_199999',
        '200000_or_more',
    )]
    names += [
        'median_household_income',
        'income_lowest_quintile', 'income_second_quintile',
        'income_third_quintile', 'income_fourth_quintile',
        'income_highest_quintile', 'income_top_5_percent',
    ]

    # Veteran status, labor force and occupation columns.
    names += [
        'veteran', 'nonveteran',
        'population_16_and_over', 'in_labor_force', 'civilian_in_labor_force',
        'civilian_employed', 'civilian_unemployed', 'in_armed_forces',
        'total_not_in_labor_force', 'fulltime_civilian_employed_16_and_over',
        'male_natural_resources',
        'male_production_transportation_and_material_moving',
        'female_natural_resources',
        'female_production_transportation_and_material_moving',
    ]

    # Housing stock, tenure, rent, home-value brackets and owner costs.
    names += [
        'total_housing_units', 'occupied', 'vacant',
        'occupied_housing_units', 'owner_occupied', 'renter_occupied',
        'owner_occupied_housing_units', 'housing_units_with_a_mortgage',
        'housing_units_without_a_mortgage', 'renter_occupied_housing_units',
        'median_gross_rent', 'total_owner_occupied_housing_units',
    ]
    names += ['housing_value_' + bracket for bracket in (
        'below_10000', '10000_to_14999', '15000_to_19999', '20000_to_24999',
        '25000_to_29999', '30000_to_34999', '35000_to_39999',
        '40000_to_49999', '50000_to_59999', '60000_to_69999',
        '70000_to_79999', '80000_to_89999', '90000_to_99999',
        '100000_to_124999', '125000_to_149999', '150000_to_174999',
        '175000_to_199999', '200000_to_249999', '250000_to_299999',
        '300000_to_399999', '400000_to_499999', '500000_to_749999',
        '750000_to_999999', '1000000_to_1499999',
    )]
    names.append('median_monthly_owner_costs')

    # Aggregated sex-by-age group columns.
    names += [
        'm75_above', 'f75_above', 'm60_74', 'f60_74', 'm45_59', 'f45_59',
        'm30_44', 'f30_44', 'm18_29', 'f18_29', 'm18_below', 'f18_below',
        'm18_above', 'f18_above', 'm25_above', 'f25_above',
    ]

    # Share features: each stem appears as l1_, l2_, diff1_ and diff2_.
    share_stems = (
        'mpop_share', 'fpop_share',
        'm18_below_share', 'f18_below_share', '18_below_share',
        'm18_above_share', 'f18_above_share', '18_above_share',
        'm18_29_share', 'f18_29_share', '18_29_share',
        'm30_44_share', 'f30_44_share', '30_44_share',
        'm45_59_share', 'f45_59_share', '45_59_share',
        'm60_74_share', 'f60_74_share', '60_74_share',
        'm75_above_share', 'f75_above_share', '75_above_share',
        'white_share', 'black_share', 'asian_share', 'hispanic_share',
        'otherrace_share',
        'native_share', 'nativeinstate_share', 'nativeoutofstate_share',
        'foreignborn_share',
        'hs_below_share', 'hs_share', 'somecollege_share', 'college_share',
        'graddeg_share',
        'samehouse_share', 'samecounty_share', 'samestate_share',
        'diffstate_share', 'liveabroad_share',
        'ptransport_share', 'walktowork_share', 'workathome_share',
        'inschool_share', 'incollege_share', 'ingradschool_share',
        'm_college_share', 'm_graddeg_share', 'm_phd_share',
        'f_college_share', 'f_graddeg_share', 'f_phd_share',
        'poverty_share',
        'hhinc_10k_less_share', 'hhinc_30k_less_share',
        'hhinc_50k_less_share', 'hhinc_75k_more_share',
        'hhinc_100k_more_share', 'hhinc_125k_more_share',
        'hhinc_150k_more_share', 'hhinc_200k_more_share',
        'veteran_share', 'lfp_share',
        'unemp_rate',  # the only stem without the "_share" suffix
        'armedforce_share', 'naturalresourcesemp_share',
        'bluecollaremp_share',
        'vacanthousing_share', 'renter_share', 'mortgage_share',
        'housevalue_10k_less_share', 'housevalue_50k_less_share',
        'housevalue_100k_less_share', 'housevalue_300k_more_share',
        'housevalue_500k_more_share', 'housevalue_750k_more_share',
        'housevalue_1m_more_share',
    )
    for stem in share_stems:
        for prefix in ('l1', 'l2', 'diff1', 'diff2'):
            names.append(prefix + '_' + stem)

    # Scaled features: only the L2_, diff1_ and diff2_ variants are listed.
    scaled_stems = (
        'median_household_income_scaled',
        'income_lowest_quintile_scaled', 'income_second_quintile_scaled',
        'income_third_quintile_scaled', 'income_fourth_quintile_scaled',
        'income_highest_quintile_scaled', 'income_top_5_percent_scaled',
        'median_gross_rent_scaled', 'median_monthly_owner_costs_scaled',
    )
    for stem in scaled_stems:
        for prefix in ('L2', 'diff1', 'diff2'):
            names.append(prefix + '_' + stem)

    names.append('year_merge')
    return names


dropvars_model = _build_dropvars_model()

In [105]:
# Collect the drop-list names that begin with lower-case "l1" (the match is
# case-sensitive). Despite the variable name, a later cell subtracts this set
# from the drop list, so these columns end up being kept.
regex_l1 = re.compile(r'^l1', re.UNICODE)
l1_to_drop = {name for name in dropvars_model if regex_l1.search(name)}
l1_to_drop

{'l1_18_29_share',
 'l1_18_above_share',
 'l1_18_below_share',
 'l1_30_44_share',
 'l1_45_59_share',
 'l1_60_74_share',
 'l1_75_above_share',
 'l1_armedforce_share',
 'l1_asian_share',
 'l1_black_share',
 'l1_bluecollaremp_share',
 'l1_college_share',
 'l1_diffstate_share',
 'l1_f18_29_share',
 'l1_f18_above_share',
 'l1_f18_below_share',
 'l1_f30_44_share',
 'l1_f45_59_share',
 'l1_f60_74_share',
 'l1_f75_above_share',
 'l1_f_college_share',
 'l1_f_graddeg_share',
 'l1_f_phd_share',
 'l1_foreignborn_share',
 'l1_fpop_share',
 'l1_graddeg_share',
 'l1_hhinc_100k_more_share',
 'l1_hhinc_10k_less_share',
 'l1_hhinc_125k_more_share',
 'l1_hhinc_150k_more_share',
 'l1_hhinc_200k_more_share',
 'l1_hhinc_30k_less_share',
 'l1_hhinc_50k_less_share',
 'l1_hhinc_75k_more_share',
 'l1_hispanic_share',
 'l1_housevalue_100k_less_share',
 'l1_housevalue_10k_less_share',
 'l1_housevalue_1m_more_share',
 'l1_housevalue_300k_more_share',
 'l1_housevalue_500k_more_share',
 'l1_housevalue_50k_less_share

In [106]:
# Collect the drop-list names that begin with "diff1". Despite the variable
# name, a later cell subtracts this set from the drop list, so these columns
# end up being kept.
regex_diff1 = re.compile(r'^diff1', re.UNICODE)
diff1_to_drop = {name for name in dropvars_model if regex_diff1.search(name)}
diff1_to_drop

{'diff1_18_29_share',
 'diff1_18_above_share',
 'diff1_18_below_share',
 'diff1_30_44_share',
 'diff1_45_59_share',
 'diff1_60_74_share',
 'diff1_75_above_share',
 'diff1_armedforce_share',
 'diff1_asian_share',
 'diff1_black_share',
 'diff1_bluecollaremp_share',
 'diff1_college_share',
 'diff1_diffstate_share',
 'diff1_f18_29_share',
 'diff1_f18_above_share',
 'diff1_f18_below_share',
 'diff1_f30_44_share',
 'diff1_f45_59_share',
 'diff1_f60_74_share',
 'diff1_f75_above_share',
 'diff1_f_college_share',
 'diff1_f_graddeg_share',
 'diff1_f_phd_share',
 'diff1_foreignborn_share',
 'diff1_fpop_share',
 'diff1_graddeg_share',
 'diff1_hhinc_100k_more_share',
 'diff1_hhinc_10k_less_share',
 'diff1_hhinc_125k_more_share',
 'diff1_hhinc_150k_more_share',
 'diff1_hhinc_200k_more_share',
 'diff1_hhinc_30k_less_share',
 'diff1_hhinc_50k_less_share',
 'diff1_hhinc_75k_more_share',
 'diff1_hispanic_share',
 'diff1_housevalue_100k_less_share',
 'diff1_housevalue_10k_less_share',
 'diff1_housevalue_

In [107]:
# Size the drop list should have once the 'l1' and 'diff1' names are taken out of it.
len(set(dropvars_model)) - (len(l1_to_drop) + len(diff1_to_drop))

376

In [108]:
# Take the 'l1' and 'diff1' names out of the drop list.
# NOTE: this rebinds dropvars_model, so re-running the cell that builds
# diff1_to_drop after this one would filter the already-reduced set.
dropvars_model = set(dropvars_model).difference(l1_to_drop, diff1_to_drop)
len(dropvars_model)

376

In [128]:
# (rows, columns) of the merged frame before the dropvars_model columns are removed.
acs_election_merged.shape

(1740, 681)

In [133]:
# Remove every column still listed in dropvars_model.
acs_election_merged_dropped = acs_election_merged.drop(columns=dropvars_model)

In [134]:
# (rows, columns) after the drop; the row count should be unchanged.
acs_election_merged_dropped.shape

(1740, 305)

In [135]:
# Keep every row whose year_x is not 2012.
acs_election_merged_dropped_14_18 = acs_election_merged_dropped.loc[
    acs_election_merged_dropped['year_x'] != 2012
]


In [136]:
# (rows, columns) after filtering out the 2012 rows.
acs_election_merged_dropped_14_18.shape

(1305, 305)

In [139]:
# Number of distinct districts remaining after the year filter.
acs_election_merged_dropped_14_18['district_id_x'].nunique()

435

In [140]:
# Distinct year_x values remaining after the year filter.
acs_election_merged_dropped_14_18['year_x'].unique()

array([2014, 2016, 2018])

In [116]:
# Preview the first rows only; dumping the whole frame bloats the notebook
# and hides the narrative.
acs_election_merged_dropped.head(10)

Unnamed: 0,yr_district_id,year_x,state,district_id_x,winner_voteshare,total_dem_vote_share,total_rep_vote_share,dL1_winner,dL2_winner,dL3_winner,...,L1_income_fourth_quintile_scaled,diff1_income_fourth_quintile_scaled,L1_income_highest_quintile_scaled,diff1_income_highest_quintile_scaled,L1_income_top_5_percent_scaled,diff1_income_top_5_percent_scaled,L1_median_gross_rent_scaled,diff1_median_gross_rent_scaled,L1_median_monthly_owner_costs_scaled,diff1_median_monthly_owner_costs_scaled
0,2012-AK-00,2012,Alaska,AK-00,0.639384,0.286149,0.691240,1,1,1,...,0.966714,-0.147039,0.431134,-0.126186,0.169088,-0.063642,0.474385,0.202538,0.244528,-0.000840
1,2014-AK-00,2014,Alaska,AK-00,0.509657,0.409672,0.585763,1,1,1,...,0.870943,0.118137,0.407658,-0.109221,0.111978,-0.092078,0.876998,-0.141715,0.356498,-0.014218
2,2016-AK-00,2016,Alaska,AK-00,0.503209,0.360220,0.606292,1,1,1,...,0.892237,-0.128118,0.285165,0.138985,0.023688,0.323404,0.829523,-0.205865,0.435842,-0.059753
3,2018-AK-00,2018,Alaska,AK-00,0.530819,0.464971,0.530819,1,1,1,...,0.760722,-0.229935,0.124363,-0.060398,-0.198030,0.061056,0.640565,-0.171456,0.353170,-0.024959
4,2012-AL-01,2012,Alabama,AL-01,0.978562,0.000000,0.978562,1,1,1,...,-0.796561,0.174819,-0.478695,-0.105975,-0.376052,-0.156825,-0.702320,0.133987,-0.862029,0.026307
5,2014-AL-01,2014,Alabama,AL-01,0.681569,0.317130,0.681569,0,0,0,...,-0.767830,-0.046758,-0.732664,-0.006424,-0.718340,0.007191,-0.613210,-0.051314,-0.900646,-0.041630
6,2016-AL-01,2016,Alabama,AL-01,0.963825,0.000000,0.963825,1,0,0,...,-0.756000,-0.079542,-0.714787,0.098492,-0.716424,0.192309,-0.599062,-0.084289,-0.841109,-0.095664
7,2018-AL-01,2018,Alabama,AL-01,0.631563,0.367765,0.631563,1,1,0,...,-0.724790,-0.080924,-0.672554,-0.002775,-0.697448,0.108855,-0.686589,-0.027177,-0.880409,-0.024897
8,2012-AL-02,2012,Alabama,AL-02,0.635989,0.363060,0.635989,1,0,0,...,-0.799948,-0.002711,-0.808006,0.011552,-0.810622,0.082367,-0.918627,-0.043614,-1.088277,-0.020951
9,2014-AL-02,2014,Alabama,AL-02,0.673425,0.325641,0.673425,1,1,0,...,-0.838412,0.022046,-0.797053,-0.093187,-0.805814,-0.075982,-0.829596,-0.098375,-1.143065,0.004196


In [141]:
# Count missing values in each column of the filtered frame.
acs_election_merged_dropped_14_18.isna().sum(axis=0)

yr_district_id                             0
year_x                                     0
state                                      0
district_id_x                              0
winner_voteshare                           0
total_dem_vote_share                       0
total_rep_vote_share                       0
dL1_winner                                 0
dL2_winner                                 0
dL3_winner                                 0
dL4_winner                                 0
dL5_winner                                 0
incumbent_L5_races                         0
incumbent_L4_races                         0
incumbent_L3_races                         0
incumbent_L2_races                         0
rep_L1_wins                                0
rep_L5_wins                                0
rep_L4_wins                                0
rep_L3_wins                                0
rep_L2_wins                                0
dem_L1_wins                                0
dem_L5_win

In [142]:
# Discard every column that contains at least one missing value.
acs_election_merged_dropped_14_18 = acs_election_merged_dropped_14_18.dropna(
    axis='columns',
    how='any',
)

In [144]:
# (rows, columns) after removing the columns that contained missing values.
acs_election_merged_dropped_14_18.shape

(1305, 278)

In [145]:
# Column names present before the NaN-column removal but absent afterwards.
dropped_cols = set(acs_election_merged_dropped.columns).difference(
    acs_election_merged_dropped_14_18.columns
)
dropped_cols

{'bluecollaremp_share',
 'diff1_bluecollaremp_share',
 'diff1_housevalue_100k_less_share',
 'diff1_housevalue_10k_less_share',
 'diff1_housevalue_1m_more_share',
 'diff1_housevalue_300k_more_share',
 'diff1_housevalue_500k_more_share',
 'diff1_housevalue_50k_less_share',
 'diff1_housevalue_750k_more_share',
 'diff1_naturalresourcesemp_share',
 'housevalue_100k_less_share',
 'housevalue_10k_less_share',
 'housevalue_1m_more_share',
 'housevalue_300k_more_share',
 'housevalue_500k_more_share',
 'housevalue_50k_less_share',
 'housevalue_750k_more_share',
 'l1_bluecollaremp_share',
 'l1_housevalue_100k_less_share',
 'l1_housevalue_10k_less_share',
 'l1_housevalue_1m_more_share',
 'l1_housevalue_300k_more_share',
 'l1_housevalue_500k_more_share',
 'l1_housevalue_50k_less_share',
 'l1_housevalue_750k_more_share',
 'l1_naturalresourcesemp_share',
 'naturalresourcesemp_share'}

Save the cleaned 2014–2018 data to `.csv`

In [146]:
# Write the filtered, NaN-free frame to disk without the row index.
acs_election_merged_dropped_14_18.to_csv(
    path_or_buf='ACS_HouseVotes_L1_14_18_0815.csv',
    index=False,
)

In [110]:
# District IDs flagged for removal, grouped by state.
# NOTE(review): the selection rule is not shown here; confirm why these
# twelve districts were chosen.
districts_to_drop = [
    'AZ-09',
    'FL-26', 'FL-27',
    'GA-14',
    'NV-04',
    'SC-07',
    'TX-33', 'TX-34', 'TX-35', 'TX-36',
    'UT-04',
    'WA-10',
]