## District-level house election voting records data cleaning

In [2]:
import pandas as pd
import numpy as np
import math
import re
from datetime import datetime
import time

In [90]:
df = pd.read_csv('House_Votes.csv', encoding = 'unicode_escape')

In [91]:
df['dnum'] = df['district'].apply(lambda x: '{0:0>2}'.format(x))

In [92]:
df['dnum'] = df.dnum.apply(str)

In [93]:
df['year'] = df.year.apply(str)

In [94]:
df['district_id'] = df[['state_po', 'dnum']].apply(lambda x: '-'.join(x), axis=1)

In [95]:
df['yr_district_id'] = df[['year', 'district_id']].apply(lambda x: '-'.join(x), axis=1)

In [96]:
df.drop(['state_po', 'state_fips', 'state_cen', 'state_ic',
       'office', 'district', 'mode', 'unofficial', 'version', 'dnum'], axis = 1, inplace = True)

In [97]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29636 entries, 0 to 29635
Data columns (total 13 columns):
year              29636 non-null object
state             29636 non-null object
stage             29606 non-null object
runoff            20980 non-null object
special           29636 non-null bool
candidate         27515 non-null object
party             26208 non-null object
writein           29636 non-null bool
candidatevotes    29636 non-null int64
totalvotes        29636 non-null int64
Unnamed: 19       46 non-null float64
district_id       29636 non-null object
yr_district_id    29636 non-null object
dtypes: bool(2), float64(1), int64(2), object(8)
memory usage: 2.5+ MB


In [98]:
df = df[(df.stage != 'pri') & (df.runoff != True) & (df.special != True)]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29538 entries, 0 to 29635
Data columns (total 13 columns):
year              29538 non-null object
state             29538 non-null object
stage             29538 non-null object
runoff            20912 non-null object
special           29538 non-null bool
candidate         27422 non-null object
party             26115 non-null object
writein           29538 non-null bool
candidatevotes    29538 non-null int64
totalvotes        29538 non-null int64
Unnamed: 19       46 non-null float64
district_id       29538 non-null object
yr_district_id    29538 non-null object
dtypes: bool(2), float64(1), int64(2), object(8)
memory usage: 2.8+ MB


In [99]:
df.year.value_counts()

2000    1608
1996    1594
1992    1524
2010    1500
2016    1440
2002    1431
2012    1427
2008    1381
1998    1373
2018    1359
2014    1357
2006    1344
2004    1340
1976    1306
1994    1301
1980    1285
1982    1275
1984    1171
1990    1155
1978    1130
1988    1129
1986    1108
Name: year, dtype: int64

In [100]:
22*435

9570

In [101]:
df.party.unique()

array(['democrat', 'republican', nan, 'prohibition', 'national democrat',
       'independent', 'libertarian', 'peace and freedom',
       'american independent', 'u.s. labor', 'socialist workers',
       'american', 'la raza unida', 'indpendent', 'communist',
       'conservative', 'socialist labor',
       'independents for godly government', "people's", 'workers',
       'white power', 'human rights', 'independent american',
       'new majority', 'labor', 'regular democracy', 'pro-life',
       'restoration', 'individual needs center',
       "independent taxpayer's watchdog", 'politicians are crooks',
       'jobs, equality, peace', 'consumer action', 'bring us together',
       'individual americans independence', 'silent majority',
       "people's independent", 'liberal', 'mayflower', 'coequal citizens',
       'revolutionary workers', 'independent conservatives',
       'constitution', 'citizens for haas', 'united states labor',
       'aloha democratic', 'socialist', "worker'

In [102]:
df.party.value_counts()


democrat                            9026
republican                          8801
libertarian                         2559
independent                         1115
conservative                         620
green                                491
natural law                          371
liberal                              266
working families                     245
independence                         229
right to life                        225
reform                               173
peace and freedom                    165
constitution                         140
socialist workers                    131
american independent                 104
democratic-farmer-labor              101
american                              72
none                                  67
u.s. taxpayers                        63
no party affiliation                  63
independent american                  47
u.s. labor                            36
women's equality                      32
other           

In [103]:
df.head()

Unnamed: 0,year,state,stage,runoff,special,candidate,party,writein,candidatevotes,totalvotes,Unnamed: 19,district_id,yr_district_id
0,1976,Alabama,gen,False,False,Bill Davenport,democrat,False,58906,157170,0.374792,AL-01,1976-AL-01
1,1976,Alabama,gen,False,False,Jack Edwards,republican,False,98257,157170,0.625164,AL-01,1976-AL-01
2,1976,Alabama,gen,False,False,,,True,7,157170,4.5e-05,AL-01,1976-AL-01
3,1976,Alabama,gen,False,False,J. Carole Keahey,democrat,False,66288,156362,0.423939,AL-02,1976-AL-02
4,1976,Alabama,gen,False,False,,,True,5,156362,3.2e-05,AL-02,1976-AL-02


In [104]:
df['year'] = df.year.apply(int)

In [105]:
def func(x):
     return pd.Series(dict(candidatevotes = x['candidatevotes'].max(), 
                           totalvotes = x['candidatevotes'].sum()))

In [106]:
winner = pd.DataFrame(df.groupby(['yr_district_id']).apply(func))

In [107]:
winner.reset_index(inplace=True)

In [108]:
winner_df = pd.merge(winner, df[['year', 'state', 'candidate', 'party', 'writein',
       'candidatevotes', 'district_id', 'yr_district_id']], on=['yr_district_id', 'candidatevotes'])
winner_df

Unnamed: 0,yr_district_id,candidatevotes,totalvotes,year,state,candidate,party,writein,district_id
0,1976-AK-00,83722,118208,1976,Alaska,Don Young,republican,False,AK-00
1,1976-AL-01,98257,157170,1976,Alabama,Jack Edwards,republican,False,AL-01
2,1976-AL-02,90069,156362,1976,Alabama,"William L. """"Bill"""" Dickinson",republican,False,AL-02
3,1976-AL-03,106935,108048,1976,Alabama,Bill Nichols,democrat,False,AL-03
4,1976-AL-04,141490,176022,1976,Alabama,Tom Bevill,democrat,False,AL-04
5,1976-AL-05,113553,113560,1976,Alabama,Ronnie G. Flippo,democrat,False,AL-05
6,1976-AL-06,92113,162518,1976,Alabama,"John H. Buchanan, Jr.",republican,False,AL-06
7,1976-AL-07,110496,110501,1976,Alabama,Walter Flowers,democrat,False,AL-07
8,1976-AR-01,116217,168782,1976,Arkansas,Bill Alexander,democrat,False,AR-01
9,1976-AR-02,144780,167607,1976,Arkansas,Jim Guy Tucker,democrat,False,AR-02


In [109]:
winner_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9553 entries, 0 to 9552
Data columns (total 9 columns):
yr_district_id    9553 non-null object
candidatevotes    9553 non-null int64
totalvotes        9553 non-null int64
year              9553 non-null int64
state             9553 non-null object
candidate         9553 non-null object
party             9549 non-null object
writein           9553 non-null bool
district_id       9553 non-null object
dtypes: bool(1), int64(3), object(5)
memory usage: 681.0+ KB


In [110]:
winner_df['voteshare'] = winner_df.candidatevotes / winner_df.totalvotes

In [111]:
winner_df.voteshare.describe()

count    9552.000000
mean        0.670675
std         0.134570
min         0.274997
25%         0.573844
50%         0.648718
75%         0.731232
max         1.000000
Name: voteshare, dtype: float64

In [112]:
len(winner_df[winner_df.voteshare < 0.5])

391

In [113]:
winner_df.columns

Index(['yr_district_id', 'candidatevotes', 'totalvotes', 'year', 'state',
       'candidate', 'party', 'writein', 'district_id', 'voteshare'],
      dtype='object')

In [114]:
winner_df.columns = ['yr_district_id', 'winner_votes', 'totalvotes', 'year', 'state',
       'winner', 'winner_party', 'writein', 'district_id', 'winner_voteshare']

In [115]:
winner_df.head()

Unnamed: 0,yr_district_id,winner_votes,totalvotes,year,state,winner,winner_party,writein,district_id,winner_voteshare
0,1976-AK-00,83722,118208,1976,Alaska,Don Young,republican,False,AK-00,0.70826
1,1976-AL-01,98257,157170,1976,Alabama,Jack Edwards,republican,False,AL-01,0.625164
2,1976-AL-02,90069,156362,1976,Alabama,"William L. """"Bill"""" Dickinson",republican,False,AL-02,0.576029
3,1976-AL-03,106935,108048,1976,Alabama,Bill Nichols,democrat,False,AL-03,0.989699
4,1976-AL-04,141490,176022,1976,Alabama,Tom Bevill,democrat,False,AL-04,0.80382


In [116]:
winner_df.loc[winner_df["winner_party"] == 'independent']

Unnamed: 0,yr_district_id,winner_votes,totalvotes,year,state,winner,winner_party,writein,district_id,winner_voteshare
3457,1990-VT-00,117522,209856,1990,Vermont,Bernie Sanders,independent,False,VT-00,0.560013
3892,1992-VT-00,162724,281162,1992,Vermont,Bernard Sanders,independent,False,VT-00,0.578755
4327,1994-VT-00,105502,211449,1994,Vermont,Bernard Sanders,independent,False,VT-00,0.498948
4574,1996-MO-08,112472,222854,1996,Missouri,Jo Ann Emerson,independent,False,MO-08,0.504689
4749,1996-VT-00,140678,254706,1996,Vermont,Bernard Sanders,independent,False,VT-00,0.552315
5184,1998-VT-00,136403,215133,1998,Vermont,Bernard Sanders,independent,False,VT-00,0.63404
5612,2000-VA-05,143312,212705,2000,Virginia,"Virgil H. Goode, Jr.",independent,False,VA-05,0.673759
5619,2000-VT-00,196118,283366,2000,Vermont,Bernard Sanders,independent,False,VT-00,0.692101
6055,2002-VT-00,144880,225255,2002,Vermont,Bernard Sanders,independent,False,VT-00,0.643182
6490,2004-VT-00,205774,305008,2004,Vermont,Bernard Sanders,independent,False,VT-00,0.674651


In [117]:
winner_df["winner_party"]= winner_df["winner_party"].replace("democratic-farmer-labor", "democrat") 

In [118]:
winner_df["winner_party"]= winner_df["winner_party"].replace("foglietta (democrat)", "democrat") 

In [119]:
winner_df["winner_party"]= winner_df["winner_party"].replace("independent-republican", "republican") 

In [120]:
winner_df.loc[winner_df["winner"] == 'Bernie Sanders', 'winner_party'] = 'democrat'
winner_df.loc[winner_df["winner"] == 'Bernard Sanders', 'winner_party'] = 'democrat'
winner_df.loc[winner_df["winner"] == 'Jo Ann Emerson', 'winner_party'] = 'republican'
winner_df.loc[winner_df["winner"] == 'Virgil H. Goode, Jr.', 'winner_party'] = 'republican'

In [121]:
winner_df.winner_party.value_counts()

democrat      5144
republican    4405
Name: winner_party, dtype: int64

In [122]:
winner_df[winner_df.isna().any(axis=1)]

Unnamed: 0,yr_district_id,winner_votes,totalvotes,year,state,winner,winner_party,writein,district_id,winner_voteshare
5720,2002-CO-06,158851,237501,2002,Colorado,Thomas G. Tancredo,,False,CO-06,0.668843
8793,2016-FL-24,0,0,2016,Florida,Frederica S. Wilson,democrat,False,FL-24,
9123,2018-AL-05,159063,260673,2018,Alabama,Mo Brooks,,False,AL-05,0.610201
9373,2018-NJ-04,163065,294348,2018,New Jersey,Christopher H. Smith,,False,NJ-04,0.553987
9548,2018-WI-08,209410,328774,2018,Wisconsin,Mike Gallagher,,False,WI-08,0.636942


In [123]:
winner_df.loc[winner_df["winner"] == 'Thomas G. Tancredo', 'winner_party'] = 'republican'
winner_df.loc[winner_df["winner"] == 'Mo Brooks', 'winner_party'] = 'republican'
winner_df.loc[winner_df["winner"] == 'Christopher H. Smith', 'winner_party'] = 'republican'
winner_df.loc[winner_df["winner"] == 'Mike Gallagher', 'winner_party'] = 'republican'

In [133]:
winner_df.loc[(winner_df["winner"] == 'Frederica S. Wilson') & (winner_df["year"] == 2016), 'winner_votes'] = 1

In [135]:
winner_df.loc[(winner_df["winner"] == 'Frederica S. Wilson') & (winner_df["year"] == 2016), 'totalvotes'] = 1
winner_df.loc[(winner_df["winner"] == 'Frederica S. Wilson') & (winner_df["year"] == 2016), 'winner_voteshare'] = 1

In [137]:
winner_df.loc[winner_df["winner"] == 'Frederica S. Wilson']

Unnamed: 0,yr_district_id,winner_votes,totalvotes,year,state,winner,winner_party,writein,district_id,winner_voteshare
7480,2010-FL-17,106361,123370,2010,Florida,Frederica S. Wilson,democrat,False,FL-17,0.86213
7923,2012-FL-24,1,1,2012,Florida,Frederica S. Wilson,democrat,False,FL-24,1.0
8358,2014-FL-24,129192,149918,2014,Florida,Frederica S. Wilson,democrat,False,FL-24,0.861751
8793,2016-FL-24,1,1,2016,Florida,Frederica S. Wilson,democrat,False,FL-24,1.0


In [494]:
df1.party.value_counts()

republican                                    11995
democrat                                      10024
independent                                    1104
natural law                                     371
working families                                245
independence                                    229
right to life                                   225
reform                                          173
peace and freedom                               165
constitution                                    140
american independent                            104
american                                         72
none                                             67
no party affiliation                             63
u.s. taxpayers                                   63
independent american                             47
u.s. labor                                       36
women's equality                                 32
other                                            26
citizens    

In [138]:
df1 = df.copy()

In [139]:
df1["party"]= df1["party"].replace("green", "democrat") 

In [140]:
df1["party"]= df1["party"].replace("liberal", "democrat") 

In [141]:
df1["party"]= df1["party"].replace("democratic-farmer-labor", "democrat") 

In [142]:
df1["party"]= df1["party"].replace("socialist workers", "democrat") 

In [143]:
df1["party"]= df1["party"].replace("foglietta (democrat)", "democrat") 

In [144]:
df1["party"]= df1["party"].replace("independent-republican", "republican") 

In [145]:
df1["party"]= df1["party"].replace("libertarian", "republican") 

In [146]:
df1["party"]= df1["party"].replace("conservative", "republican") 

In [147]:
df1.loc[df1["candidate"] == 'Bernie Sanders', 'party'] = 'democrat'
df1.loc[df1["candidate"] == 'Bernard Sanders', 'party'] = 'democrat'
df1.loc[df1["candidate"] == 'Jo Ann Emerson', 'party'] = 'republican'
df1.loc[df1["candidate"] == 'Virgil H. Goode, Jr.', 'party'] = 'republican'

In [148]:
df1.loc[df1["candidate"] == 'Thomas G. Tancredo', 'party'] = 'republican'
df1.loc[df1["candidate"] == 'Mo Brooks', 'party'] = 'republican'
df1.loc[df1["candidate"] == 'Christopher H. Smith', 'party'] = 'republican'
df1.loc[df1["candidate"] == 'Mike Gallagher', 'party'] = 'republican'

In [151]:
df1.loc[df1["candidate"] == 'Frederica S. Wilson', 'totalvotes'] = 1

In [152]:
df1.party.value_counts()

republican                          11995
democrat                            10024
independent                          1104
natural law                           371
working families                      245
independence                          229
right to life                         225
reform                                173
peace and freedom                     165
constitution                          140
american independent                  104
american                               72
none                                   67
u.s. taxpayers                         63
no party affiliation                   63
independent american                   47
u.s. labor                             36
women's equality                       32
other                                  26
citizens                               25
socialist                              24
populist                               23
new alliance                           22
freedom                           

In [153]:
df1.columns

Index(['year', 'state', 'stage', 'runoff', 'special', 'candidate', 'party',
       'writein', 'candidatevotes', 'totalvotes', 'Unnamed: 19', 'district_id',
       'yr_district_id'],
      dtype='object')

In [154]:
dem_vote_df = df1[df1.party == 'democrat']

In [155]:
def func(x):
     return pd.Series(dict(total_dem_votes = x['candidatevotes'].sum()))

In [156]:
dem_vote = pd.DataFrame(dem_vote_df.groupby(['yr_district_id']).apply(func))

In [157]:
dem_vote.reset_index(inplace=True)

In [158]:
len(dem_vote)

9079

In [159]:
dem_vote.head()

Unnamed: 0,yr_district_id,total_dem_votes
0,1976-AK-00,34194
1,1976-AL-01,58906
2,1976-AL-02,66288
3,1976-AL-03,106935
4,1976-AL-04,141490


In [160]:
winner_df

Unnamed: 0,yr_district_id,winner_votes,totalvotes,year,state,winner,winner_party,writein,district_id,winner_voteshare
0,1976-AK-00,83722,118208,1976,Alaska,Don Young,republican,False,AK-00,0.708260
1,1976-AL-01,98257,157170,1976,Alabama,Jack Edwards,republican,False,AL-01,0.625164
2,1976-AL-02,90069,156362,1976,Alabama,"William L. """"Bill"""" Dickinson",republican,False,AL-02,0.576029
3,1976-AL-03,106935,108048,1976,Alabama,Bill Nichols,democrat,False,AL-03,0.989699
4,1976-AL-04,141490,176022,1976,Alabama,Tom Bevill,democrat,False,AL-04,0.803820
5,1976-AL-05,113553,113560,1976,Alabama,Ronnie G. Flippo,democrat,False,AL-05,0.999938
6,1976-AL-06,92113,162518,1976,Alabama,"John H. Buchanan, Jr.",republican,False,AL-06,0.566786
7,1976-AL-07,110496,110501,1976,Alabama,Walter Flowers,democrat,False,AL-07,0.999955
8,1976-AR-01,116217,168782,1976,Arkansas,Bill Alexander,democrat,False,AR-01,0.688563
9,1976-AR-02,144780,167607,1976,Arkansas,Jim Guy Tucker,democrat,False,AR-02,0.863806


In [495]:
wd_df = pd.merge(winner_df, dem_vote, on=['yr_district_id'], how='left')
wd_df

Unnamed: 0,yr_district_id,winner_votes,totalvotes,year,state,winner,winner_party,writein,district_id,winner_voteshare,total_dem_votes
0,1976-AK-00,83722,118208,1976,Alaska,Don Young,republican,False,AK-00,0.708260,34194.0
1,1976-AL-01,98257,157170,1976,Alabama,Jack Edwards,republican,False,AL-01,0.625164,58906.0
2,1976-AL-02,90069,156362,1976,Alabama,"William L. """"Bill"""" Dickinson",republican,False,AL-02,0.576029,66288.0
3,1976-AL-03,106935,108048,1976,Alabama,Bill Nichols,democrat,False,AL-03,0.989699,106935.0
4,1976-AL-04,141490,176022,1976,Alabama,Tom Bevill,democrat,False,AL-04,0.803820,141490.0
5,1976-AL-05,113553,113560,1976,Alabama,Ronnie G. Flippo,democrat,False,AL-05,0.999938,113553.0
6,1976-AL-06,92113,162518,1976,Alabama,"John H. Buchanan, Jr.",republican,False,AL-06,0.566786,69384.0
7,1976-AL-07,110496,110501,1976,Alabama,Walter Flowers,democrat,False,AL-07,0.999955,110496.0
8,1976-AR-01,116217,168782,1976,Arkansas,Bill Alexander,democrat,False,AR-01,0.688563,116217.0
9,1976-AR-02,144780,167607,1976,Arkansas,Jim Guy Tucker,democrat,False,AR-02,0.863806,144780.0


In [180]:
rep_vote_df = df1[df1.party == 'republican']

In [181]:
len(rep_vote_df)

11995

In [182]:
def rfunc(x):
     return pd.Series(dict(total_rep_votes = x['candidatevotes'].sum()))

In [183]:
rep_vote = pd.DataFrame(rep_vote_df.groupby(['yr_district_id']).apply(rfunc))

In [184]:
rep_vote.reset_index(inplace=True)

In [185]:
len(rep_vote)

8931

In [186]:
rep_vote.head()

Unnamed: 0,yr_district_id,total_rep_votes
0,1976-AK-00,83722
1,1976-AL-01,98257
2,1976-AL-02,90069
3,1976-AL-04,34531
4,1976-AL-06,92113


In [497]:
votes_df = pd.merge(wd_df, rep_vote, on=['yr_district_id'], how='left')
votes_df

Unnamed: 0,yr_district_id,winner_votes,totalvotes,year,state,winner,winner_party,writein,district_id,winner_voteshare,total_dem_votes,total_rep_votes
0,1976-AK-00,83722,118208,1976,Alaska,Don Young,republican,False,AK-00,0.708260,34194.0,83722.0
1,1976-AL-01,98257,157170,1976,Alabama,Jack Edwards,republican,False,AL-01,0.625164,58906.0,98257.0
2,1976-AL-02,90069,156362,1976,Alabama,"William L. """"Bill"""" Dickinson",republican,False,AL-02,0.576029,66288.0,90069.0
3,1976-AL-03,106935,108048,1976,Alabama,Bill Nichols,democrat,False,AL-03,0.989699,106935.0,
4,1976-AL-04,141490,176022,1976,Alabama,Tom Bevill,democrat,False,AL-04,0.803820,141490.0,34531.0
5,1976-AL-05,113553,113560,1976,Alabama,Ronnie G. Flippo,democrat,False,AL-05,0.999938,113553.0,
6,1976-AL-06,92113,162518,1976,Alabama,"John H. Buchanan, Jr.",republican,False,AL-06,0.566786,69384.0,92113.0
7,1976-AL-07,110496,110501,1976,Alabama,Walter Flowers,democrat,False,AL-07,0.999955,110496.0,
8,1976-AR-01,116217,168782,1976,Arkansas,Bill Alexander,democrat,False,AR-01,0.688563,116217.0,52565.0
9,1976-AR-02,144780,167607,1976,Arkansas,Jim Guy Tucker,democrat,False,AR-02,0.863806,144780.0,22819.0


In [509]:
votes_df.total_dem_votes.fillna(0, inplace=True)

In [510]:
votes_df.total_rep_votes.fillna(0, inplace=True)

In [511]:
votes_df['total_dem_vote_share'] = votes_df.total_dem_votes/ votes_df.totalvotes

In [512]:
votes_df['total_rep_vote_share'] = votes_df.total_rep_votes/ votes_df.totalvotes

In [513]:
votes_df.head()

Unnamed: 0,yr_district_id,winner_votes,totalvotes,year,state,winner,winner_party,writein,district_id,winner_voteshare,total_dem_votes,total_rep_votes,total_dem_vote_share,total_rep_vote_share
0,1976-AK-00,83722,118208,1976,Alaska,Don Young,republican,False,AK-00,0.70826,34194.0,83722.0,0.28927,0.70826
1,1976-AL-01,98257,157170,1976,Alabama,Jack Edwards,republican,False,AL-01,0.625164,58906.0,98257.0,0.374792,0.625164
2,1976-AL-02,90069,156362,1976,Alabama,"William L. """"Bill"""" Dickinson",republican,False,AL-02,0.576029,66288.0,90069.0,0.423939,0.576029
3,1976-AL-03,106935,108048,1976,Alabama,Bill Nichols,democrat,False,AL-03,0.989699,106935.0,0.0,0.989699,0.0
4,1976-AL-04,141490,176022,1976,Alabama,Tom Bevill,democrat,False,AL-04,0.80382,141490.0,34531.0,0.80382,0.196174


In [514]:
votes_df.loc[votes_df["winner"] == 'Frederica S. Wilson']


Unnamed: 0,yr_district_id,winner_votes,totalvotes,year,state,winner,winner_party,writein,district_id,winner_voteshare,total_dem_votes,total_rep_votes,total_dem_vote_share,total_rep_vote_share
7480,2010-FL-17,106361,123370,2010,Florida,Frederica S. Wilson,democrat,False,FL-17,0.86213,106361.0,0.0,0.86213,0.0
7923,2012-FL-24,1,1,2012,Florida,Frederica S. Wilson,democrat,False,FL-24,1.0,1.0,0.0,1.0,0.0
8358,2014-FL-24,129192,149918,2014,Florida,Frederica S. Wilson,democrat,False,FL-24,0.861751,129192.0,15239.0,0.861751,0.101649
8793,2016-FL-24,1,1,2016,Florida,Frederica S. Wilson,democrat,False,FL-24,1.0,0.0,0.0,0.0,0.0


In [515]:
votes_df.isna().sum()

yr_district_id          0
winner_votes            0
totalvotes              0
year                    0
state                   0
winner                  0
winner_party            0
writein                 0
district_id             0
winner_voteshare        0
total_dem_votes         0
total_rep_votes         0
total_dem_vote_share    0
total_rep_vote_share    0
dtype: int64

In [516]:
votes_df.to_csv('HouseVotesbyDistrict_76_18.csv', index=False)

## Feature Engineering

In [47]:
votes_df = pd.read_csv('HouseVotesbyDistrict_76_18_0814.csv')

In [48]:
votes_df.head(100)

Unnamed: 0.1,Unnamed: 0,yr_district_id,winner_votes,totalvotes,year,state,winner,winner_party,writein,district_id,winner_voteshare,total_dem_votes,total_rep_votes,total_dem_vote_share,total_rep_vote_share
0,0,1976-AK-00,83722,118208,1976,Alaska,Don Young,republican,False,AK-00,0.708260,34194,83722,0.289270,0.708260
1,435,1978-AK-00,68811,124187,1978,Alaska,Don Young,republican,False,AK-00,0.554092,55176,68811,0.444298,0.554092
2,870,1980-AK-00,114089,154618,1980,Alaska,Don Young,republican,False,AK-00,0.737877,39922,114089,0.258198,0.737877
3,1305,1982-AK-00,128274,181084,1982,Alaska,Don Young,republican,False,AK-00,0.708367,52011,128274,0.287220,0.708367
4,1740,1984-AK-00,113582,206437,1984,Alaska,Don Young,republican,False,AK-00,0.550202,86052,113582,0.416844,0.550202
5,2175,1986-AK-00,101799,180277,1986,Alaska,Don Young,republican,False,AK-00,0.564681,74053,105981,0.410773,0.587879
6,2610,1988-AK-00,120595,192955,1988,Alaska,Don Young,republican,False,AK-00,0.624990,71881,120595,0.372527,0.624990
7,3045,1990-AK-00,99003,191647,1990,Alaska,Don Young,republican,False,AK-00,0.516590,91677,99003,0.478364,0.516590
8,3480,1992-AK-00,111849,239116,1992,Alaska,Don Young,republican,False,AK-00,0.467760,111907,111849,0.468003,0.467760
9,3915,1994-AK-00,118537,208240,1994,Alaska,Don Young,republican,False,AK-00,0.569233,89449,118537,0.429548,0.569233


In [49]:
votes_df['winner'] = votes_df['winner'].map(lambda x: re.sub(r'\W+', '', x))

In [50]:
votes_df.sort_values(['district_id', 'year'], inplace=True)

In [51]:
votes_df['L1_winner']= votes_df.winner.shift()

In [52]:
votes_df['L2_winner']= votes_df.winner.shift(2)

In [53]:
votes_df['L3_winner']= votes_df.winner.shift(3)

In [54]:
votes_df['L4_winner']= votes_df.winner.shift(4)

In [55]:
votes_df['L5_winner']= votes_df.winner.shift(5)

In [56]:
votes_df['dL1_winner'] = 0
votes_df.loc[votes_df['winner'] == votes_df['L1_winner'], 'dL1_winner']= 1

In [57]:
votes_df['dL2_winner'] = 0
votes_df.loc[votes_df['winner'] == votes_df['L2_winner'], 'dL2_winner']= 1

In [58]:
votes_df['dL3_winner'] = 0
votes_df.loc[votes_df['winner'] == votes_df['L3_winner'], 'dL3_winner']= 1

In [59]:
votes_df['dL4_winner'] = 0
votes_df.loc[votes_df['winner'] == votes_df['L4_winner'], 'dL4_winner']= 1

In [60]:
votes_df['dL5_winner'] = 0
votes_df.loc[votes_df['winner'] == votes_df['L5_winner'], 'dL5_winner']= 1

In [61]:
votes_df['incumbent_party'] = votes_df.winner_party.shift()

In [62]:
votes_df['incumbent_L5_races'] = votes_df['dL1_winner'] + votes_df['dL2_winner'] + \
    votes_df['dL3_winner'] + votes_df['dL4_winner'] + votes_df['dL5_winner']

In [63]:
votes_df['incumbent_L4_races'] = votes_df['dL1_winner'] + votes_df['dL2_winner'] + \
    votes_df['dL3_winner'] + votes_df['dL4_winner'] 

In [64]:
votes_df['incumbent_L3_races'] = votes_df['dL1_winner'] + votes_df['dL2_winner'] + \
    votes_df['dL3_winner'] 

In [65]:
votes_df['incumbent_L2_races'] = votes_df['dL1_winner'] + votes_df['dL2_winner']

In [66]:
votes_df['rep_L1_wins'] = 0
votes_df.loc[votes_df['incumbent_party'] == 'republican', 'rep_L1_wins']= 1

In [67]:
votes_df['rep_L5_wins'] = votes_df.rep_L1_wins.rolling(window=5).sum()

In [68]:
votes_df['rep_L4_wins'] = votes_df.rep_L1_wins.rolling(window=4).sum()

In [69]:
votes_df['rep_L3_wins'] = votes_df.rep_L1_wins.rolling(window=3).sum()

In [70]:
votes_df['rep_L2_wins'] = votes_df.rep_L1_wins.rolling(window=2).sum()

In [71]:
votes_df['dem_L1_wins'] = 0
votes_df.loc[votes_df['incumbent_party'] == 'democrat', 'dem_L1_wins']= 1

In [72]:
votes_df['dem_L5_wins'] = votes_df.dem_L1_wins.rolling(window=5).sum()

In [73]:
votes_df['dem_L4_wins'] = votes_df.dem_L1_wins.rolling(window=4).sum()

In [74]:
votes_df['dem_L3_wins'] = votes_df.dem_L1_wins.rolling(window=3).sum()

In [75]:
votes_df['dem_L2_wins'] = votes_df.dem_L1_wins.rolling(window=2).sum()

In [76]:
votes_df['rep_L1_voteshare']= votes_df.total_rep_vote_share.shift()

In [77]:
votes_df['rep_L5_voteshare'] = votes_df.rep_L1_voteshare.rolling(window=5).mean()

In [78]:
votes_df['rep_L4_voteshare'] = votes_df.rep_L1_voteshare.rolling(window=4).mean()

In [79]:
votes_df['rep_L3_voteshare'] = votes_df.rep_L1_voteshare.rolling(window=3).mean()

In [80]:
votes_df['rep_L2_voteshare'] = votes_df.rep_L1_voteshare.rolling(window=2).mean()

In [81]:
votes_df['dem_L1_voteshare']= votes_df.total_dem_vote_share.shift()

In [82]:
votes_df['dem_L5_voteshare'] = votes_df.dem_L1_voteshare.rolling(window=5).mean()

In [83]:
votes_df['dem_L4_voteshare'] = votes_df.dem_L1_voteshare.rolling(window=4).mean()

In [84]:
votes_df['dem_L3_voteshare'] = votes_df.dem_L1_voteshare.rolling(window=3).mean()

In [85]:
votes_df['dem_L2_voteshare'] = votes_df.dem_L1_voteshare.rolling(window=2).mean()

In [86]:
votes_df['target'] = 0
votes_df.loc[votes_df['winner_party'] == 'democrat', 'target']= 1

In [87]:
votes_df['year'] = votes_df.year.apply(int)

In [88]:
votes_df_2012_2018 = votes_df[votes_df.year > 2010]

In [89]:
votes_df_2012_2018.target.value_counts()

0    918
1    822
Name: target, dtype: int64

In [161]:
dem_vote_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10024 entries, 0 to 29632
Data columns (total 13 columns):
year              10024 non-null int64
state             10024 non-null object
stage             10024 non-null object
runoff            7168 non-null object
special           10024 non-null bool
candidate         10024 non-null object
party             10024 non-null object
writein           10024 non-null bool
candidatevotes    10024 non-null int64
totalvotes        10024 non-null int64
Unnamed: 19       17 non-null float64
district_id       10024 non-null object
yr_district_id    10024 non-null object
dtypes: bool(2), float64(1), int64(3), object(7)
memory usage: 959.3+ KB


In [162]:
dem_vote_df.year = dem_vote_df.year.astype(float).fillna(0.0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [163]:
dem_vote_1218_df = dem_vote_df[dem_vote_df.year > 2010]
dem_vote_1218_df

Unnamed: 0,year,state,stage,runoff,special,candidate,party,writein,candidatevotes,totalvotes,Unnamed: 19,district_id,yr_district_id
24055,2012.0,Alabama,gen,,False,Therese Ford,democrat,False,103092,283953,,AL-02,2012-AL-02
24059,2012.0,Alabama,gen,,False,John Andrew Harris,democrat,False,98141,273930,,AL-03,2012-AL-03
24061,2012.0,Alabama,gen,,False,Daniel H. Boman,democrat,False,69706,269118,,AL-04,2012-AL-04
24065,2012.0,Alabama,gen,,False,Charlie L. Holley,democrat,False,101772,291293,,AL-05,2012-AL-05
24067,2012.0,Alabama,gen,,False,Penny H. Bailey,democrat,False,88267,308102,,AL-06,2012-AL-06
24071,2012.0,Alabama,gen,,False,Terri A. Sewell,democrat,False,232520,306558,,AL-07,2012-AL-07
24077,2012.0,Alaska,gen,,False,Sharon M. Cissna,democrat,False,82927,289804,,AK-00,2012-AK-00
24079,2012.0,Arizona,gen,,False,Ann Kirkpatrick,democrat,False,122774,251595,,AZ-01,2012-AZ-01
24082,2012.0,Arizona,gen,,False,Ron Barber,democrat,False,147338,292279,,AZ-02,2012-AZ-02
24084,2012.0,Arizona,gen,,False,Ra_l M. Grijalva,democrat,False,98468,168698,,AZ-03,2012-AZ-03


In [164]:
top_dem_vote = pd.DataFrame(dem_vote_1218_df.groupby(['yr_district_id']).candidatevotes.max())

In [165]:
top_dem_vote.reset_index(inplace=True)

In [166]:
len(top_dem_vote)

1654

In [167]:
top_dem_vote.columns

Index(['yr_district_id', 'candidatevotes'], dtype='object')

In [168]:
df.columns

Index(['year', 'state', 'stage', 'runoff', 'special', 'candidate', 'party',
       'writein', 'candidatevotes', 'totalvotes', 'Unnamed: 19', 'district_id',
       'yr_district_id'],
      dtype='object')

In [174]:
top_dem_df = pd.merge(top_dem_vote, df, on=['yr_district_id', 'candidatevotes'], how='left')
top_dem_df

Unnamed: 0,yr_district_id,candidatevotes,year,state,stage,runoff,special,candidate,party,writein,totalvotes,Unnamed: 19,district_id
0,2012-AK-00,82927,2012,Alaska,gen,,False,Sharon M. Cissna,democrat,False,289804,,AK-00
1,2012-AL-02,103092,2012,Alabama,gen,,False,Therese Ford,democrat,False,283953,,AL-02
2,2012-AL-03,98141,2012,Alabama,gen,,False,John Andrew Harris,democrat,False,273930,,AL-03
3,2012-AL-04,69706,2012,Alabama,gen,,False,Daniel H. Boman,democrat,False,269118,,AL-04
4,2012-AL-05,101772,2012,Alabama,gen,,False,Charlie L. Holley,democrat,False,291293,,AL-05
5,2012-AL-06,88267,2012,Alabama,gen,,False,Penny H. Bailey,democrat,False,308102,,AL-06
6,2012-AL-07,232520,2012,Alabama,gen,,False,Terri A. Sewell,democrat,False,306558,,AL-07
7,2012-AR-01,96601,2012,Arkansas,gen,,False,Scott Ellington,democrat,False,246843,,AR-01
8,2012-AR-02,113156,2012,Arkansas,gen,,False,Herb Rule,democrat,False,286598,,AR-02
9,2012-AR-03,39318,2012,Arkansas,gen,,False,Rebekah J. Kennedy,green,False,245660,,AR-03


In [175]:
top_dem_df.drop(['year', 'state', 'stage', 'runoff', 'special', 'party',
                 'writein', 'totalvotes', 'district_id', 'Unnamed: 19'], axis=1, inplace=True)

In [176]:
top_dem_df.head()

Unnamed: 0,yr_district_id,candidatevotes,candidate
0,2012-AK-00,82927,Sharon M. Cissna
1,2012-AL-02,103092,Therese Ford
2,2012-AL-03,98141,John Andrew Harris
3,2012-AL-04,69706,Daniel H. Boman
4,2012-AL-05,101772,Charlie L. Holley


In [177]:
top_dem_df.columns = ['yr_district_id', 'top_dem_votes', 'top_dem_candidate']

In [178]:
votes_df_2012_2018_d = pd.merge(votes_df_2012_2018, top_dem_df, on=['yr_district_id'], how='left')
votes_df_2012_2018_d

Unnamed: 0.1,Unnamed: 0,yr_district_id,winner_votes,totalvotes,year,state,winner,winner_party,writein,district_id,...,rep_L3_voteshare,rep_L2_voteshare,dem_L1_voteshare,dem_L5_voteshare,dem_L4_voteshare,dem_L3_voteshare,dem_L2_voteshare,target,top_dem_votes,top_dem_candidate
0,7813,2012-AK-00,185296,289804,2012,Alaska,DonYoung,republican,False,AK-00,...,0.591296,0.595499,0.305133,0.332127,3.561047e-01,3.875740e-01,0.377440,0,82927.0,Sharon M. Cissna
1,8248,2014-AK-00,142572,279741,2014,Alaska,DonYoung,republican,False,AK-00,...,0.627413,0.690409,0.286149,0.342113,3.622176e-01,3.470096e-01,0.295641,0,114602.0,Forrest Dunbar
2,8683,2016-AK-00,155088,308198,2016,Alaska,DonYoung,republican,False,AK-00,...,0.655527,0.638501,0.409672,0.371708,3.626752e-01,3.336511e-01,0.347910,0,111019.0,Steve Lindbeck
3,9118,2018-AK-00,149779,282166,2018,Alaska,DonYoung,republican,False,AK-00,...,0.627765,0.596028,0.360220,0.362184,3.402933e-01,3.520134e-01,0.384946,0,131199.0,Alyse S. Galvin
4,7814,2012-AL-01,196374,200676,2012,Alabama,JoBonner,republican,False,AL-01,...,0.829861,0.904273,0.000000,0.212828,1.715860e-01,1.060654e-01,0.000000,0,,
5,8249,2014-AL-01,103758,152234,2014,Alabama,BradleyByrne,republican,False,AL-01,...,0.929036,0.902201,0.000000,0.137269,7.954909e-02,0.000000e+00,0.000000,0,48278.0,Burton R. LeFlore
6,8684,2016-AL-01,208083,215893,2016,Alabama,BradleyByrne,republican,False,AL-01,...,0.828657,0.830066,0.317130,0.127065,7.928255e-02,1.057101e-01,0.158565,0,,
7,9119,2018-AL-01,153228,242617,2018,Alabama,BradleyByrne,republican,False,AL-01,...,0.874652,0.822697,0.000000,0.063426,7.928255e-02,1.057101e-01,0.158565,0,89226.0,Robert Kennedy Jr.
8,7815,2012-AL-02,180591,283953,2012,Alabama,MarthaRoby,republican,False,AL-02,...,0.566858,0.502918,0.487906,0.374879,3.947883e-01,4.315227e-01,0.495120,0,103092.0,Therese Ford
9,8250,2014-AL-02,113103,167952,2014,Alabama,MarthaRoby,republican,False,AL-02,...,0.547275,0.572859,0.363060,0.388443,4.144070e-01,4.511002e-01,0.425483,0,54692.0,Erick Wright


In [187]:
rep_vote_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11995 entries, 1 to 29633
Data columns (total 13 columns):
year              11995 non-null int64
state             11995 non-null object
stage             11995 non-null object
runoff            8563 non-null object
special           11995 non-null bool
candidate         11995 non-null object
party             11995 non-null object
writein           11995 non-null bool
candidatevotes    11995 non-null int64
totalvotes        11995 non-null int64
Unnamed: 19       16 non-null float64
district_id       11995 non-null object
yr_district_id    11995 non-null object
dtypes: bool(2), float64(1), int64(3), object(7)
memory usage: 1.1+ MB


In [188]:
rep_vote_df.year = rep_vote_df.year.astype(float).fillna(0.0)


In [189]:
rep_vote_1218_df = rep_vote_df[rep_vote_df.year > 2010]
rep_vote_1218_df

Unnamed: 0,year,state,stage,runoff,special,candidate,party,writein,candidatevotes,totalvotes,Unnamed: 19,district_id,yr_district_id
24054,2012.0,Alabama,gen,,False,Jo Bonner,republican,False,196374,200676,,AL-01,2012-AL-01
24057,2012.0,Alabama,gen,,False,Martha Roby,republican,False,180591,283953,,AL-02,2012-AL-02
24058,2012.0,Alabama,gen,,False,Mike Rogers,republican,False,175306,273930,,AL-03,2012-AL-03
24063,2012.0,Alabama,gen,,False,Robert B. Aderholt,republican,False,199071,269118,,AL-04,2012-AL-04
24066,2012.0,Alabama,gen,,False,Mo Brooks,republican,False,189185,291293,,AL-05,2012-AL-05
24069,2012.0,Alabama,gen,,False,Spencer Bachus,republican,False,219262,308102,,AL-06,2012-AL-06
24070,2012.0,Alabama,gen,,False,Don Chamberlain,republican,False,73835,306558,,AL-07,2012-AL-07
24073,2012.0,Alaska,gen,,False,Jim C. McDermott,republican,False,15028,289804,,AK-00,2012-AK-00
24075,2012.0,Alaska,gen,,False,Don Young,republican,False,185296,289804,,AK-00,2012-AK-00
24078,2012.0,Arizona,gen,,False,Kim Allen,republican,False,15227,251595,,AZ-01,2012-AZ-01


In [190]:
top_rep_vote = pd.DataFrame(rep_vote_1218_df.groupby(['yr_district_id']).candidatevotes.max())

In [191]:
top_rep_vote.reset_index(inplace=True)

In [192]:
len(top_rep_vote)

1637

In [193]:
top_rep_vote.columns

Index(['yr_district_id', 'candidatevotes'], dtype='object')

In [194]:
df.columns

Index(['year', 'state', 'stage', 'runoff', 'special', 'candidate', 'party',
       'writein', 'candidatevotes', 'totalvotes', 'Unnamed: 19', 'district_id',
       'yr_district_id'],
      dtype='object')

In [195]:
top_rep_df = pd.merge(top_rep_vote, df, on=['yr_district_id', 'candidatevotes'], how='left')
top_rep_df

Unnamed: 0,yr_district_id,candidatevotes,year,state,stage,runoff,special,candidate,party,writein,totalvotes,Unnamed: 19,district_id
0,2012-AK-00,185296,2012,Alaska,gen,,False,Don Young,republican,False,289804,,AK-00
1,2012-AL-01,196374,2012,Alabama,gen,,False,Jo Bonner,republican,False,200676,,AL-01
2,2012-AL-02,180591,2012,Alabama,gen,,False,Martha Roby,republican,False,283953,,AL-02
3,2012-AL-03,175306,2012,Alabama,gen,,False,Mike Rogers,republican,False,273930,,AL-03
4,2012-AL-04,199071,2012,Alabama,gen,,False,Robert B. Aderholt,republican,False,269118,,AL-04
5,2012-AL-05,189185,2012,Alabama,gen,,False,Mo Brooks,republican,False,291293,,AL-05
6,2012-AL-06,219262,2012,Alabama,gen,,False,Spencer Bachus,republican,False,308102,,AL-06
7,2012-AL-07,73835,2012,Alabama,gen,,False,Don Chamberlain,republican,False,306558,,AL-07
8,2012-AR-01,138800,2012,Arkansas,gen,,False,"Eric A. """"Rick"""" Crawford",republican,False,246843,,AR-01
9,2012-AR-02,158175,2012,Arkansas,gen,,False,Tim Griffin,republican,False,286598,,AR-02


In [197]:
top_rep_df.drop(['year', 'state', 'stage', 'runoff', 'special', 'party',
                 'writein', 'totalvotes', 'district_id', 'Unnamed: 19'], axis=1, inplace=True)

In [198]:
top_rep_df.head()

Unnamed: 0,yr_district_id,candidatevotes,candidate
0,2012-AK-00,185296,Don Young
1,2012-AL-01,196374,Jo Bonner
2,2012-AL-02,180591,Martha Roby
3,2012-AL-03,175306,Mike Rogers
4,2012-AL-04,199071,Robert B. Aderholt


In [199]:
top_rep_df.columns = ['yr_district_id', 'top_rep_votes', 'top_rep_candidate']

In [200]:
votes_df_2012_2018_dr = pd.merge(votes_df_2012_2018_d, top_rep_df, on=['yr_district_id'], how='left')
votes_df_2012_2018_dr

Unnamed: 0.1,Unnamed: 0,yr_district_id,winner_votes,totalvotes,year,state,winner,winner_party,writein,district_id,...,dem_L1_voteshare,dem_L5_voteshare,dem_L4_voteshare,dem_L3_voteshare,dem_L2_voteshare,target,top_dem_votes,top_dem_candidate,top_rep_votes,top_rep_candidate
0,7813,2012-AK-00,185296,289804,2012,Alaska,DonYoung,republican,False,AK-00,...,0.305133,0.332127,3.561047e-01,3.875740e-01,0.377440,0,82927.0,Sharon M. Cissna,185296.0,Don Young
1,8248,2014-AK-00,142572,279741,2014,Alaska,DonYoung,republican,False,AK-00,...,0.286149,0.342113,3.622176e-01,3.470096e-01,0.295641,0,114602.0,Forrest Dunbar,142572.0,Don Young
2,8683,2016-AK-00,155088,308198,2016,Alaska,DonYoung,republican,False,AK-00,...,0.409672,0.371708,3.626752e-01,3.336511e-01,0.347910,0,111019.0,Steve Lindbeck,155088.0,Don Young
3,9118,2018-AK-00,149779,282166,2018,Alaska,DonYoung,republican,False,AK-00,...,0.360220,0.362184,3.402933e-01,3.520134e-01,0.384946,0,131199.0,Alyse S. Galvin,149779.0,Don Young
4,7814,2012-AL-01,196374,200676,2012,Alabama,JoBonner,republican,False,AL-01,...,0.000000,0.212828,1.715860e-01,1.060654e-01,0.000000,0,,,196374.0,Jo Bonner
5,8249,2014-AL-01,103758,152234,2014,Alabama,BradleyByrne,republican,False,AL-01,...,0.000000,0.137269,7.954909e-02,0.000000e+00,0.000000,0,48278.0,Burton R. LeFlore,103758.0,Bradley Byrne
6,8684,2016-AL-01,208083,215893,2016,Alabama,BradleyByrne,republican,False,AL-01,...,0.317130,0.127065,7.928255e-02,1.057101e-01,0.158565,0,,,208083.0,Bradley Byrne
7,9119,2018-AL-01,153228,242617,2018,Alabama,BradleyByrne,republican,False,AL-01,...,0.000000,0.063426,7.928255e-02,1.057101e-01,0.158565,0,89226.0,Robert Kennedy Jr.,153228.0,Bradley Byrne
8,7815,2012-AL-02,180591,283953,2012,Alabama,MarthaRoby,republican,False,AL-02,...,0.487906,0.374879,3.947883e-01,4.315227e-01,0.495120,0,103092.0,Therese Ford,180591.0,Martha Roby
9,8250,2014-AL-02,113103,167952,2014,Alabama,MarthaRoby,republican,False,AL-02,...,0.363060,0.388443,4.144070e-01,4.511002e-01,0.425483,0,54692.0,Erick Wright,113103.0,Martha Roby


In [201]:
votes_df_2012_2018_dr['top_dem_candidate'].fillna('None', inplace=True)
votes_df_2012_2018_dr['top_rep_candidate'].fillna('None', inplace=True)

In [202]:
votes_df_2012_2018_dr['top_dem_candidate'] = votes_df_2012_2018_dr['top_dem_candidate'].map(lambda x: re.sub(r'\W+', '', x))
votes_df_2012_2018_dr['top_rep_candidate'] = votes_df_2012_2018_dr['top_rep_candidate'].map(lambda x: re.sub(r'\W+', '', x))


In [204]:
votes_df_2012_2018_dr.to_csv('HouseVotesFeatures_12_18_2.csv', index= False)

In [205]:
votes_df_2012_2018_dr = pd.read_csv('HouseVotesFeatures_12_18_2.csv')

In [206]:
votes_df_2012_2018_dr

Unnamed: 0.1,Unnamed: 0,yr_district_id,winner_votes,totalvotes,year,state,winner,winner_party,writein,district_id,...,dem_L1_voteshare,dem_L5_voteshare,dem_L4_voteshare,dem_L3_voteshare,dem_L2_voteshare,target,top_dem_votes,top_dem_candidate,top_rep_votes,top_rep_candidate
0,7813,2012-AK-00,185296,289804,2012,Alaska,DonYoung,republican,False,AK-00,...,0.305133,0.332127,3.561047e-01,3.875740e-01,0.377440,0,82927.0,SharonMCissna,185296.0,DonYoung
1,8248,2014-AK-00,142572,279741,2014,Alaska,DonYoung,republican,False,AK-00,...,0.286149,0.342113,3.622176e-01,3.470096e-01,0.295641,0,114602.0,ForrestDunbar,142572.0,DonYoung
2,8683,2016-AK-00,155088,308198,2016,Alaska,DonYoung,republican,False,AK-00,...,0.409672,0.371708,3.626752e-01,3.336511e-01,0.347910,0,111019.0,SteveLindbeck,155088.0,DonYoung
3,9118,2018-AK-00,149779,282166,2018,Alaska,DonYoung,republican,False,AK-00,...,0.360220,0.362184,3.402933e-01,3.520134e-01,0.384946,0,131199.0,AlyseSGalvin,149779.0,DonYoung
4,7814,2012-AL-01,196374,200676,2012,Alabama,JoBonner,republican,False,AL-01,...,0.000000,0.212828,1.715860e-01,1.060654e-01,0.000000,0,,,196374.0,JoBonner
5,8249,2014-AL-01,103758,152234,2014,Alabama,BradleyByrne,republican,False,AL-01,...,0.000000,0.137269,7.954909e-02,0.000000e+00,0.000000,0,48278.0,BurtonRLeFlore,103758.0,BradleyByrne
6,8684,2016-AL-01,208083,215893,2016,Alabama,BradleyByrne,republican,False,AL-01,...,0.317130,0.127065,7.928255e-02,1.057101e-01,0.158565,0,,,208083.0,BradleyByrne
7,9119,2018-AL-01,153228,242617,2018,Alabama,BradleyByrne,republican,False,AL-01,...,0.000000,0.063426,7.928255e-02,1.057101e-01,0.158565,0,89226.0,RobertKennedyJr,153228.0,BradleyByrne
8,7815,2012-AL-02,180591,283953,2012,Alabama,MarthaRoby,republican,False,AL-02,...,0.487906,0.374879,3.947883e-01,4.315227e-01,0.495120,0,103092.0,ThereseFord,180591.0,MarthaRoby
9,8250,2014-AL-02,113103,167952,2014,Alabama,MarthaRoby,republican,False,AL-02,...,0.363060,0.388443,4.144070e-01,4.511002e-01,0.425483,0,54692.0,ErickWright,113103.0,MarthaRoby


In [207]:
votes_df_2012_2018_dr['dem_incumbent_in_race'] = 0
votes_df_2012_2018_dr.loc[votes_df_2012_2018_dr['top_dem_candidate']
                          == votes_df_2012_2018_dr['L1_winner'], 'dem_incumbent_in_race'] = 1

In [208]:
votes_df_2012_2018_dr['rep_incumbent_in_race'] = 0
votes_df_2012_2018_dr.loc[votes_df_2012_2018_dr['top_rep_candidate']
                          == votes_df_2012_2018_dr['L1_winner'], 'rep_incumbent_in_race'] = 1

In [210]:
votes_df_2012_2018_dr.columns

Index(['Unnamed: 0', 'yr_district_id', 'winner_votes', 'totalvotes', 'year',
       'state', 'winner', 'winner_party', 'writein', 'district_id',
       'winner_voteshare', 'total_dem_votes', 'total_rep_votes',
       'total_dem_vote_share', 'total_rep_vote_share', 'L1_winner',
       'L2_winner', 'L3_winner', 'L4_winner', 'L5_winner', 'dL1_winner',
       'dL2_winner', 'dL3_winner', 'dL4_winner', 'dL5_winner',
       'incumbent_party', 'incumbent_L5_races', 'incumbent_L4_races',
       'incumbent_L3_races', 'incumbent_L2_races', 'rep_L1_wins',
       'rep_L5_wins', 'rep_L4_wins', 'rep_L3_wins', 'rep_L2_wins',
       'dem_L1_wins', 'dem_L5_wins', 'dem_L4_wins', 'dem_L3_wins',
       'dem_L2_wins', 'rep_L1_voteshare', 'rep_L5_voteshare',
       'rep_L4_voteshare', 'rep_L3_voteshare', 'rep_L2_voteshare',
       'dem_L1_voteshare', 'dem_L5_voteshare', 'dem_L4_voteshare',
       'dem_L3_voteshare', 'dem_L2_voteshare', 'target', 'top_dem_votes',
       'top_dem_candidate', 'top_rep_votes', '

In [214]:
votes_df_2012_2018_dr['flipped']=0
votes_df_2012_2018_dr.loc[votes_df_2012_2018_dr['winner_party']
                          != votes_df_2012_2018_dr['incumbent_party'], 'flipped'] = 1

In [216]:
votes_df_2012_2018_dr.flipped.sum()

157

In [217]:
votes_df_2012_2018_dr.flipped.sum()/len(votes_df_2012_2018_dr)

0.09022988505747126

In [222]:
flipped_df = votes_df_2012_2018_dr[votes_df_2012_2018_dr['flipped']==1]
flipped_df

Unnamed: 0.1,Unnamed: 0,yr_district_id,winner_votes,totalvotes,year,state,winner,winner_party,writein,district_id,...,dem_L3_voteshare,dem_L2_voteshare,target,top_dem_votes,top_dem_candidate,top_rep_votes,top_rep_candidate,dem_incumbent_in_race,rep_incumbent_in_race,flipped
44,7824,2012-AR-04,154149,258953,2012,Arkansas,TomCotton,republican,False,AR-04,...,0.781928,7.992354e-01,0,95013.0,GeneJeffress,154149.0,TomCotton,0,0,1
48,7825,2012-AZ-01,122774,251595,2012,Arizona,AnnKirkpatrick,democrat,False,AZ-01,...,0.476863,4.980625e-01,1,122774.0,AnnKirkpatrick,113594.0,JonathanPaton,0,0,1
52,7826,2012-AZ-02,147338,292279,2012,Arizona,RonBarber,democrat,False,AZ-02,...,0.360602,3.464394e-01,1,147338.0,RonBarber,144884.0,MarthaMcSally,0,0,1
53,8261,2014-AZ-02,109704,219351,2014,Arizona,MarthaMcSally,republican,False,AZ-02,...,0.398993,4.073385e-01,0,109543.0,RonBarber,109704.0,MarthaMcSally,1,0,1
55,9131,2018-AZ-02,161000,294152,2018,Arizona,AnnKirkpatrick,democrat,False,AZ-02,...,0.477971,4.649055e-01,1,161000.0,AnnKirkpatrick,133083.0,LeaMarquezPeterson,0,0,1
56,7827,2012-AZ-03,98468,168698,2012,Arizona,RaulGrijalva,democrat,False,AZ-03,...,0.410103,4.239864e-01,1,98468.0,RaulGrijalva,62663.0,GabrielaSaucedoMercer,0,0,1
60,7828,2012-AZ-04,162907,243760,2012,Arizona,PaulAGosar,republican,False,AZ-04,...,0.725763,7.260491e-01,0,69154.0,JohnnieRobinson,162907.0,PaulAGosar,0,0,1
76,7832,2012-AZ-08,172809,272791,2012,Arizona,TrentFranks,republican,False,AZ-08,...,0.525793,5.174158e-01,0,95635.0,GeneScharer,172809.0,TrentFranks,0,0,1
80,7833,2012-AZ-09,121881,250131,2012,Arizona,KyrstenSinema,democrat,False,AZ-09,...,0.253199,3.797988e-01,1,121881.0,KyrstenSinema,111630.0,VernonBParker,0,0,1
84,7834,2012-CA-01,168827,294213,2012,California,DougLaMalfa,republican,False,CA-01,...,0.708130,7.152290e-01,0,125386.0,JimReed,168827.0,DougLaMalfa,0,0,1


In [220]:
len(flipped_df.district_id.unique())

127

In [224]:
flipped_df.year.value_counts()

2012    71
2018    54
2014    20
2016    12
Name: year, dtype: int64

## but many of the 2012 flips might have been due to redistricting

In [227]:
votes_df_2012_2018_dr['dmargin_45_55']=0
votes_df_2012_2018_dr.loc[(votes_df_2012_2018_dr['winner_voteshare']>.45)&
                          (votes_df_2012_2018_dr['winner_voteshare']<.55), 'dmargin_45_55'] = 1

In [230]:
votes_df_2012_2018_dr.dmargin_45_55.sum()

293

In [229]:
votes_df_2012_2018_dr.dmargin_45_55.sum()/len(votes_df_2012_2018_dr)

0.16839080459770114

In [239]:
votes_df_2012_2018_dr

Unnamed: 0.1,Unnamed: 0,yr_district_id,winner_votes,totalvotes,year,state,winner,winner_party,writein,district_id,...,dem_L2_voteshare,target,top_dem_votes,top_dem_candidate,top_rep_votes,top_rep_candidate,dem_incumbent_in_race,rep_incumbent_in_race,flipped,dmargin_45_55
0,7813,2012-AK-00,185296,289804,2012,Alaska,DonYoung,republican,False,AK-00,...,0.377440,0,82927.0,SharonMCissna,185296.0,DonYoung,0,1,0,0
1,8248,2014-AK-00,142572,279741,2014,Alaska,DonYoung,republican,False,AK-00,...,0.295641,0,114602.0,ForrestDunbar,142572.0,DonYoung,0,1,0,1
2,8683,2016-AK-00,155088,308198,2016,Alaska,DonYoung,republican,False,AK-00,...,0.347910,0,111019.0,SteveLindbeck,155088.0,DonYoung,0,1,0,1
3,9118,2018-AK-00,149779,282166,2018,Alaska,DonYoung,republican,False,AK-00,...,0.384946,0,131199.0,AlyseSGalvin,149779.0,DonYoung,0,1,0,1
4,7814,2012-AL-01,196374,200676,2012,Alabama,JoBonner,republican,False,AL-01,...,0.000000,0,,,196374.0,JoBonner,0,1,0,0
5,8249,2014-AL-01,103758,152234,2014,Alabama,BradleyByrne,republican,False,AL-01,...,0.000000,0,48278.0,BurtonRLeFlore,103758.0,BradleyByrne,0,0,0,0
6,8684,2016-AL-01,208083,215893,2016,Alabama,BradleyByrne,republican,False,AL-01,...,0.158565,0,,,208083.0,BradleyByrne,0,1,0,0
7,9119,2018-AL-01,153228,242617,2018,Alabama,BradleyByrne,republican,False,AL-01,...,0.158565,0,89226.0,RobertKennedyJr,153228.0,BradleyByrne,0,1,0,0
8,7815,2012-AL-02,180591,283953,2012,Alabama,MarthaRoby,republican,False,AL-02,...,0.495120,0,103092.0,ThereseFord,180591.0,MarthaRoby,0,1,0,0
9,8250,2014-AL-02,113103,167952,2014,Alabama,MarthaRoby,republican,False,AL-02,...,0.425483,0,54692.0,ErickWright,113103.0,MarthaRoby,0,1,0,0


In [241]:
votes_df_2012_2018_dr.to_csv('HouseVotesFeatures_12_18_0814PM.csv', index= False)

In [231]:
districts_to_drop = ['AZ-09', 'FL-26', 'FL-27', 'GA-14', 'NV-04',
                     'SC-07', 'TX-33', 'TX-34', 'TX-35', 'TX-36', 'UT-04', 'WA-10']

In [232]:
votes_df_2012_2018_droppednewdistrict = votes_df_2012_2018_dr[votes_df_2012_2018_dr.district_id.isin(districts_to_drop) == False]



In [234]:
len(votes_df_2012_2018_dr)-len(votes_df_2012_2018_droppednewdistrict)

48

In [236]:
votes_df_2012_2018_droppednewdistrict.to_csv('HouseVotesFeatures_12_18_0814PM_droppednewdistrict.csv', index= False)