# Project #3
## Predicting Voting Results

In [26]:
import geopandas as gpd
import mapclassify
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn import tree
from mpl_toolkits.axes_grid1 import make_axes_locatable

In [61]:
gdf = gpd.read_file('./va_admin_bndry/VirginiaCounty_ClippedToShoreline.shx')
vdf = pd.read_csv('./data/voting_VA.csv')
cdf = pd.read_csv('./data/county_adjacencies.csv')


In [62]:
gdf['FIPS_left'] = pd.to_numeric(gdf['STCOFIPS']) 
df = gdf.merge(cdf,left_on='FIPS_left',right_on='FIPS')

In [65]:
for y in vdf['year'].unique():
    df20 = vdf.loc[ vdf['year']==y,:]

    Dvotes = df20.loc[(df20['party']=='DEMOCRAT'),:].groupby('county_fips')['candidatevotes'].sum()
    Dvotes = Dvotes.rename('dem_votes_' + str(y))
    df = df.merge(Dvotes,left_on='FIPS_left',right_on='county_fips')

    Rvotes = df20.loc[(df20['party']=='REPUBLICAN'),:].groupby('county_fips')['candidatevotes'].sum()
    Rvotes = Rvotes.rename('rep_votes_' + str(y))
    df = df.merge(Rvotes,left_on='FIPS_left',right_on='county_fips')

In [68]:
df.head()

Unnamed: 0,STCOFIPS,GNIS,NAME,NAMELSAD,GSOURCE,LADOPT,LASTUPDATE,JURISTYPE,AREASQMI,Shape_Leng,...,dem_votes_2004_y,rep_votes_2004_y,dem_votes_2008_y,rep_votes_2008_y,dem_votes_2012_y,rep_votes_2012_y,dem_votes_2016_y,rep_votes_2016_y,dem_votes_2020_y,rep_votes_2020_y
0,51001,1480091,Accomack,Accomack County,T,N,2014-08-20,CO,1195.366291,4949957.0,...,5518,7726,7607,7833,7655,8213,6740,8583,7578,9172
1,51003,1675170,Albemarle,Albemarle County,L,Y,2021-03-29,CO,725.634334,231019.8,...,22088,21189,29792,20576,29757,23297,33345,19259,42466,20804
2,51005,1492459,Alleghany,Alleghany County,V,N,2014-09-15,CO,449.544769,206265.4,...,3203,3962,3553,3715,3403,3595,2166,4874,2243,5859
3,51007,1497770,Amelia,Amelia County,T,Y,2021-03-29,CO,360.945234,177661.7,...,1862,3499,2488,3970,2490,4331,2128,4708,2411,5390
4,51009,1480095,Amherst,Amherst County,V,N,2020-09-11,CO,478.766949,187661.0,...,4866,7758,6094,8470,5900,8876,5057,9719,5672,11041


In [67]:
df['vote_diff'] = df['rep_votes_2020']-df['dem_votes_2020']

KeyError: 'rep_votes_2020'

Unnamed: 0,rep_votes_2020,dem_votes_2020,vote_diff
28,168401,419943,-251542
132,5221,6610,-1389
46,23153,25553,-2400
75,2547,1513,1034
33,30558,17207,13351
...,...,...,...
87,2219,2827,-608
103,754,1612,-858
67,7485,1954,5531
24,3019,2227,792


In [50]:
df = df[['rep_votes_2020', 'dem_votes_2020', 'vote_diff']]

In [51]:
# Split data into training rows and testing rows:
N = df.shape[0]
df = df.sample(frac=1, random_state=100) # randomize the order in which data appears
train_size = int(.8*N)
df_train = df[0:train_size]
N_train = df_train.shape[0]
df_test = df[train_size:]
N_test = df_test.shape[0]

# Bootstrap:
T = 1000
m_depth = 5
Rsq = np.zeros(T) # preallocate the rsq measure
y_test_hat = np.zeros([T,N_test]) # preallocate predictions on test set
df_train.head()

# Split data into train/test:
X_train = df_train.drop(['vote_diff'],axis=1)
y_train = df_train['vote_diff']
X_test = df_test.drop(['vote_diff'],axis=1)
y_test = df_test['vote_diff']



In [52]:
# Bootstrapping:
for s in range(T):
    # Generate a bootstrap sample:
    df_s = df_train.sample(frac=1, replace=True)
    X_s = df_s.drop('vote_diff',axis=1)
    y_s = df_s['vote_diff']
    # Fit decision tree:
    cart = tree.DecisionTreeRegressor(max_depth=m_depth) # Create a classifier object
    cart = cart.fit(X_s, y_s) # Fit the classifier
    # Compute Rsq:
    y_hat = cart.predict(X_test)
    SSE = np.sum( (y_test-y_hat)**2 )
    TSS = np.sum( (y_test-y_s.mean())**2 )
    Rsq[s] = 1 - SSE/TSS
    # Make and Save Predictions:
    y_test_hat[s,:] = y_hat

In [53]:


# Ensemble predictor:
y_hat_ensemble = y_test_hat.mean(axis=0) # Average the columns to get the ensemble prediction
SSE = np.sum( (y_test-y_hat_ensemble)**2 )
TSS = np.sum( (y_test-y_train.mean())**2 )
Rsq_ensemble = 1 - SSE/TSS

print(Rsq_ensemble)

0.9202345959414986


In [57]:
y_test_hat[0]

array([  8102.375     ,  17405.        ,   3631.44444444,   -353.05882353,
         -353.05882353,  -5236.        ,   3244.33333333,  12514.        ,
         -353.05882353,   1365.93333333,   3631.44444444,   3244.33333333,
         -353.05882353,  -8594.        ,   8102.375     ,   3631.44444444,
       -10805.        , -12306.        ,  12514.        ,   1365.93333333,
         8102.375     ,   8102.375     ,   -353.05882353,   1365.93333333,
         3244.33333333,   1365.93333333,   1365.93333333])