## Predicting Results of US Elections on a County Level

#### Table of Contents <a name='top'></a>

- [Load Modules and Set Notebook Properties](#modules)
- [Define Path and Load Data](#load)
- [Inspect Data](#inspect)
- [Define a State Selection Function ](#stateselect)
- [Define Scaling Function](#scale)
- [Create Different Classifier Models](#create)
- [Find the Best Model](#evaluate)
- [Evaluate and Choose Models](#evaluate)
- [Predict](#predict)
- [Prepare Submission](#submit)

[go to end](#end)

#### Load Modules and Set Notebook Properties <a name='modules'></a>

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import sys
import os

In [12]:
from config import ALL_STATES

In [13]:
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, StandardScaler, MinMaxScaler, Normalizer

In [14]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

In [15]:
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None
sns.set_style("darkgrid")

#### Define Path and Load Data  <a name='load'></a> 

In [16]:
OUTPUT_PATH = 'outputs'

In [17]:
df = pd.read_csv(os.path.join(OUTPUT_PATH, 'results_2020.csv'))
df_ = df[['county_fips', 'state', 'winner']]

In [18]:
df2 = pd.read_csv(os.path.join(OUTPUT_PATH, 'rate_natural_increase_population_2020.csv'))
df3 = pd.read_csv(os.path.join(OUTPUT_PATH, 'rate_international_migration_2020.csv'))
df4 = pd.read_csv(os.path.join(OUTPUT_PATH, 'rate_domestic_migration_2020.csv'))
df5 = pd.read_csv(os.path.join(OUTPUT_PATH, 'facebook_2014.csv'))
df6 = pd.read_csv(os.path.join(OUTPUT_PATH, 'social_capital.csv')) 
df7 = pd.read_csv(os.path.join(OUTPUT_PATH, 'infra.csv')) 
df8 = pd.read_csv(os.path.join(OUTPUT_PATH, 'broadband.csv')) 

In [19]:
merged = df_.merge(df2, on='county_fips', how='left')\
            .merge(df3, on='county_fips', how='left')\
            .merge(df4, on='county_fips', how='left')\
            .merge(df5, on='county_fips', how='left')\
            .merge(df6, on='county_fips', how='left')\
            .merge(df7, on='county_fips', how='left')\
            .merge(df8, on='county_fips', how='left')

In [20]:
merged.fillna(0, inplace=True)

#### Inspect Data  <a name='inspect'></a> 

In [21]:
merged.head()

Unnamed: 0,county_fips,state,winner,rate_natural_increase_population,rate_international_migration,rate_domestic_migration,sh050m,sh100m,sh500m,Religious2014,Civic2014,Business2014,Political2014,Professional2014,Labor2014,Bowling2014,Recreational2014,Golf2014,Sports2014,assn2014,pvote2012,respn2010,nccs2014,sk2014,streets_per_node_avg,pctpopwbbacc
0,1001.0,ALABAMA,republican,1.490099,-0.287248,4.84731,0.206411,0.35084,0.291315,53.0,7.0,3.0,0.0,1.0,4.0,1.0,5.0,2.0,0.0,1.37457,0.644956,0.78,157.0,-0.631003,2.455156,77.6
1,1003.0,ALABAMA,republican,-0.099753,0.362739,24.017829,-0.099281,-0.641534,-0.209279,169.0,16.0,9.0,0.0,3.0,1.0,1.0,25.0,7.0,0.0,1.15666,0.674735,0.73,718.0,-0.555396,2.501281,72.7
2,1005.0,ALABAMA,republican,-2.259978,0.524638,-5.690302,0.359257,0.516236,0.666761,19.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.820436,0.665031,0.63,92.0,-0.891036,2.526385,49.4
3,1007.0,ALABAMA,republican,-0.536181,0.446818,1.385134,1.582025,1.260516,0.917058,20.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.020001,0.656838,0.58,54.0,-0.906582,2.398802,1.7
4,1009.0,ALABAMA,republican,-0.103809,0.103809,1.020788,1.582025,1.260516,0.917058,39.0,0.0,1.0,0.0,0.0,3.0,0.0,3.0,3.0,0.0,0.849839,0.708668,0.8,108.0,-1.01328,2.33452,53.8


In [33]:
merged.mean()

county_fips                         30295.029505
rate_natural_increase_population        0.528214
rate_international_migration            0.625945
rate_domestic_migration                -0.270653
sh050m                                  0.000111
sh100m                                  0.000381
sh500m                                  0.000609
Religious2014                          58.168464
Civic2014                               8.195114
Business2014                            4.981916
Political2014                           0.834391
Professional2014                        2.082805
Labor2014                               4.484772
Bowling2014                             1.224937
Recreational2014                       10.003173
Golf2014                                3.609454
Sports2014                              0.302348
assn2014                                1.365616
pvote2012                               0.660039
respn2010                               0.696488
nccs2014            

#### Define a State Selection Function <a name='stateselect'></a> 

In [22]:
def select_states(states=ALL_STATES):
    
    return merged[merged['state'].isin(states)]

#### Define Scaling Function <a name='scale'></a> 

In [23]:
def scale_values(X_train, X_test, scaler='standard'):
    
    scaler_dict = {'standard': StandardScaler(), 
                    'minmax': MinMaxScaler(), 
                    'normal': Normalizer()}
    if scaler is None:
        return X_train, X_test
    elif scaler not in scaler_dict.keys():
        raise ValueError("Enter a valid value for scaler! Choose between 'standard', 'minmax', 'normal' or None.")
    else:
        scl = scaler_dict[scaler]
        X_train = scl.fit_transform(X_train)
        X_test = scl.transform(X_test) 
        return X_train, X_test

#### Define Classifiers

In [24]:
knn = KNeighborsClassifier(n_neighbors=5)
svc = SVC(probability=True)
log_reg =  LogisticRegression(penalty="l2", C=1e42, solver='liblinear')
nb = BernoulliNB(binarize=True)
dt = DecisionTreeClassifier(criterion = 'entropy', 
                            max_depth=3, 
                            random_state = 0)
xgb = XGBClassifier(n_estimators=100, 
                    objective='binary:logistic', 
                    use_label_encoder=False, 
                    eval_metric='error')

In [25]:
classifier_dict = {'k-Nearest Neighbors': knn,
                   'Support Vector': svc,
                   'Logistic Regression': log_reg,
                   'Naive Bayes': nb,
                   'Decision Tree': dt,
                   'XGBoost': xgb}

#### Test Classifiers without Hyperparameter Tuning

In [26]:
final = select_states(['ARIZONA', 'GEORGIA', 'PENNSYLVANIA', 'FLORIDA', 'OHIO', 'MICHIGAN', 'WISCONSIN'])
# final = select_states()
X = final.drop(['county_fips', 'state', 'winner', 'sh100m','sh500m',
                'Business2014','Political2014','Professional2014',
                'Labor2014','Bowling2014','Recreational2014','Golf2014',
                'Sports2014','assn2014','pvote2012','respn2010','nccs2014','sk2014'], axis=1)
y = [1 if x == 'republican' else 0 for x in final['winner']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=27)
X_train, X_test = scale_values(X_train, X_test, scaler='minmax')

In [27]:
print('Accuracy Scores')
print('===============')
for i in classifier_dict:
    clf = classifier_dict[i]
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    accuracy = accuracy_score(pred, y_test)
    print('{}: Accuracy: {:.4f}, AUC {:.4f}'.format(i, accuracy, roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])))

Accuracy Scores
k-Nearest Neighbors: Accuracy: 0.8976, AUC 0.8122
Support Vector: Accuracy: 0.8614, AUC 0.8571
Logistic Regression: Accuracy: 0.8916, AUC 0.8545
Naive Bayes: Accuracy: 0.8554, AUC 0.5000
Decision Tree: Accuracy: 0.8916, AUC 0.6812
XGBoost: Accuracy: 0.8976, AUC 0.8407


#### Tune Classifiers

In [28]:
knn2 = KNeighborsClassifier()
svc2 = SVC()
log_reg2 =  LogisticRegression()
dt2 = DecisionTreeClassifier()
xgb2 = XGBClassifier()

In [29]:
knn_params = {'n_neighbors': list(range(1,20))}
svc_params = {'C': np.logspace(-3, 2, 6), 
              'gamma': np.logspace(-3, 2, 6),
              'probability': [True]}
logreg_params = {'solver': ['newton-cg', 'lbfgs', 'liblinear'],
                 'penalty': ['l2'],
                 'C': [100, 10, 1.0, 0.1, 0.01]}
dt_params = {'criterion': ['gini', 'entropy'],
             'max_depth': range(1,10),
             'min_samples_split': range(1,10),
             'min_samples_leaf': range(1,10)}
xgb_params = {"learning_rate": [0.05, 0.10],
              'n_estimators': [10, 20, 40, 100],
              'objective': ['binary:logistic'],
              'use_label_encoder': [False],
              'eval_metric': ['auc']}

In [30]:
classifier_dict2 = {'k-Nearest Neighbors': [knn2, knn_params],
                    'Support Vector': [svc2, svc_params],
                    'Logistic Regression': [log_reg2, logreg_params],
                    'Decision Tree': [dt2, dt_params],
                    'XGBoost': [xgb2, xgb_params]}

In [31]:
print('Hyperparameter Optimization')
print('===========================')
for i in classifier_dict2:
    print(i)
    clf = classifier_dict2[i][0]
    params = classifier_dict2[i][1]
    grid = GridSearchCV(clf, params, cv=3, scoring='accuracy')
    grid.fit(X_train, y_train)
    print(f'Best Parameters: {grid.best_params_}')
    print('Accuracy on Training: {:.4f}'.format(grid.best_score_))
    roc_auc_score
    
    best_model = grid.best_estimator_
    pred = best_model.predict(X_test)
    accuracy = accuracy_score(pred, y_test)
    print('Accuracy on Test: {:.4f}'.format(accuracy))
    print('AUC on Test: {:.4f}'.format(roc_auc_score(y_test, best_model.predict_proba(X_test)[:,1])))
    print('+++++++++++++++++++++++++++')

Hyperparameter Optimization
k-Nearest Neighbors
Best Parameters: {'n_neighbors': 12}
Accuracy on Training: 0.8806
Accuracy on Test: 0.8855
AUC on Test: 0.8578
+++++++++++++++++++++++++++
Support Vector
Best Parameters: {'C': 1.0, 'gamma': 10.0, 'probability': True}
Accuracy on Training: 0.8728
Accuracy on Test: 0.8976
AUC on Test: 0.8427
+++++++++++++++++++++++++++
Logistic Regression
Best Parameters: {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
Accuracy on Training: 0.8702
Accuracy on Test: 0.8976
AUC on Test: 0.8530
+++++++++++++++++++++++++++
Decision Tree
Best Parameters: {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 2}
Accuracy on Training: 0.8780
Accuracy on Test: 0.8675
AUC on Test: 0.6338
+++++++++++++++++++++++++++
XGBoost
Best Parameters: {'eval_metric': 'auc', 'learning_rate': 0.05, 'n_estimators': 40, 'objective': 'binary:logistic', 'use_label_encoder': False}
Accuracy on Training: 0.8728
Accuracy on Test: 0.8916
AUC on Test: 0.8360

In [32]:
X.columns

Index(['rate_natural_increase_population', 'rate_international_migration',
       'rate_domestic_migration', 'sh050m', 'Religious2014', 'Civic2014',
       'streets_per_node_avg', 'pctpopwbbacc'],
      dtype='object')