In [3]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import auc
import time
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
## Reading in the data
data = pd.read_csv('political_demographics.csv')
data_go = pd.get_dummies(data.loc[:,'General_2016-11-08':'GetOut'])
data_preference = pd.get_dummies(pd.concat([data.loc[:,'General_2016-11-08':'Parties_Description'],
                                         data.loc[:,'preference']],axis= 1))
#data_go.shape
data_preference.shape

(1850, 101)

In [3]:
## Creating training and test data for turnout and preference

features = data.loc[:,'Voters_Age':'Parties_Description_Republican']

## Voter Turnout (GetOut)
X_go = features.values
y_go = data['GetOut_Y'].values
## Splitting data into 90 - 10 split for taining and testing
X_train_go, X_test_go, y_train_go, y_test_go = train_test_split(X_go,y_go, test_size = 0.1, random_state = 12)

## Voter Preference (preference)
X_pre = features.values
y_pre = data['preference_Y'].values
X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split(X_pre,y_pre, test_size = 0.1, random_state = 12)

In [5]:
## Pipeline of 4 models with varying parameters. Logistic Regression, Random Forest, XGBoost, and Adaptive Boosting
pipe = Pipeline([('classifier', LogisticRegression())])
pipeline_grid = [
    {'classifier': [LogisticRegression()],
    'classifier__C' : [0.001,0.01,0.1,1,10,100]},
    {'classifier': [RandomForestClassifier()],
    'classifier__max_features'  : [4, 'sqrt', 'log2'],
    'classifier__n_estimators' : [500,1000, 1500, 2000]},
    {'classifier': [XGBClassifier()],
    'classifier__learning_rate' : [0.1,0.3, 0.05, 0.75],
    'classifier__n_estimators': [50, 100, 200, 300, 500],
    'classifier__max_depth': [3,5]},
    {'classifier': [AdaBoostClassifier()],
    'classifier__learning_rate' : [0.1,0.3, 0.05, 0.75],
    'classifier__n_estimators': [50, 100, 200, 300, 500]}
]

# Voter Turnout Model

In [52]:
## Turnout Model, using the model with the highest recall as the best model
start = time.time()
## Using as many cores as possible for faster training
grid = GridSearchCV(pipe, pipeline_grid, cv = 10, n_jobs=-1,scoring= 'recall')
grid.fit(X_train_go, y_train_go)
print(time.time() - start)
grid.score(X_test_go, y_test_go)

380.03441309928894


0.27500000000000002

In [54]:
## Best model outputted with paramters
print("Best params:\n{}\n".format(grid.best_params_))

Best params:
{'classifier': XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.75, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=nan, n_estimators=200, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1), 'classifier__learning_rate': 0.75, 'classifier__max_depth': 5, 'classifier__n_estimators': 200}



In [55]:
## Confuion matrix
predicted = grid.predict(X_test_go)
confusion_matrix(y_test_go,predicted)

array([[118,  27],
       [ 29,  11]])

# Preference Model

In [59]:
### Preference Model
start = time.time()
grid = GridSearchCV(pipe, pipeline_grid, cv = 10, n_jobs=3,scoring= 'roc_auc')
grid.fit(X_train_pre, y_train_pre)
print(time.time()-start)
grid.score(X_test_pre, y_test_pre)

409.4400050640106


0.55524278676988037

In [60]:
## Best model outputted with paramters
print("Best params:\n{}\n".format(grid.best_params_))

Best params:
{'classifier': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=4, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False), 'classifier__max_features': 4, 'classifier__n_estimators': 500}



In [61]:
predicted = grid.predict(X_test_pre)
confusion_matrix(y_test_pre,predicted)

array([[39, 48],
       [38, 60]])

# SVM

In [6]:
## Trying out Support Vector Machines using the Radial Basis Function
from sklearn.svm import SVC
svm = data.loc[:,'General_2016-11-08_N':]

features_svm = svm_go.loc[:,'General_2016-11-08_N':'Parties_Description_Republican']

X_svm_go = features_svm_go.values
y_svm_go = svm_go['GetOut_Y'].values
X_train_svm_go, X_test_svm_go, y_train_svm_go, y_test_svm_go = train_test_split(X_svm_go,y_svm_go, test_size = 0.1, random_state = 12)

X_svm_pre = features_svm_pre.values
y_svm_pre = svm_pre['preference_Y'].values
X_train_svm_pre, X_test_svm_pre, y_train_svm_pre, y_test_svm_pre = train_test_split(X_svm_pre,y_svm_pre, test_size = 0.1, random_state = 12)

In [9]:
## SVM parameter grid
## Voter turnout model
param_grid_svm = {'C' : [0.001,0.01,0.1,1,100],
             'gamma': [0.001,0.01,0.1,10,100]}

grid_search_svm = GridSearchCV(SVC(), param_grid_svm, cv = 10, n_jobs=3, scoring = 'recall')

start = time.time()
grid_search_svm.fit(X_train_svm_go, y_train_svm_go)
print(time.time() - start)
grid_search_svm.score(X_test_svm_go, y_test_svm_go)

33.08904790878296


0.1206896551724138

In [10]:
print("Best params:\n{}\n".format(grid_search_svm.best_params_))

Best params:
{'C': 100, 'gamma': 0.01}



In [11]:
predicted = grid_search_svm.predict(X_test_svm_go)
confusion_matrix(y_test_svm_go,predicted)

array([[301,  46],
       [102,  14]])

In [43]:
## Preference Model

param_grid_svm = {'C' : [0.001,0.01,0.1,1,100],
             'gamma': [0.001,0.01,0.1,10,100]}

grid_search_svm = GridSearchCV(SVC(), param_grid_svm, cv = 10, n_jobs=3, scoring = 'roc_auc')

start = time.time()
grid_search_svm.fit(X_train_svm_pre, y_train_svm_pre)
print(time.time() - start)
grid_search_svm.score(X_test_svm_pre, y_test_svm_pre)

40.434685468673706


0.51626031507876968

In [44]:
print("Best params:\n{}\n".format(grid_search_svm.best_params_))

Best params:
{'C': 1, 'gamma': 0.1}



In [45]:
predicted = grid_search_svm.predict(X_test_svm_pre)
confusion_matrix(y_test_svm_pre,predicted)

array([[ 75, 140],
       [ 86, 162]])

# Running same pipeline grid searches for turnout model with a resampled dataset using SMOTE

In [41]:
#### Employing SMOTE on the training set
sm = SMOTE(random_state=12, ratio = 1.0)
x_train_res, y_train_res = sm.fit_sample(X_train_go, y_train_go)
x_train_res = np.round(x_train_res)

In [42]:
start = time.time()
grid_res = GridSearchCV(pipe, pipeline_grid, cv = 10, n_jobs=3,scoring= 'recall')
grid_res.fit(x_train_res, y_train_res)
print(time.time()-start)
grid_res.score(X_test_go, y_test_go)

514.4253783226013


0.64353448275862069

In [43]:
## Printing the best model with it's paramters
print("Best params:\n{}\n".format(grid_res.best_params_))

Best params:
{'classifier': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=4, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False), 'classifier__max_features': 4, 'classifier__n_estimators': 500}



In [44]:
predicted = grid_res.predict(X_test_go)
confusion_matrix(y_test_go,predicted)

array([[129,  16],
       [ 33,   7]])

In [None]:
#### Employing SMOTE on SVM model
sm = SMOTE(random_state=12, ratio = 1.0)
X_train_svm_go_res, y_train_svm_go_res = sm.fit_sample(X_train_svm_go, y_train_svm_go)
X_train_svm_go_res = np.round(X_train_svm_go_res)

In [None]:
grid_search_svm = GridSearchCV(SVC(), param_grid_svm, cv = 10, n_jobs=3, scoring = 'recall')

start = time.time()
grid_search_svm.fit(X_train_svm_go_res, y_train_svm_go_res)
print(time.time() - start)
grid_search_svm.score(X_test_svm_go, y_test_svm_go)