In [31]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor

import statsmodels.api as sm


import sys
sys.path.append("../")

from src import modelers


plt.style.use('ggplot')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [30]:
# Import data
pd.set_option("display.max_columns", 40)
data = pd.read_csv('data/cleaned_data2.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,state,county,fips,white_pct,black_pct,hispanic_pct,foreignborn_pct,female_pct,age29andunder_pct,age65andolder_pct,median_hh_inc,clf_unemploy_pct,lesshs_pct,lesscollege_pct,rural_pct,trump16_pct,clinton16_pct,otherpres16_pct,romney12_pct,obama12_pct,otherpres12_pct,cvap_pct12,metro,rural,urban_metroadj,urban_not_metroadj
0,0,Alabama,Autauga,1001,75.683482,18.370906,2.572254,1.838362,51.176225,40.037058,13.978456,53099.0,5.591657,12.417046,75.407229,42.002162,72.766588,23.769671,3.463741,72.618252,26.587832,0.793916,58.815434,1,0,0,0
1,1,Alabama,Baldwin,1003,83.178788,9.225603,4.366698,3.26951,51.194928,35.474412,18.714851,51365.0,6.286843,9.972418,70.452889,42.279099,76.545712,19.385601,4.068687,77.358269,21.589444,1.052286,56.228504,1,0,0,0
2,2,Alabama,Barbour,1005,45.885624,47.888329,4.309762,2.859397,46.498084,37.664387,16.528895,33956.0,12.824738,26.235928,87.132213,67.789635,52.096666,46.527844,1.37549,48.22313,51.368494,0.408376,56.48589,0,0,1,0
3,3,Alabama,Bibb,1007,74.765196,21.212121,2.223994,1.351232,46.464646,37.329435,14.885699,39776.0,7.146827,19.301587,88.0,68.352607,76.40322,21.249575,2.347205,72.826603,26.152019,1.021378,47.868107,1,0,0,0
4,4,Alabama,Blount,1009,87.657701,1.557951,8.727298,4.271801,50.485235,37.240053,17.192916,46212.0,5.953833,19.968585,86.950243,89.951502,89.334844,8.425825,2.239331,86.465884,12.371907,1.162209,56.577893,1,0,0,0


In [4]:
# Extract our labels
trump = data.pop('trump16_pct')
clinton = data.pop('clinton16_pct')
other = data.pop('otherpres16_pct')

# Drop columns that we think are too correlated with one another
data = data.drop(columns=['Unnamed: 0', 'state', 'county', 'fips', 
                           'otherpres12_pct', 'black_pct', 'hispanic_pct'])
data2 = data.drop(columns=['obama12_pct'])
data3 = data2.drop(columns=['metro', 'rural', 'urban_metroadj', 'urban_not_metroadj'])

In [6]:
# Make our X and y and do 75%/25% split
X = data
y = trump

X_train, X_test, y_train, y_test = train_test_split(X, y)
X_trainn, X_testn = modelers.normalize(X_train, X_test)

# Keep track of indices
train_idx = X_train.index.values
test_idx = X_test.index.values

In [24]:
# Check model
rmse, r_sq, y_hat = modelers.random_forest(X_trainn, X_testn, y_train, y_test)
rmse, r_sq

(2.851794959738829, 0.9673998792699755)

In [32]:
# Save predictions
predictions = pd.DataFrame({'index_values': test_idx, 'trump_predict': y_hat, 'trump_true': y_test, 
              'clinton_predict': 100-y_hat, 'clinton_true': 100-y_test})
predictions.to_csv('data/predictions_RF.csv')


In [33]:
predictions

Unnamed: 0,index_values,trump_predict,trump_true,clinton_predict,clinton_true
272,272,24.079597,23.862324,75.920403,76.137676
2901,2901,41.206477,31.075051,58.793523,68.924949
654,654,61.579666,60.104037,38.420334,39.895963
1857,1857,32.960651,31.198488,67.039349,68.801512
1462,1462,43.749075,43.163066,56.250925,56.836934
...,...,...,...,...,...
2524,2524,87.070023,88.394062,12.929977,11.605938
865,865,69.651887,68.962650,30.348113,31.037350
2606,2606,56.841940,57.749713,43.158060,42.250287
1864,1864,72.372667,70.113230,27.627333,29.886770
