In [77]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm

from src import modelers

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Prediction algorithms (supervised learning)
1. Linear Regression for predicted population count for each winner (multivariate linear regression)


1. Logistic (get beta values, or log odds ratio and look at predict proba)
2. KNN
3. Gradient Boosting Classifier (Random Forest)
Don't forget confusion matrix!!

4. We need state results (and electoral college votes) to see if they match for sum of county results.

Can we get updated demographic information to predict the 2020 elections?

5. Can we scrape twitter data and look at popularity? Maybe use NLP/NMF to look at latent features for the different tweets? Too ambitious....
6. Hate crime statistics? Starter code to download from github is in data_clean script




In [39]:
# Import data
pd.set_option("display.max_columns", 40)
data = pd.read_csv('data/cleaned_data2.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,state,county,fips,white_pct,black_pct,hispanic_pct,foreignborn_pct,female_pct,age29andunder_pct,age65andolder_pct,median_hh_inc,clf_unemploy_pct,lesshs_pct,lesscollege_pct,rural_pct,trump16_pct,clinton16_pct,otherpres16_pct,romney12_pct,obama12_pct,otherpres12_pct,cvap_pct12,metro,rural,urban_metroadj,urban_not_metroadj
0,0,Alabama,Autauga,1001,75.683482,18.370906,2.572254,1.838362,51.176225,40.037058,13.978456,53099.0,5.591657,12.417046,75.407229,42.002162,72.766588,23.769671,3.463741,72.618252,26.587832,0.793916,58.815434,1,0,0,0
1,1,Alabama,Baldwin,1003,83.178788,9.225603,4.366698,3.26951,51.194928,35.474412,18.714851,51365.0,6.286843,9.972418,70.452889,42.279099,76.545712,19.385601,4.068687,77.358269,21.589444,1.052286,56.228504,1,0,0,0
2,2,Alabama,Barbour,1005,45.885624,47.888329,4.309762,2.859397,46.498084,37.664387,16.528895,33956.0,12.824738,26.235928,87.132213,67.789635,52.096666,46.527844,1.37549,48.22313,51.368494,0.408376,56.48589,0,0,1,0
3,3,Alabama,Bibb,1007,74.765196,21.212121,2.223994,1.351232,46.464646,37.329435,14.885699,39776.0,7.146827,19.301587,88.0,68.352607,76.40322,21.249575,2.347205,72.826603,26.152019,1.021378,47.868107,1,0,0,0
4,4,Alabama,Blount,1009,87.657701,1.557951,8.727298,4.271801,50.485235,37.240053,17.192916,46212.0,5.953833,19.968585,86.950243,89.951502,89.334844,8.425825,2.239331,86.465884,12.371907,1.162209,56.577893,1,0,0,0


In [40]:
data.drop(columns=['Unnamed: 0', 'state', 'county', 'fips', 'otherpres12_pct'], inplace=True)

In [41]:
# Split into metro, rural, urban 
metro = data[data['metro'] == 1]
rural = data[data['rural'] == 1]
urban_metroadj = data[data['urban_metroadj'] == 1]
urban_not_metroadj = data[data['urban_not_metroadj'] == 1]

In [42]:
def train_test_split_rep(df):

    # Pop ys
    trump = df.pop('trump16_pct').values()
    clinton = df.pop('clinton16_pct').values()
    other = df.pop('otherpres16_pct').values()
    X = df.values()
        
    # Make train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, trump)
    
    

In [43]:
# Extract our labels
trump = data.pop('trump16_pct')
clinton = data.pop('clinton16_pct')
other = data.pop('otherpres16_pct')

In [69]:
# Make our X and y and do 75%/25% split
X = data.values
y = trump.values

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [70]:
X_train.shape, X_test.shape

((2333, 19), (778, 19))

## Test toy model before using Kfold

In [76]:
# Standardize and fit model
X_trains, X_tests = modelers.standardize(X_train, X_test)
rmse, y_hat, model_coeff = modelers.lin_regression(X_trains, X_tests, y_train, y_test)
rmse, model_coeff

(3.499731157329143,
 array([ 1.86356975, -0.76093468, -0.71334025, -0.40602357,  0.15841547,
        -0.50062841,  0.06509976,  0.5780286 , -0.55113052,  0.53122248,
         3.76370228,  0.40379005,  8.50221868, -3.38176721, -0.56367925,
        -0.19521207,  0.19011703, -0.01400891,  0.06244529]))

In [96]:
coeff = pd.DataFrame(data = model_coeff, index=data.columns, columns=["beta"])
coeff = coeff.apply(lambda x: round(x,2))
coeff.sort_values(by='beta', ascending=False)

Unnamed: 0,beta
romney12_pct,8.5
lesscollege_pct,3.76
white_pct,1.86
median_hh_inc,0.58
lesshs_pct,0.53
rural_pct,0.4
rural,0.19
female_pct,0.16
age65andolder_pct,0.07
urban_not_metroadj,0.06


| Column   |      Beta     |  Column |   Beta    |
|----------|:-------------:|------|-------:|
romney12_pct | 8.50 | urban_metroadj | 0.01
lesscollege_pct | 3.76 |metro | -0.20
white_pct | 1.86 |foreignborn_pct | -0.41
median_hh_inc | 0.58 |age29andunder_pct | -0.50
lesshs_pct | 0.53 |clf_unemploy_pct | -0.55
rural_pct | 0.40 | cvap_pct12 | -0.56
rural | 0.19 |hispanic_pct | -0.71
female_pct | 0.16 |black_pct | -0.76
age65andolder_pct | 0.07 |obama12_pct | -3.38
urban_not_metroadj |0.06 |












In [81]:
# Get p-values by using stats models
model = sm.OLS(y_train, X_trains).fit()
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.056
Model:,OLS,Adj. R-squared (uncentered):,0.049
Method:,Least Squares,F-statistic:,7.635
Date:,"Wed, 13 May 2020",Prob (F-statistic):,9.25e-20
Time:,12:34:43,Log-Likelihood:,-12984.0
No. Observations:,2333,AIC:,26000.0
Df Residuals:,2315,BIC:,26110.0
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,1.8636,4.192,0.445,0.657,-6.357,10.085
x2,-0.7609,3.214,-0.237,0.813,-7.063,5.541
x3,-0.7133,3.323,-0.215,0.830,-7.231,5.804
x4,-0.4060,2.413,-0.168,0.866,-5.138,4.326
x5,0.1584,1.493,0.106,0.916,-2.770,3.087
x6,-0.5006,2.803,-0.179,0.858,-5.998,4.997
x7,0.0651,3.008,0.022,0.983,-5.833,5.964
x8,0.5780,2.538,0.228,0.820,-4.399,5.555
x9,-0.5511,1.859,-0.297,0.767,-4.196,3.094

0,1,2,3
Omnibus:,1338.997,Durbin-Watson:,0.005
Prob(Omnibus):,0.0,Jarque-Bera (JB):,24434.844
Skew:,-2.33,Prob(JB):,0.0
Kurtosis:,18.154,Cond. No.,5610000000000000.0


## KFold 

In [88]:
kf = KFold(n_splits=5, shuffle=True)  # almost always use shuffle=True
fold_scores = []
y_hat_mean = []

for train, test in kf.split(X_trains):
    rmse, y_hat, coeff = modelers.lin_regression(X_trains[train], 
                                          X_trains[test], 
                                          y_train[train], 
                                          y_train[test])
    fold_scores.append(rmse)
    y_hat_mean.append(y_hat.mean())
    
print(np.mean(fold_scores), np.mean(y_hat_mean))

3.293913318707857 63.12573505450214


In [89]:
3.29/63.12

0.052122940430925226