In [202]:
%pylab inline 

import numpy as np
import pandas as pd
import scipy.stats as stats
import statsmodels.api as sm
from statsmodels.api import OLS
from linearmodels.panel import PanelOLS
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from sklearn.model_selection import train_test_split
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score

Populating the interactive namespace from numpy and matplotlib


In [227]:
df = pd.read_csv("./SSI ACA Data/ACA_ENROLL.csv")
df2 = pd.read_csv("./SSI ACA Data/state_effects.csv")
df = df.merge(df2, left_on="State", right_on="entity")

#split the 50 states into training and test sets
trainx, testx, trainy, testy = train_test_split(df[["d_pct_EmpPerCap_1", "d_pct_ACA_Enroll", "MedExpDummy", "effects"]], 
                                                df.d_pct_BenePerCap, test_size=0.2)
train_set = pd.concat([trainx, trainy], axis=1)
test_set = pd.concat([testx, testy], axis=1)

In [199]:
mod_lin_enroll = OLS.from_formula("d_pct_BenePerCap ~ d_pct_EmpPerCap_1 + d_pct_ACA_Enroll + MedExpDummy + effects -1 ", train_set).fit()
print(mod_lin_enroll.summary())

                            OLS Regression Results                            
Dep. Variable:       d_pct_BenePerCap   R-squared:                       0.422
Model:                            OLS   Adj. R-squared:                  0.358
Method:                 Least Squares   F-statistic:                     6.569
Date:                Thu, 28 Sep 2017   Prob (F-statistic):           0.000447
Time:                        01:39:54   Log-Likelihood:                 139.66
No. Observations:                  40   AIC:                            -271.3
Df Residuals:                      36   BIC:                            -264.6
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
d_pct_EmpPerCap_1    -0.8670      0.22

In [200]:
mod_lin_minimal = OLS.from_formula("d_pct_BenePerCap ~ d_pct_EmpPerCap_1 + effects -1 ", train_set).fit()
print(mod_lin_minimal.summary())

                            OLS Regression Results                            
Dep. Variable:       d_pct_BenePerCap   R-squared:                       0.395
Model:                            OLS   Adj. R-squared:                  0.363
Method:                 Least Squares   F-statistic:                     12.38
Date:                Thu, 28 Sep 2017   Prob (F-statistic):           7.23e-05
Time:                        01:40:00   Log-Likelihood:                 138.73
No. Observations:                  40   AIC:                            -273.5
Df Residuals:                      38   BIC:                            -270.1
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
d_pct_EmpPerCap_1    -0.9693      0.19

In [228]:
#just to exercise doing 5 fold cross validation, I did a nonlinear kernel ridge regression as well
#the below tunes the alpha parameter
from sklearn.preprocessing import StandardScaler

train_set["id"] = range(0, len(train_set))
train_set = train_set.reset_index()
train_set = train_set.set_index("id")

scaler = StandardScaler()
X= scaler.fit_transform(train_set[["d_pct_EmpPerCap_1", "d_pct_ACA_Enroll", "MedExpDummy", "effects"]])
y= scaler.fit_transform(train_set["d_pct_BenePerCap"].reshape(-1,1))
alphas = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.9, 0.1, 0.12, 0.15]
best_score = -np.inf
best_alpha= None
for alpha in alphas:
    krm = KernelRidge(alpha=alpha, kernel='rbf', gamma=0.1)
    result = 0
    for train, test in KFold(5).split(train_set):
        krm.fit(X[train], y[train])
        krm.predict(X[test])
        result += krm.score(X[test], y[test])
    score = result/5
    if score > best_score:
        best_alpha = alpha
        best_score=score
krm.fit(X,y)

  # This is added back by InteractiveShellApp.init_path()


KernelRidge(alpha=0.15, coef0=1, degree=3, gamma=0.1, kernel='rbf',
      kernel_params=None)

In [229]:
#so we will now test the three models against the test set.  first, the R squared for the kernel ridge model
krm_preds= krm.predict(test_set[["d_pct_EmpPerCap_1", "d_pct_ACA_Enroll", "MedExpDummy", "effects"]])

r2_score(y_pred=krm_preds, y_true=test_set["d_pct_BenePerCap"])

-4597.7415167079762

In [204]:
enroll_preds = mod_lin_enroll.predict(test_set[["d_pct_EmpPerCap_1", "d_pct_ACA_Enroll", "MedExpDummy", "effects"]])
r2_score(y_pred=enroll_preds, y_true=test_set["d_pct_BenePerCap"])

0.054869559774659349

In [213]:
min_preds = mod_lin_minimal.predict(test_set[["d_pct_EmpPerCap_1", "effects"]])
r2_score(y_pred=min_preds, y_true=test_set["d_pct_BenePerCap"])

0.073784247849253415