In [145]:
%pylab inline 

import numpy as np
import pandas as pd
import scipy.stats as stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from linearmodels.panel import PanelOLS
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick


Populating the interactive namespace from numpy and matplotlib


In [168]:
# Load the SSI panel, then discard the rows whose State is "United States",
# every 2006 observation, and the "Unnamed: 0" column
# (presumably an index column saved by an earlier to_csv -- confirm).
df = pd.read_csv("./SSI ACA Data/PANEL_SSI.csv")
df = df[(df["State"] != "United States") & (df["Year"] != 2006)].drop("Unnamed: 0", axis=1)

In [169]:
# (entity, time) MultiIndex: every distinct state crossed with every distinct
# year, both in order of first appearance in df.
index = pd.MultiIndex.from_product(
    [df["State"].unique(), df["Year"].unique()],
    names=["entity", "time"],
)

In [170]:
# Index every row by its OWN (State, Year) pair instead of pasting a
# state x year cartesian product onto the rows by position: the positional
# approach silently mislabels observations unless the file is a balanced
# panel sorted state-major with the same year order in every state.
# drop=False keeps the State and Year columns available to later cells, and
# the index levels are named "entity"/"time" as PanelOLS-based cells expect.
df = df.set_index(["State", "Year"], drop=False).rename_axis(["entity", "time"])

In [171]:
# Candidate regressors: every column whose name contains "d_pct", except the
# ones that also contain "BenePerCap" (the dependent variable's family).
possible_xs = [
    col
    for col in df.columns
    if "d_pct" in col and "BenePerCap" not in col
]
# Features selected so far; starts empty.
xs = []
possible_xs

['d_pct_CompPerJob',
 'd_pct_PIPerCap',
 'd_pct_GDPPerCap',
 'd_pct_CompPerCap',
 'd_pct_WagePerCap',
 'd_pct_EmpPerCap',
 'd_pct_AvgAge',
 'd_pct_Pov',
 'd_pct_CompPerJob_1',
 'd_pct_PIPerCap_1',
 'd_pct_GDPPerCap_1',
 'd_pct_CompPerCap_1',
 'd_pct_WagePerCap_1',
 'd_pct_EmpPerCap_1',
 'd_pct_AvgAge_1',
 'd_pct_Pov_1']

In [172]:
#step forward feature selection using panel OLS (built in sklearn implementation doesnt do PanelOLS as far as I can tell)
#I am selecting features based on the overall significance of the model, starting with a univariate model
#and adding additional variables as they improve the F score of the model
#
# Each round fits one entity fixed-effects model per remaining candidate
# (already-selected features + that candidate) and keeps the candidate with
# the highest F-statistic, but only if it is at least as high as the best
# F-statistic seen in any earlier round. Selection stops at the first round
# in which no candidate qualifies.
#
# The cell is safe to re-run: it works on a copy of possible_xs and rebuilds
# xs from scratch, so a second execution cannot start from a half-consumed
# candidate list with the F-statistic threshold reset to zero.

target = df.d_pct_BenePerCap
candidates = list(possible_xs)  # copy; possible_xs itself is left untouched
selected = []
highest = 0
while candidates:
    best_x = None  # winner of THIS round only; never carried over
    for candidate in candidates:
        params = selected + [candidate]
        model = PanelOLS(target, df[params], entity_effects=True)
        res = model.fit(cov_type='clustered', cluster_entity=True)
        # ">=" keeps the original tie-breaking: on equal F the later candidate wins
        if res.f_statistic.stat >= highest:
            highest = res.f_statistic.stat
            best_x = candidate
    if best_x is None:
        # No candidate matched or beat the best F so far; further rounds would
        # refit the same models with the same outcome, so stop here.
        break
    selected.append(best_x)
    candidates.remove(best_x)

xs = selected


In [174]:
# Refit the entity fixed-effects model on the selected features (xs) with
# standard errors clustered by entity, and display the fitted results.
dependent = df["d_pct_BenePerCap"]
regressors = df.loc[:, xs]
model = PanelOLS(dependent, regressors, entity_effects=True)
res = model.fit(cluster_entity=True, cov_type='clustered')
res

0,1,2,3
Dep. Variable:,d_pct_BenePerCap,R-squared:,0.2199
Estimator:,PanelOLS,R-squared (Between):,-0.0375
No. Observations:,450,R-squared (Within):,0.2199
Date:,"Thu, Sep 28 2017",R-squared (Overall):,0.0584
Time:,02:11:21,Log-likelihood,1257.2
Cov. Estimator:,Clustered,,
,,F-statistic:,112.49
Entities:,50,P-value,0.0000
Avg Obs:,9.0000,Distribution:,"F(1,399)"
Min Obs:,9.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
d_pct_EmpPerCap_1,-0.4071,0.0540,-7.5322,0.0000,-0.5133,-0.3008


In [173]:
# Display the list of regressors currently held in xs.
xs

['d_pct_EmpPerCap_1']

In [152]:
#grab each states estimated fixed effects for later use
# res.estimated_effects has one row per (entity, time); after rounding to six
# decimals, drop_duplicates collapses it to the distinct (entity, effect) pairs.
# A dedicated name is used so the panel DataFrame `df` is not overwritten by
# this unrelated table.
state_effects = res.estimated_effects.reset_index()
state_effects["effects"] = state_effects["effects"].round(6)
state_effects = state_effects[["entity", "effects"]].drop_duplicates()
# Forward slashes: the previous ".\SSI ACA Data\..." literal relied on the
# invalid escape sequences "\S" and "\s" and only resolved to the data
# directory on Windows. This form matches the read_csv paths used elsewhere.
state_effects.to_csv("./SSI ACA Data/state_effects.csv")

In [175]:
#scan candidate breakpoints from 2009 through 2013 inclusive
# For each candidate break year, a 0/1 dummy marks observations at or after
# that year, the entity fixed-effects model is refit with the dummy added to
# the selected regressors, and the full summary is printed.
# NOTE(review): no separate Wald statistic is computed here; presumably the
# intended test is the significance of the `dummies` coefficient in each
# printed summary -- confirm.
df = pd.read_csv("./SSI ACA Data/PANEL_SSI.csv")
df = df[(df['State'] != "United States") & (df['Year'] != 2006)]
df = df.drop("Unnamed: 0", axis=1)
# Index rows by their own (State, Year) values rather than by a positional
# state x year product, which would mislabel rows if the file were not a
# balanced panel sorted state-major. drop=False keeps the Year column that
# the dummy below is built from.
df = df.set_index(["State", "Year"], drop=False).rename_axis(["entity", "time"])
# Guarded so re-running the cell does not append a duplicate "dummies" entry.
if "dummies" not in xs:
    xs.append("dummies")
for break_year in range(2009, 2014):
    df["dummies"] = np.where(df["Year"] >= break_year, 1, 0)
    # Use the selected regressors (xs, which now ends with "dummies") instead
    # of a hard-coded column list, so this cell follows the selection result.
    model = PanelOLS(df.d_pct_BenePerCap, df[xs], entity_effects=True)
    res = model.fit(cov_type='clustered', cluster_entity=True)
    print(break_year)
    print(res.summary)

2009
                          PanelOLS Estimation Summary                           
Dep. Variable:       d_pct_BenePerCap   R-squared:                        0.3109
Estimator:                   PanelOLS   R-squared (Between):             -1.1584
No. Observations:                 450   R-squared (Within):               0.3109
Date:                Thu, Sep 28 2017   R-squared (Overall):             -0.6112
Time:                        02:11:50   Log-likelihood                    1285.1
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      89.793
Entities:                          50   P-value                           0.0000
Avg Obs:                       9.0000   Distribution:                   F(2,398)
Min Obs:                       9.0000                                           
Max Obs:                       9.0000   F-statistic (robust):             77.705
                       

In [164]:
# Inspect the break dummy currently stored in df; after the breakpoint loop
# finishes this is the dummy from its last iteration (Year >= 2013).
df["dummies"]

entity         time
Connecticut    2007    0
               2008    0
               2009    0
               2010    0
               2011    0
               2012    0
               2013    1
               2014    1
               2015    1
Maine          2007    0
               2008    0
               2009    0
               2010    0
               2011    0
               2012    0
               2013    1
               2014    1
               2015    1
Massachusetts  2007    0
               2008    0
               2009    0
               2010    0
               2011    0
               2012    0
               2013    1
               2014    1
               2015    1
New Hampshire  2007    0
               2008    0
               2009    0
                      ..
California     2013    1
               2014    1
               2015    1
Hawaii         2007    0
               2008    0
               2009    0
               2010    0
               2011    0
     