In [2]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
#from sklearn import linear_model as lm
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
dataset = pd.read_csv("crime.csv")

# Classical statistics approach
### Forward selection

In [30]:
null_lm = ols('murder_rate ~ 1', data=dataset).fit()

# dictionary of simple regressions
single_var_models = {
    "poverty": ols('murder_rate ~ poverty', data=dataset).fit(),
    "high_school": ols('murder_rate ~ high_school', data=dataset).fit(),
    "college": ols('murder_rate ~ college', data=dataset).fit(),
    "single_parent": ols('murder_rate ~ single_parent', data=dataset).fit(),
    "unemployed": ols('murder_rate ~ unemployed', data=dataset).fit(),
    "metropolitan": ols('murder_rate ~ metropolitan', data=dataset).fit(),
    "region": ols('murder_rate ~ region', data=dataset).fit() # baseline: north-central
}


In [32]:

for key in single_var_models:
    results = sm.stats.anova_lm(single_var_models[key], typ=3)
    print(results)
    # R^2 = regresion SS (intercept + explanatory variable SS) / total SS
    print("R^2 =", round((results["sum_sq"][1]/results["sum_sq"].sum()),3), "\n")

               sum_sq    df          F    PR(>F)
Intercept    0.565516   1.0   0.107489  0.744447
poverty     56.224089   1.0  10.686658  0.001999
Residual   252.535111  48.0        NaN       NaN
R^2 = 0.182 

                 sum_sq    df          F        PR(>F)
Intercept    145.505447   1.0  35.493964  2.909426e-07
high_school  111.985984   1.0  27.317372  3.710570e-06
Residual     196.773216  48.0        NaN           NaN
R^2 = 0.247 

               sum_sq    df          F    PR(>F)
Intercept   92.095520   1.0  15.168256  0.000304
college     17.322599   1.0   2.853055  0.097687
Residual   291.436601  48.0        NaN       NaN
R^2 = 0.043 

                   sum_sq    df          F        PR(>F)
Intercept       57.354449   1.0  16.474017  1.808947e-04
single_parent  141.646736   1.0  40.685435  6.605019e-08
Residual       167.112464  48.0        NaN           NaN
R^2 = 0.387 

                sum_sq    df         F    PR(>F)
Intercept     2.575526   1.0  0.456709  0.502409
unempl

The highest $R^2$ value is 0.387, belonging to the `single_parent` variable, so we'll start with that in the model (I'm going to assume, for this data, that this is better than the null model).

In [37]:
present_model = single_var_models["single_parent"]

Now let's add another variable and see how these two-varable models compare to our present model. I'll use $F$-tests to measure the improvement.

In [40]:
double_var_models = {
    "poverty": ols('murder_rate ~ single_parent + poverty', data=dataset).fit(),
    "high_school": ols('murder_rate ~ single_parent + high_school', data=dataset).fit(),
    "college": ols('murder_rate ~ single_parent + college', data=dataset).fit(),
    "unemployed": ols('murder_rate ~ single_parent + unemployed', data=dataset).fit(),
    "metropolitan": ols('murder_rate ~ single_parent + metropolitan', data=dataset).fit(),
    "region": ols('murder_rate ~ single_parent + region', data=dataset).fit() # baseline: north-central
}

In [46]:

for key in double_var_models:
    comp_results = sm.stats.anova_lm(present_model, double_var_models[key], test="F")
    print(key)
    print(comp_results, "\n")



poverty
   df_resid         ssr  df_diff   ss_diff         F    Pr(>F)
0      48.0  167.112464      0.0       NaN       NaN       NaN
1      47.0  161.538532      1.0  5.573931  1.621748  0.209111 

high_school
   df_resid         ssr  df_diff    ss_diff         F    Pr(>F)
0      48.0  167.112464      0.0        NaN       NaN       NaN
1      47.0  147.687917      1.0  19.424547  6.181641  0.016519 

college
   df_resid         ssr  df_diff   ss_diff         F    Pr(>F)
0      48.0  167.112464      0.0       NaN       NaN       NaN
1      47.0  166.614832      1.0  0.497632  0.140376  0.709593 

unemployed
   df_resid         ssr  df_diff   ss_diff         F    Pr(>F)
0      48.0  167.112464      0.0       NaN       NaN       NaN
1      47.0  162.475730      1.0  4.636733  1.341286  0.252659 

metropolitan
   df_resid         ssr  df_diff    ss_diff         F    Pr(>F)
0      48.0  167.112464      0.0        NaN       NaN       NaN
1      47.0  152.211436      1.0  14.901028  4.601154