In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV, RidgeCV
import scipy.stats as stats
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score

def mae(y_true, y_pred):
    return np.mean(np.abs(y_pred - y_true)) 

# Import Data

In [8]:
data = pd.read_pickle('/Users/patricknorman/Documents/GitHub/metis-project-2/data/dropout.pkl')

In [9]:
data['MEDIAN_VAL_TRNS'] = 1/np.sqrt(data['MEDIAN_VAL'])

In [10]:
data.columns

Index(['FIPS', 'STATE', 'COUNTY', 'CENSUS_DIV', 'JAN_TEMP', 'JAN_SUN',
       'JULY_TEMP', 'HUMIDITY', 'TOPO', 'WATER', 'LN_WATER', 'JAN TEMP - Z',
       'JAN SUN - Z', 'JUL TEMP - Z', 'JUL HUM - Z', 'TOPOG - Z',
       'LN WATER  AREA - Z', 'NAT_AMENITY', 'RANK', 'RUCC_2013', 'DENSITY',
       'CBSA', 'MEDIAN_VAL', 'CENSUS_DIV_region 2', 'CENSUS_DIV_region 3',
       'CENSUS_DIV_region 4', 'CENSUS_DIV_region 5', 'CENSUS_DIV_region 6',
       'CENSUS_DIV_region 7', 'CENSUS_DIV_region 8', 'CENSUS_DIV_region 9',
       'NAT2', 'MEDIAN_VAL_TRNS'],
      dtype='object')

# Universal Variables

In [5]:
alphavec = 10**np.linspace(-2,2,200)

---
# NATURAL FACTORS ONLY

In [37]:
nat_x = data[['JAN_TEMP', 'JAN_SUN',
       'JULY_TEMP', 'HUMIDITY', 'TOPO', 'WATER', 'LN_WATER', 'JAN TEMP - Z',
       'JAN SUN - Z', 'JUL TEMP - Z', 'JUL HUM - Z', 'TOPOG - Z',
       'LN WATER  AREA - Z', 'NAT_AMENITY', 'RANK','NAT2']]

y = data['MEDIAN_VAL_TRNS']

In [38]:
x_train, x_test, y_train, y_test = train_test_split(nat_x, y, test_size = 0.2)

In [39]:
std = StandardScaler()
std.fit(x_train.values)
x_train = std.transform(x_train.values)
x_test = std.transform(x_test.values)

In [40]:
alphavec = 10**np.linspace(-2,2,200)

ridge = RidgeCV(alphas=alphavec, cv=5)
ridge.fit(x_train, y_train)

RidgeCV(alphas=array([1.00000000e-02, 1.04737090e-02, 1.09698580e-02, 1.14895100e-02,
       1.20337784e-02, 1.26038293e-02, 1.32008840e-02, 1.38262217e-02,
       1.44811823e-02, 1.51671689e-02, 1.58856513e-02, 1.66381689e-02,
       1.74263339e-02, 1.82518349e-02, 1.91164408e-02, 2.00220037e-02,
       2.09704640e-02, 2.19638537e-02, 2.30043012e-02, 2.40940356e-02,
       2.52353917e-02, 2.64308149e-0...
       3.44896226e+01, 3.61234270e+01, 3.78346262e+01, 3.96268864e+01,
       4.15040476e+01, 4.34701316e+01, 4.55293507e+01, 4.76861170e+01,
       4.99450512e+01, 5.23109931e+01, 5.47890118e+01, 5.73844165e+01,
       6.01027678e+01, 6.29498899e+01, 6.59318827e+01, 6.90551352e+01,
       7.23263390e+01, 7.57525026e+01, 7.93409667e+01, 8.30994195e+01,
       8.70359136e+01, 9.11588830e+01, 9.54771611e+01, 1.00000000e+02]),
        cv=5)

In [41]:
ridge.alpha_

49.9450511585514

In [42]:
ridge_train_pred = ridge.predict(x_train)
ridge_test_pred = ridge.predict(x_test)

In [43]:
r2_score(y_train, ridge_train_pred)

0.3187509291739278

In [44]:
r2_score(y_test, ridge_test_pred)

0.25734431298535876

# COMBINED FACTORS

In [63]:
x = data[['JAN_TEMP', 'JAN_SUN',
       'JULY_TEMP', 'HUMIDITY', 'TOPO', 'WATER', 'LN_WATER', 'JAN TEMP - Z',
       'JAN SUN - Z', 'JUL TEMP - Z', 'JUL HUM - Z', 'TOPOG - Z',
       'LN WATER  AREA - Z', 'NAT_AMENITY', 'RANK', 'RUCC_2013', 'DENSITY',
       'CBSA', 'CENSUS_DIV_region 2', 'CENSUS_DIV_region 3',
       'CENSUS_DIV_region 4', 'CENSUS_DIV_region 5', 'CENSUS_DIV_region 6',
       'CENSUS_DIV_region 7', 'CENSUS_DIV_region 8', 'CENSUS_DIV_region 9',
       'NAT2']]

y = data[['MEDIAN_VAL']]

In [64]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

In [50]:
scaler = StandardScaler()
scaler.fit(x_train.values)
scaler.transform(x_train.values);
scaler.transform(x_test.values);

In [65]:
# applying boxcox transformation
y_train = 1/np.sqrt(y_train)

In [57]:
alphavec = 10**np.linspace(-3,2,200)

In [66]:
ridge = RidgeCV(alphas = alphavec, cv = 5)
ridge.fit(x_train, y_train)

RidgeCV(alphas=array([1.00000000e-03, 1.05956018e-03, 1.12266777e-03, 1.18953407e-03,
       1.26038293e-03, 1.33545156e-03, 1.41499130e-03, 1.49926843e-03,
       1.58856513e-03, 1.68318035e-03, 1.78343088e-03, 1.88965234e-03,
       2.00220037e-03, 2.12145178e-03, 2.24780583e-03, 2.38168555e-03,
       2.52353917e-03, 2.67384162e-03, 2.83309610e-03, 3.00183581e-03,
       3.18062569e-03, 3.37006433e-0...
       2.64308149e+01, 2.80050389e+01, 2.96730241e+01, 3.14403547e+01,
       3.33129479e+01, 3.52970730e+01, 3.73993730e+01, 3.96268864e+01,
       4.19870708e+01, 4.44878283e+01, 4.71375313e+01, 4.99450512e+01,
       5.29197874e+01, 5.60716994e+01, 5.94113398e+01, 6.29498899e+01,
       6.66991966e+01, 7.06718127e+01, 7.48810386e+01, 7.93409667e+01,
       8.40665289e+01, 8.90735464e+01, 9.43787828e+01, 1.00000000e+02]),
        cv=5)

In [67]:
ridge.alpha_

0.008026433522257174

In [68]:
ridge_tr_pred = ridge.predict(x_train)
ridge_test_pred = ridge.predict(x_test)

In [69]:
r2_score(y_train, ridge_tr_pred)

0.6898688953458372

In [70]:
r2_score(y_test, ridge_test_pred)

-10.369085612005916