In [1]:
import csv
import pickle
import numpy as np
import scipy as sp
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import recall_score, make_scorer, f1_score
from sklearn.metrics import recall_score, make_scorer, f1_score
from sklearn.linear_model import LinearRegression, ElasticNetCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression, VarianceThreshold, SelectKBest

In [2]:
# Load the cleaned dataset from the project's data directory.
df = pd.read_csv(filepath_or_buffer='../data/cleaned.csv')

In [3]:
# Results table: for each of five models there is one row for the training
# split ("Model_N") and one for the held-out split ("Model_N_test").
# Cells stay NaN until all_the_metrics() fills them in.
models_df = pd.DataFrame(
    index=[
        'Model_{}{}'.format(model_number, row_suffix)
        for model_number in range(1, 6)
        for row_suffix in ('', '_test')
    ],
    columns=[
        'score',
        'explained_variance',
        'mean_absolute_error',
        'mean_squared_error',
        'median_absolute_error',
        'r2',
    ],
)

def all_the_metrics(model_name,score_1,y_t,y_p,score_2,y_t_test,y_p_test):
    """Record train and test evaluation metrics for one model in ``models_df``.

    Two rows of the module-level ``models_df`` table are written: the row
    labelled ``model_name`` (training split) and the row labelled
    ``model_name + "_test"`` (test split).

    Parameters
    ----------
    model_name : row label for the training-split results.
    score_1 : value stored in the ``score`` column of the training row.
    y_t, y_p : true and predicted targets for the training split.
    score_2 : value stored in the ``score`` column of the test row.
    y_t_test, y_p_test : true and predicted targets for the test split.

    Returns
    -------
    None; ``models_df`` is modified in place.
    """
    # Column name -> sklearn metric, in the order the columns are filled.
    column_metrics = (
        ('explained_variance', metrics.explained_variance_score),
        ('mean_absolute_error', metrics.mean_absolute_error),
        ('mean_squared_error', metrics.mean_squared_error),
        ('median_absolute_error', metrics.median_absolute_error),
        ('r2', metrics.r2_score),
    )

    def _fill_row(row_label, score, y_true, y_pred):
        # Write the supplied score first, then each computed metric.
        models_df.loc[row_label, 'score'] = score
        for column, metric_fn in column_metrics:
            models_df.loc[row_label, column] = metric_fn(y_true, y_pred)

    _fill_row(model_name, score_1, y_t, y_p)
    _fill_row(model_name + "_test", score_2, y_t_test, y_p_test)

In [4]:
# Columns to expand into indicator (dummy) variables.
# NOTE(review): 'Garage_Yr_Blt' looks like a year yet is one-hot encoded here —
# confirm that treating it as categorical is intentional.
cat_features = [
    'MS_Zoning', 'Street', 'Alley', 'Lot_Shape', 'Land_Contour',
    'Utilities', 'Lot_Config', 'Land_Slope', 'Neighborhood',
    'Condition_1', 'Condition_2', 'Bldg_Type', 'House_Style',
    'Roof_Style', 'Roof_Matl', 'Exterior_1st', 'Exterior_2nd',
    'Mas_Vnr_Type', 'Exter_Qual', 'Exter_Cond', 'Foundation',
    'Bsmt_Qual', 'Bsmt_Cond', 'Bsmt_Exposure', 'BsmtFin_Type_1',
    'BsmtFin_Type_2', 'Heating', 'Heating_QC', 'Central_Air',
    'Electrical', 'Kitchen_Qual', 'Functional', 'Fireplace_Qu',
    'Garage_Type', 'Garage_Yr_Blt', 'Garage_Finish', 'Garage_Qual',
    'Garage_Cond', 'Paved_Drive', 'Pool_QC', 'Fence', 'Misc_Feature',
    'Sale_Type',
]

# Replace each listed column with its dummy columns. The frame is rebound to
# the same name, so re-running this cell without reloading `df` will fail
# because the original columns no longer exist.
df = pd.get_dummies(data=df, columns=cat_features)

In [5]:
# Candidate predictor columns: everything except the two target variants
# ('SalePrice', 'SalePrice_deskewed') and the bookkeeping columns
# ('Unnamed: 0', 'Id', 'PID' — presumably index/identifier fields).
# `axis` is passed by keyword: a positional axis for DataFrame.drop was
# deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
all_cols = df.drop(['SalePrice','SalePrice_deskewed','Unnamed: 0', 'Id', 'PID'], axis=1).columns

In [6]:
# Target is the de-skewed sale price; predictors are the columns in all_cols.
y = df['SalePrice_deskewed']
X = df[all_cols]

# Hold out 33% of the rows for testing; the seed is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.33,
)

In [7]:
# NOTE(review): recall is a classification metric and this scorer is not used
# in any of the visible cells — confirm whether it is still needed.
recall_scorer = make_scorer(recall_score)

# Model 1: a single pipeline with fixed settings (no grid search).
# Stages: drop low-variance columns -> standardise -> keep the 9 best
# features by f_regression -> cross-validated elastic net.
# NOTE(review): l1_ratio=1 should make the elastic net a pure L1 (lasso)
# penalty — confirm that is intended.
steps = [
    ("var_thres", VarianceThreshold(threshold=.05)),
    ("scaler", StandardScaler()),
    ("kbest", SelectKBest(f_regression, k=9)),
    ("encv", ElasticNetCV(l1_ratio=1, cv=5, random_state=42)),
]

pipe = Pipeline(steps=steps)

# Pipeline.fit returns the fitted pipeline itself.
gs_model_1 = pipe.fit(X_train, y_train)

# Record train and test metrics under the "Model_1" / "Model_1_test" rows.
all_the_metrics(
    "Model_1",
    gs_model_1.score(X_train, y_train),
    y_train,
    gs_model_1.predict(X_train),
    gs_model_1.score(X_test, y_test),
    y_test,
    gs_model_1.predict(X_test),
)

In [8]:
# Model 2: same pipeline shape as Model 1 with 17 selected features; the
# variance threshold and the elastic net's internal CV fold count are tuned
# by grid search (the values given in the constructors below are overridden
# by the grid where a parameter appears in it).
steps = [
    ("var_thres", VarianceThreshold(threshold=.99)),
    ("scaler", StandardScaler()),
    ("kbest", SelectKBest(f_regression, k=17)),
    ("encv", ElasticNetCV(l1_ratio=1, random_state=42)),
]

pipe = Pipeline(steps=steps)

# 4 thresholds x 3 fold counts = 12 candidates.
param_grid = {
    "var_thres__threshold": [.1, .8, .95, .99],
    "encv__cv": [3, 7, 13],
}

gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=3,
    n_jobs=3,
    verbose=1,
    return_train_score=True,
)

gs_model_2 = gs.fit(X_train, y_train)

# Record train and test metrics under the "Model_2" / "Model_2_test" rows.
all_the_metrics(
    "Model_2",
    gs_model_2.score(X_train, y_train),
    y_train,
    gs_model_2.predict(X_train),
    gs_model_2.score(X_test, y_test),
    y_test,
    gs_model_2.predict(X_test),
)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=3)]: Done  36 out of  36 | elapsed:    3.1s finished


In [9]:
# Persist the fitted Model 2 grid search to the assets directory.
with open('../assets/gs_model_2.pkl', 'wb+') as model_file:
    pickle.Pickler(model_file).dump(gs_model_2)

In [10]:
# Model 3: as Model 2, but the number of selected features (k) is also part
# of the search, so SelectKBest is created without an explicit k here.
steps = [
    ("var_thres", VarianceThreshold(threshold=.99)),
    ("scaler", StandardScaler()),
    ("kbest", SelectKBest(f_regression)),
    ("encv", ElasticNetCV(l1_ratio=1, random_state=42)),
]

pipe = Pipeline(steps=steps)

# 4 thresholds x 3 values of k x 3 fold counts = 36 candidates.
param_grid = {
    "var_thres__threshold": [.1, .8, .95, .99],
    "kbest__k": [9, 13, 21],
    "encv__cv": [3, 7, 13],
}

gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=3,
    n_jobs=3,
    verbose=1,
    return_train_score=True,
)

gs_model_3 = gs.fit(X_train, y_train)

# Record train and test metrics under the "Model_3" / "Model_3_test" rows.
all_the_metrics(
    "Model_3",
    gs_model_3.score(X_train, y_train),
    y_train,
    gs_model_3.predict(X_train),
    gs_model_3.score(X_test, y_test),
    y_test,
    gs_model_3.predict(X_test),
)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=3)]: Done  62 tasks      | elapsed:    3.4s
[Parallel(n_jobs=3)]: Done 108 out of 108 | elapsed:    6.2s finished


In [11]:
# Show the pipeline configuration selected by the Model 3 grid search.
gs_model_3.best_estimator_

Pipeline(memory=None,
     steps=[('var_thres', VarianceThreshold(threshold=0.8)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('kbest', SelectKBest(k=21, score_func=<function f_regression at 0x0000026FE78FCF28>)), ('encv', ElasticNetCV(alphas=None, copy_X=True, cv=7, eps=0.001, fit_intercept=True,
       l1_ratio=1, max_iter=1000, n_alphas=100, n_jobs=1, normalize=False,
       positive=False, precompute='auto', random_state=42,
       selection='cyclic', tol=0.0001, verbose=0))])

In [12]:
# Persist the fitted Model 3 grid search to the assets directory.
with open('../assets/gs_model_3.pkl', 'wb+') as model_file:
    pickle.Pickler(model_file).dump(gs_model_3)

In [13]:
# Rank the recorded rows by R^2, best first; rows that were never filled in (NaN) sort last.
models_df.sort_values(by="r2", ascending=False)

Unnamed: 0,score,explained_variance,mean_absolute_error,mean_squared_error,median_absolute_error,r2
Model_3,0.899173,0.899173,0.0938248,0.0170187,0.0712579,0.899173
Model_3_test,0.88639,0.886558,0.0971984,0.0172371,0.0773691,0.88639
Model_2,0.883792,0.883792,0.0991255,0.0196148,0.0753939,0.883792
Model_2_test,0.87155,0.871566,0.103788,0.0194887,0.0797936,0.87155
Model_1,0.87072,0.87072,0.106896,0.0218213,0.0815007,0.87072
Model_1_test,0.856246,0.85625,0.109437,0.0218107,0.0836504,0.856246
Model_4,,,,,,
Model_4_test,,,,,,
Model_5,,,,,,
Model_5_test,,,,,,


In [14]:
# Write the fitted Model 3 grid search to the assets directory.
# NOTE(review): this repeats the dump already done in cell In [12] (same
# object, same path) — confirm whether a different model was meant here.
with open('../assets/gs_model_3.pkl', 'wb+') as model_file:
    pickle.Pickler(model_file).dump(gs_model_3)