# Project 2: Ames Housing Data and Kaggle Challenge

In [1]:
# import packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import make_column_selector
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, OrdinalEncoder
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge, RidgeCV
from sklearn import metrics
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.feature_selection import RFE
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
import warnings
warnings.filterwarnings('ignore')
import os

# Modeling
---

In [2]:
# read in all datasets necessary for modeling
housing = pd.read_csv('cleaned_datasets/housing_preprocessed1_data.csv')
df_model_train = pd.read_csv('cleaned_datasets/housing_df_model_train.csv')
df_model_test = pd.read_csv('cleaned_datasets/housing_df_model_test.csv')
y_train = pd.read_csv('cleaned_datasets/housing_y-train.csv')
y_test = pd.read_csv('cleaned_datasets/housing_y-test_data.csv')

## Datasets for modeling
- df_model_train
- df_model_test
- y_train
- y_test

In [3]:
df_model_train_final = df_model_train.drop(columns=['pid', 'id'])
df_model_test_final = df_model_test.drop(columns=['pid', 'id'])
y_train = y_train.drop(columns=['id'])
y_test = y_test.drop(columns=['id'])

In [4]:
df_model_train_final.columns

Index(['overallqual', 'exterqual', 'kitchenqual', 'garagearea', 'garagefinish',
       'yearremod_add', 'fullbath', 'masvnrarea', 'garageyrblt',
       'totrmsabvgrd', 'heatingqc', 'neighborhood_NridgHt', 'fireplaces',
       'bsmtfinsf1', 'bsmtexposure', 'saletype_New', 'openporchsf',
       'bsmtfintype1', 'exterior2nd_VinylSd', 'lotfrontage',
       'masvnrtype_Stone', 'wooddecksf', 'halfbath', 'lotarea', 'paveddrive',
       'neighborhood_StoneBr', 'roofstyle_Hip', 'bsmtfullbath', 'centralair_Y',
       'garagetype_BuiltIn', 'masvnrtype_BrkFace', 'neighborhood_NoRidge',
       'bsmtcond', 'housestyle_2Story', 'landcontour_HLS', 'has_garage',
       'bsmtunfsf', 'exterior2nd_CmentBd', 'screenporch', 'bedroomabvgr',
       'lotconfig_CulDSac', 'neighborhood_Somerst', 'functional',
       'neighborhood_Timber', 'condition1_Norm', 'condition2_PosA',
       'neighborhood_MeadowV', 'overallcond', 'neighborhood_BrDale',
       'kitchenabvgr', 'condition1_Feedr', 'bldgtype_Twnhs', 'has_all

#### Baseline

In [5]:
# baseline
y_train.mean()[0]

181205.50218978102

#### Model 1 - Linear Regression

In [6]:
#instantiate and fit OLS model
lr = LinearRegression()
lr.fit(df_model_train_final, y_train)

In [7]:
lr.score(df_model_train_final, y_train)

0.9036883527932799

In [8]:
#calculate RMSE
y_pred = lr.predict(df_model_train_final)
mean_squared_error(y_train, y_pred, squared=False)

#rmse of lr is better than our baseline using just standard OLS model

24716.971636189173

In [9]:
y_test_pred = lr.predict(df_model_test_final)

In [10]:
#checking the model on the test data
lr.score(df_model_test_final, y_test)

0.8839212571741166

In [11]:
print('baseline:', y_train.mean())

print(" LR ".center(18, "="))
print('rmse train:', mean_squared_error(y_train, y_pred, squared=False))
print('rmse test:', mean_squared_error(y_test, y_test_pred, squared=False))
print('r2 train:', lr.score(df_model_train_final, y_train))
print('r2 test:', lr.score(df_model_test_final, y_test))

baseline: saleprice    181205.50219
dtype: float64
rmse train: 24716.971636189173
rmse test: 26742.42745018656
r2 train: 0.9036883527932799
r2 test: 0.8839212571741166


In [12]:
lr.coef_[0]

array([ 1.65131274e+04,  4.95501674e+03,  6.56775827e+03,  6.22163441e+03,
        1.14557332e+03,  1.03839192e+03,  6.93849462e+03,  7.65770700e+03,
       -7.27464922e+02,  1.25913545e+04,  1.35253727e+03,  6.92751427e+03,
        4.05998588e+03,  1.61233666e+04,  5.30533140e+03,  4.02010467e+03,
        2.67868674e+03, -1.07315126e+02, -3.84751759e+02,  4.05037683e+03,
       -6.14363524e+02,  1.87787610e+03,  4.48483914e+03,  5.98047187e+03,
        1.58524788e+03,  6.20797124e+03,  3.10034005e+03,  3.25818644e+03,
       -1.42807937e+02,  2.44934571e+03, -4.68715899e+03,  5.37061758e+03,
       -2.95357180e+03,  7.74672457e+01,  2.68138297e+03, -8.55669797e+16,
        7.85219727e+03,  2.03673104e+03,  4.73792603e+03, -1.56570407e+03,
        1.97158102e+03,  1.04610193e+03,  1.54639655e+03, -9.54586927e+02,
        2.34693397e+03,  1.62754114e+03, -6.91800497e+02,  5.04090410e+03,
        4.46318933e+02, -4.46217386e+03,  3.95480207e+02, -1.57024851e+03,
       -5.09136902e+02,  

In [13]:
pd.set_option('display.max_rows', 200)

In [14]:
len(lr.coef_[0])

73

In [38]:
lr_coef = pd.DataFrame(lr.coef_[0], df_model_train_final.columns)
lr_coef.rename(columns={0: 'coef'}, inplace=True)
lr_coef.sort_values(by='coef', ascending=False)

# for every 1 std deviation unit increase in kitchen qual, the sale price increases by $6567.8

Unnamed: 0,coef
overallqual,16513.13
bsmtfinsf1,16123.37
totrmsabvgrd,12591.35
bsmtunfsf,7852.197
masvnrarea,7657.707
fullbath,6938.495
neighborhood_NridgHt,6927.514
kitchenqual,6567.758
garagearea,6221.634
neighborhood_StoneBr,6207.971


In [16]:
pd.DataFrame(lr.predict(df_model_train_final))

Unnamed: 0,0
0,142101.990775
1,241346.946918
2,153231.293405
3,158077.554605
4,223786.946918
...,...
1365,113450.883189
1366,125357.990775
1367,147943.293405
1368,223440.843791


#### Model 2 - Ridge

In [17]:
# create alphas to try in model
r_alphas = np.logspace(0,5,100)

#instantiate ridgeCV model
ridge_cv = RidgeCV(alphas = r_alphas, scoring = 'r2')

#fit on x train data
ridge_cv.fit(df_model_train_final, y_train)

In [18]:
# Here is the optimal value of alpha - the val of alpha that gave the best cross val mean score of the alphas we tries
ridge_cv.alpha_

73.90722033525779

In [19]:
print(f"RidgeCV train r2: {ridge_cv.score(df_model_train_final, y_train)}") 
print(f"RidgeCV test r2: {ridge_cv.score(df_model_test_final, y_test)}")

RidgeCV train r2: 0.9028480235996731
RidgeCV test r2: 0.8830308527456264


In [20]:
ridge_preds = ridge_cv.predict(df_model_train_final)
RMSE_ridge_train = mean_squared_error(y_train, ridge_preds, squared=False)
RMSE_ridge_train

24824.566532863006

In [21]:
ridgetest_preds = ridge_cv.predict(df_model_test_final)
RMSE_ridge_test = mean_squared_error(y_test, ridgetest_preds, squared=False)
RMSE_ridge_test

# ridge model is better than our baseline

26844.797990760137

In [22]:
print("-----RidgeCV----- ")
print('rmse train:', RMSE_ridge_train)
print('rmse test:', RMSE_ridge_test)
print('r2 train:', lr.score(df_model_train_final, y_train))
print('r2 test:', lr.score(df_model_test_final, y_test))

-----RidgeCV----- 
rmse train: 24824.566532863006
rmse test: 26844.797990760137
r2 train: 0.9036883527932799
r2 test: 0.8839212571741166


#### Model 3 - ElasticNet

In [23]:
from sklearn.linear_model import ElasticNet, ElasticNetCV

In [24]:
# Set up a list of alphas to check.
enet_alphas = np.linspace(0.01,0.8,100)  #(0.01, 1,100)
# if your optimal alpha is at the top or bottom of range, it indicates that you should try out different values vs if it was in middle of range

# Set up our l1 ratio.
# 0.05 ridge 0.95 lasso
enet_ratio = 0.05  #0.15, 0.40

# Instantiate model.
enet_model = ElasticNetCV(alphas=enet_alphas, l1_ratio=enet_ratio, cv=5, max_iter=3000, n_jobs=-1)

# Fit model using optimal alpha.
enet_model.fit(df_model_train_final, y_train)

In [25]:
print(enet_model.alpha_)   #print the optimal value of alpha

0.06585858585858587


In [26]:
pd.DataFrame(enet_model.predict(df_model_train_final))

Unnamed: 0,0
0,142244.297466
1,242189.252305
2,155816.838074
3,159241.791420
4,226222.123239
...,...
1365,112494.001313
1366,121707.550386
1367,146185.825748
1368,217954.694436


In [27]:
# Generate predictions
enet_preds = enet_model.predict(df_model_train_final)
enet_test_preds = enet_model.predict(df_model_test_final)

# Evaluate model
print('rmse train:', mean_squared_error(y_train, enet_preds, squared=False))
print('rmse test:', mean_squared_error(y_test, enet_test_preds, squared=False))
print('r2 train: ', enet_model.score(df_model_train_final, y_train))
print('r2 test: ', enet_model.score(df_model_test_final, y_test))

rmse train: 24853.6019255395
rmse test: 26872.022486436836
r2 train:  0.9026206282537046
r2 test:  0.8827934852840869


#### Model 4 - GridSearchCV w/ estimator - Lasso

In [28]:
#instantiate lasso
lasso = Lasso()

#instantiate gridsearch cv
# there are 20 alphas equally spaced between 10**0 to 10**5
params = {'alpha': np.logspace(2,4,50)}  # np.logspace(-2,3,20) / np.logspace(1,3,20)
gs = GridSearchCV(
        estimator = lasso,
        param_grid = params,
        n_jobs=-1)

In [29]:
gs.fit(df_model_train_final, y_train)

In [30]:
gs.score(df_model_train_final,y_train)

0.9032879838592556

In [31]:
gs.score(df_model_test_final,y_test)

0.8836262901855279

In [32]:
gs.best_params_

{'alpha': 145.63484775012444}

In [33]:
gs.get_params
# best alpha is in the top of range

<bound method BaseEstimator.get_params of GridSearchCV(estimator=Lasso(), n_jobs=-1,
             param_grid={'alpha': array([  100.        ,   109.8541142 ,   120.67926406,   132.57113656,
         145.63484775,   159.98587196,   175.75106249,   193.06977289,
         212.09508879,   232.99518105,   255.95479227,   281.1768698 ,
         308.88435965,   339.32217719,   372.75937203,   409.49150624,
         449.8432669 ,   494.17133613,   542.86754393,   596.36233166,
         655.12855686,   719.685673  ,   790.60432...
         954.09547635,  1048.11313415,  1151.39539933,  1264.85521686,
        1389.49549437,  1526.41796718,  1676.83293681,  1842.06996933,
        2023.58964773,  2222.99648253,  2442.05309455,  2682.69579528,
        2947.05170255,  3237.45754282,  3556.48030622,  3906.93993705,
        4291.93426013,  4714.86636346,  5179.47467923,  5689.86602902,
        6250.55192527,  6866.48845004,  7543.12006335,  8286.42772855,
        9102.98177992, 10000.        ])})>

In [34]:
# pd.DataFrame(gs.cv_results_)

In [35]:
# use model to predict and calculate RMSE

gs_preds_train = gs.predict(df_model_train_final)
rmse_gs_train = mean_squared_error(y_train, gs_preds_train, squared=False)

gs_preds_test = gs.predict(df_model_test_final)
rmse_gs_test = mean_squared_error(y_test, gs_preds_test, squared=False)

In [36]:
#print out coefficients of features
pd.DataFrame(gs.best_estimator_.coef_, gs.best_estimator_.feature_names_in_).sort_values(by=0, ascending=False)

Unnamed: 0,0
overallqual,16895.243011
bsmtfinsf1,15705.712874
totrmsabvgrd,12225.335655
masvnrarea,7331.365213
bsmtunfsf,7197.658957
neighborhood_NridgHt,6898.878444
fullbath,6620.989441
kitchenqual,6497.757244
neighborhood_StoneBr,6247.116842
lotarea,5962.174059


In [37]:
print('baseline:', y_train.mean())
print('                      ')
print("-----LR/OLS----- ")
print('rmse train:', mean_squared_error(y_train, y_pred, squared=False))
print('rmse test:', mean_squared_error(y_test, y_test_pred, squared=False))
print('r2 train:', lr.score(df_model_train_final, y_train))
print('r2 test:', lr.score(df_model_test_final, y_test))
print('                      ')
print("-----RidgeCV----- ")
print('rmse train:', RMSE_ridge_train)
print('rmse test:', RMSE_ridge_test)
print('r2 train:', lr.score(df_model_train_final, y_train))
print('r2 test:', lr.score(df_model_test_final, y_test))
print('                      ')
print("-----ElasticNet----- ")
print('rmse train:', mean_squared_error(y_train, enet_preds, squared=False))
print('rmse test:', mean_squared_error(y_test, enet_test_preds, squared=False))
print('r2 train: ', enet_model.score(df_model_train_final, y_train))
print('r2 test: ', enet_model.score(df_model_test_final, y_test))
print('                      ')
print("-----GridSearchCV - Lasso----- ")
print('rmse train:', rmse_gs_train)
print('rmse test:', rmse_gs_test)
print('r2 train:', gs.score(df_model_train_final,y_train))
print('r2 test:', gs.score(df_model_test_final,y_test))

baseline: saleprice    181205.50219
dtype: float64
                      
-----LR/OLS----- 
rmse train: 24716.971636189173
rmse test: 26742.42745018656
r2 train: 0.9036883527932799
r2 test: 0.8839212571741166
                      
-----RidgeCV----- 
rmse train: 24824.566532863006
rmse test: 26844.797990760137
r2 train: 0.9036883527932799
r2 test: 0.8839212571741166
                      
-----ElasticNet----- 
rmse train: 24853.6019255395
rmse test: 26872.022486436836
r2 train:  0.9026206282537046
r2 test:  0.8827934852840869
                      
-----GridSearchCV - Lasso----- 
rmse train: 24768.292763152476
rmse test: 26776.383402566353
r2 train: 0.9032879838592556
r2 test: 0.8836262901855279


## Model Choice
---
- the LR performs a bit better than all models when looking at RMSE
- R2 on training for all models is slightly similar to test r2 vals which is expected since the model was built on the training dataset
- GridSearchCV lasso did drop features which shows there are features that are not important to the prediction
- since the rmse for the lr is better on test data, I'm going to choose that model
- One downfall of the model doesn't take into account if two features work together and non-linear relationships