### Ames Housing Dataset - ML Project - Martin Kihn / NYCDSA - January 2021

### Ames ML Project EDA - Notebook #4

In [1]:
#This notebook has steps for baseline model, feature selection and modeling

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 2000)

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set_style('darkgrid')

path_to_train = '~/Desktop/Ames_ML_Project/Data/train_dummied_df_01_18_8p.csv'

In [2]:
df_train = pd.read_csv(path_to_train, index_col = False)
df_train.drop('Unnamed: 0', axis=1, inplace=True)

In [3]:
df_train.sample(3)

Unnamed: 0,LotFrontage,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,TotalBsmtSF,GrLivArea,BedroomAbvGr,TotRmsAbvGrd,Fireplaces,GarageArea,WoodDeckSF,OpenPorchSF,MoSold,YrSold,SalePrice,NeighType,SalePrice_Log,Age,Baths,BsmtFinSF,MSZoning_Oth,MSZoning_RL,LotShape_IR1,LotShape_IR2,LotShape_IR3,LotShape_Reg,LandContour_Lvl,LandContour_Oth,LotConfig_Inside,LotConfig_Oth,LandSlope_Gtl,LandSlope_Oth,BldgType_1Fam,BldgType_Oth,RoofStyle_Gable,RoofStyle_Oth,MasVnrType_Brick,MasVnrType_Oth,ExterQual_Avg,ExterQual_Ex,ExterQual_Gd,ExterCond_Avg,ExterCond_Ex,ExterCond_Gd,Foundation_CBlock,Foundation_Oth,Foundation_PConc,BsmtQual_Avg,BsmtQual_Gd,BsmtExposure_Yes,BsmtFinType1_ALQ,BsmtFinType1_BLQ,BsmtFinType1_GLQ,BsmtFinType1_LwQ,BsmtFinType1_None,BsmtFinType1_Rec,HeatingQC_Avg,HeatingQC_Gd,CentralAir_Yes,Electrical_Yes,KitchenQual_Avg,KitchenQual_Gd,GarageType_Attchd,GarageType_None,GarageType_Oth,PavedDrive_Yes,Fence_Yes,2Story_Y,RoadRail_Yes,Exterior_BrkComm,Exterior_CemntBd,Exterior_HdBoard,Exterior_MetalSd,Exterior_Other,Exterior_VinylSd,Exterior_Wd Sdng,Exterior_WdShing
501,-0.236374,0.635894,-0.672868,1.107774,0.964883,-0.570199,0.335686,-0.396793,0.207919,-0.218266,0.588597,0.210908,-0.452421,1.498314,-0.857298,-1.377502,-0.07882,1.46208,0.114682,-0.548883,-0.121166,-1.452915,1,0,0,0,0,1,1,0,1,0,1,0,1,0,1,0,0,0,0,0,1,1,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,1,1,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0
13,1.005743,0.635894,-0.672868,1.141229,1.062585,1.110994,0.497028,0.096372,0.207919,0.399825,0.588597,0.618499,0.519293,-0.21685,0.627011,-0.621834,1.217964,0.488069,1.291127,-0.507804,-0.121166,-1.452915,0,1,1,0,0,0,1,0,1,0,1,0,1,0,1,0,0,1,0,0,1,1,0,0,0,0,1,0,1,1,0,0,0,0,1,0,0,1,1,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0
1140,0.194243,-1.018145,-0.672868,-0.063127,-0.744896,-0.570199,-0.00045,-1.488913,0.207919,-0.931102,-0.956888,-0.081489,-0.755086,-0.726764,0.255934,-1.377502,-0.829953,-0.485941,-0.944615,-0.548883,-1.84105,-1.452915,0,1,0,0,0,1,1,0,1,0,1,0,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1,1,1,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0


In [250]:
# let's start with a simple model - containing only a few features
# that I know are import for predicting saleprice
# these are: Overall_Qual, Gr_Liv_Area, Age, Baths, BsmtFinSF, GarageArea
# then add: Exter_Qual_Ex, ExterCond_Ex, KitchenQual_Gd, MasVnrArea, OpenPorchSF
# then add my new features: 2Story_Y, RoadRail_None, Neigh_Type

In [4]:
# but first - I will separate the SalePrice_Log as Target
target_log = df_train['SalePrice_Log']
target = df_train['SalePrice']

In [5]:
X = df_train.copy()
X.drop(['SalePrice_Log', 'SalePrice'], axis=1, inplace=True)

In [6]:
X.shape

(1389, 77)

In [7]:
target_log = list(target_log)
len(target_log)

1389

In [8]:
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
import statsmodels.api as sm

In [9]:
# first try linear regression - baseline - with all the features
ols = linear_model.LinearRegression()
ols_model = ols.fit(X, target_log)

In [10]:
print('RSS: %.2f' % np.sum((ols.predict(X) - target_log) ** 2))
print('R^2: %.5f' % ols.score(X, target_log))

RSS: 138.78
R^2: 0.90009


In [11]:
# with no tuning or pruning - OLS model yielded R2 of 0.90 - not bad
# now for curiosity I'd like to try my smaller models

In [12]:
base_features_1 = ['OverallQual', 'GrLivArea', 'Age', 'Baths', 'BsmtFinSF', 'GarageArea']
base_features_2 = base_features_1 + ['ExterQual_Ex', 'ExterCond_Ex', 'KitchenQual_Gd', 'MasVnrArea', 'OpenPorchSF']
base_features_3 = base_features_2 + ['2Story_Y', 'RoadRail_None', 'Neigh_Type']

In [13]:
X_base_1 = X[X.columns.intersection(base_features_1)]

In [14]:
X_base_2 = X[X.columns.intersection(base_features_2)]

In [15]:
X_base_3 = X[X.columns.intersection(base_features_3)]

In [16]:
ols_model_1 = ols.fit(X_base_1, target_log)
print('RSS: %.2f' % np.sum((ols.predict(X_base_1) - target_log) ** 2))
print('R^2: %.5f' % ols.score(X_base_1, target_log))

RSS: 264.33
R^2: 0.80970


In [17]:
def ols_model(data):
    ols = linear_model.LinearRegression()
    ols_mod = ols.fit(data, target_log)
    print('RSS: %.2f' % np.sum((ols.predict(data) - target_log) ** 2))
    print('R^2: %.5f' % ols.score(data, target_log))
    return

In [18]:
print('X_base_1:')
ols_model(X_base_1)
print('\nX_base_2:')
ols_model(X_base_2)
print('\nX_base_3')
ols_model(X_base_3)

X_base_1:
RSS: 264.33
R^2: 0.80970

X_base_2:
RSS: 246.70
R^2: 0.82239

X_base_3
RSS: 225.98
R^2: 0.83731


In [266]:
# This is promising - my smaller subsets appear to add significant value
# Let's go back to the full set and see if we can improve predictions

In [267]:
X.shape

(1389, 77)

In [268]:
# feature selection: lasso; coef_; tree models; SelectFromModelsAPI
# I will start with using 'Lasso' - then do F test

In [19]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet
ridge = Ridge()
lasso = Lasso()
elasticnet = ElasticNet()

In [22]:
ridge.fit(X, target_log)
print('The Ridge intercept is: %.2f' % (ridge.intercept_))
print('The Ridge R2 is: %.2f' % (ridge.score(X, target_log)))
pd.Series(ridge.coef_, index=X.columns).sort_values(ascending=False)

The Ridge intercept is: 0.23
The Ridge R2 is: 0.90


GrLivArea            0.369638
GarageType_None      0.242867
BsmtFinType1_None    0.237214
OverallQual          0.216890
BsmtFinSF            0.203972
LotShape_IR2         0.165851
GarageArea           0.163364
CentralAir_Yes       0.134487
NeighType            0.133747
OverallCond          0.128206
TotalBsmtSF          0.112765
Exterior_MetalSd     0.098008
Exterior_Other       0.086442
Exterior_WdShing     0.085519
Exterior_CemntBd     0.085077
ExterQual_Ex         0.083393
BsmtExposure_Yes     0.081241
Exterior_VinylSd     0.077640
YearBuilt            0.074029
YearRemodAdd         0.072193
Foundation_PConc     0.066201
Exterior_Wd Sdng     0.062289
Fireplaces           0.060065
BldgType_1Fam        0.058628
MSZoning_RL          0.057978
ExterCond_Ex         0.053283
Baths                0.048146
LotShape_IR1         0.035915
LotShape_Reg         0.034458
WoodDeckSF           0.028857
MasVnrType_Oth       0.028230
PavedDrive_Yes       0.025429
LandSlope_Oth        0.024694
LotFrontag

In [271]:
alphas = np.linspace(1, 100, 20)
ridge.set_params(normalize=False)
coefs_ridge = []

for alpha in alphas:
    ridge.set_params(alpha=alpha)
    ridge.fit(X, target_log)
    coefs_ridge.append(ridge.coef_)

In [272]:
coefs_ridge = pd.DataFrame(coefs_ridge, index = alphas, columns =\
                          X.columns)

In [273]:
coefs_ridge

Unnamed: 0,LotFrontage,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,TotalBsmtSF,GrLivArea,BedroomAbvGr,TotRmsAbvGrd,Fireplaces,GarageArea,WoodDeckSF,OpenPorchSF,MoSold,YrSold,NeighType,Age,Baths,BsmtFinSF,MSZoning_Oth,MSZoning_RL,LotShape_IR1,LotShape_IR2,LotShape_IR3,LotShape_Reg,LandContour_Lvl,LandContour_Oth,LotConfig_Inside,LotConfig_Oth,LandSlope_Gtl,LandSlope_Oth,BldgType_1Fam,BldgType_Oth,RoofStyle_Gable,RoofStyle_Oth,MasVnrType_Brick,MasVnrType_Oth,ExterQual_Avg,ExterQual_Ex,ExterQual_Gd,ExterCond_Avg,ExterCond_Ex,ExterCond_Gd,Foundation_CBlock,Foundation_Oth,Foundation_PConc,BsmtQual_Avg,BsmtQual_Gd,BsmtExposure_Yes,BsmtFinType1_ALQ,BsmtFinType1_BLQ,BsmtFinType1_GLQ,BsmtFinType1_LwQ,BsmtFinType1_None,BsmtFinType1_Rec,HeatingQC_Avg,HeatingQC_Gd,CentralAir_Yes,Electrical_Yes,KitchenQual_Avg,KitchenQual_Gd,GarageType_Attchd,GarageType_None,GarageType_Oth,PavedDrive_Yes,Fence_Yes,2Story_Y,RoadRail_Yes,Exterior_BrkComm,Exterior_CemntBd,Exterior_HdBoard,Exterior_MetalSd,Exterior_Other,Exterior_VinylSd,Exterior_Wd Sdng,Exterior_WdShing
1.0,0.023491,0.21689,0.128206,0.074029,0.072193,0.016104,0.112765,0.369638,-0.0061,0.012301,0.060065,0.163364,0.028857,0.010436,0.001038,-0.02017,0.133747,-0.030194,0.048146,0.203972,-0.057978,0.057978,0.035915,0.165851,-0.236225,0.034458,0.009848,-0.009848,-0.005238,0.005238,-0.024694,0.024694,0.058628,-0.058628,-0.019825,0.019825,-0.027471,0.02823,-0.0326,0.083393,-0.050792,0.007665,0.053283,-0.060948,-0.026708,-0.039493,0.066201,-0.389085,-0.390264,0.081241,-0.042679,-0.035444,-0.029627,-0.100598,0.237214,-0.028866,-0.014958,0.014958,0.134487,-0.012313,-0.010824,0.010824,-0.125609,0.242867,-0.117258,0.025429,-0.039256,-0.149593,-0.140022,-0.510562,0.085077,0.015586,0.098008,0.086442,0.07764,0.062289,0.085519
6.210526,0.026557,0.218821,0.126678,0.07381,0.07159,0.02013,0.065983,0.371601,-0.008322,0.017158,0.060526,0.114446,0.030227,0.012463,0.001325,-0.020692,0.135293,-0.030432,0.055055,0.156756,-0.057632,0.057632,0.016255,0.130042,-0.160685,0.014388,0.010317,-0.010317,-0.005828,0.005828,-0.025326,0.025326,0.05814,-0.05814,-0.021717,0.021717,-0.029895,0.032344,-0.032221,0.081383,-0.049162,0.024755,0.020921,-0.045676,-0.027112,-0.038348,0.06546,-0.097417,-0.096885,0.081764,-0.026898,-0.020241,-0.014514,-0.077906,0.151838,-0.012278,-0.01548,0.01548,0.125308,-0.000975,-0.012439,0.012439,-0.055491,0.105493,-0.050003,0.02164,-0.039327,-0.170436,-0.134481,-0.188272,0.041591,-0.029261,0.053321,0.044307,0.031232,0.016955,0.030127
11.421053,0.02845,0.218938,0.125206,0.073496,0.071587,0.022053,0.060774,0.363701,-0.009335,0.02131,0.061525,0.101134,0.030967,0.013313,0.001686,-0.020741,0.135726,-0.03005,0.057966,0.132041,-0.057169,0.057169,0.007321,0.110073,-0.122021,0.004627,0.010202,-0.010202,-0.00618,0.00618,-0.025679,0.025679,0.057187,-0.057187,-0.023286,0.023286,-0.030433,0.033335,-0.030551,0.077427,-0.046876,0.028387,0.012596,-0.040983,-0.02698,-0.037322,0.064303,-0.060838,-0.058219,0.081512,-0.018739,-0.012875,-0.006357,-0.065183,0.107711,-0.004557,-0.015976,0.015976,0.117726,0.003009,-0.013652,0.013652,-0.034958,0.066582,-0.031624,0.021772,-0.038312,-0.167354,-0.130018,-0.114733,0.032357,-0.038738,0.042483,0.035259,0.019549,0.007055,0.016771
16.631579,0.030048,0.218724,0.123741,0.073251,0.071706,0.023497,0.059226,0.355436,-0.009966,0.025093,0.062561,0.095062,0.031504,0.013962,0.00201,-0.020691,0.13583,-0.029648,0.059695,0.11748,-0.05677,0.05677,0.002537,0.096804,-0.098388,-0.000953,0.009804,-0.009804,-0.006399,0.006399,-0.02571,0.02571,0.056076,-0.056076,-0.024554,0.024554,-0.030653,0.033542,-0.029142,0.073811,-0.044669,0.029554,0.008919,-0.038473,-0.026776,-0.036256,0.063032,-0.046701,-0.041968,0.081,-0.014043,-0.00894,-0.001482,-0.056727,0.08164,-0.000449,-0.016443,0.016443,0.111057,0.005296,-0.01475,0.01475,-0.024624,0.047829,-0.023205,0.022283,-0.037371,-0.161959,-0.125922,-0.082214,0.02837,-0.042464,0.037235,0.031612,0.013916,0.002828,0.010717
21.842105,0.031482,0.218356,0.122305,0.073034,0.071864,0.024694,0.058756,0.347486,-0.010415,0.02857,0.063567,0.09168,0.031941,0.014519,0.002302,-0.020607,0.135793,-0.029249,0.060902,0.1079,-0.056385,0.056385,-0.000218,0.087123,-0.082437,-0.004469,0.009284,-0.009284,-0.006545,0.006545,-0.025574,0.025574,0.054936,-0.054936,-0.02562,0.02562,-0.030706,0.033448,-0.028014,0.070622,-0.042608,0.029883,0.006879,-0.036761,-0.026547,-0.035215,0.061762,-0.039346,-0.032556,0.080377,-0.011025,-0.006635,0.001783,-0.050514,0.064434,0.001958,-0.016873,0.016873,0.105199,0.00684,-0.015763,0.015763,-0.018185,0.03667,-0.018485,0.02282,-0.03651,-0.15613,-0.122093,-0.063907,0.026087,-0.044204,0.033932,0.029781,0.010493,0.000528,0.00729
27.052632,0.032798,0.217892,0.120903,0.072827,0.072029,0.025732,0.058727,0.339979,-0.010756,0.031774,0.064527,0.089582,0.032318,0.015017,0.002568,-0.020509,0.135683,-0.028856,0.061831,0.101109,-0.056001,0.056001,-0.001855,0.079634,-0.070943,-0.006835,0.008717,-0.008717,-0.006648,0.006648,-0.025346,0.025346,0.053817,-0.053817,-0.026538,0.026538,-0.030649,0.033221,-0.027122,0.067816,-0.040693,0.029843,0.005592,-0.035435,-0.026308,-0.034226,0.060534,-0.034932,-0.026173,0.079702,-0.008936,-0.00521,0.004149,-0.045667,0.052214,0.003449,-0.017266,0.017266,0.100033,0.007967,-0.016704,0.016704,-0.013672,0.029206,-0.015534,0.023301,-0.035713,-0.150356,-0.118499,-0.052181,0.024552,-0.045027,0.031543,0.028749,0.008152,-0.000896,0.005108
32.263158,0.034017,0.217362,0.11954,0.072623,0.072188,0.026656,0.058906,0.332928,-0.011027,0.034734,0.065439,0.088192,0.032654,0.015474,0.002813,-0.020405,0.135531,-0.028471,0.062595,0.096035,-0.055613,0.055613,-0.002829,0.073604,-0.062266,-0.008509,0.008136,-0.008136,-0.006722,0.006722,-0.025066,0.025066,0.052738,-0.052738,-0.02734,0.02734,-0.030514,0.032935,-0.026421,0.065332,-0.038911,0.029618,0.004712,-0.03433,-0.026066,-0.033295,0.059361,-0.032054,-0.021419,0.079001,-0.007411,-0.004298,0.005963,-0.041733,0.043076,0.004402,-0.017625,0.017625,0.09545,0.008828,-0.017581,0.017581,-0.01027,0.023828,-0.013558,0.023709,-0.034966,-0.144801,-0.115118,-0.044036,0.02341,-0.045356,0.029665,0.028121,0.006435,-0.001852,0.003613
37.473684,0.035153,0.216783,0.118214,0.072417,0.072338,0.027489,0.059189,0.326313,-0.011248,0.037475,0.066303,0.087228,0.032959,0.015897,0.00304,-0.0203,0.135352,-0.028093,0.063251,0.092089,-0.055223,0.055223,-0.003386,0.068607,-0.055482,-0.009738,0.00756,-0.00756,-0.006777,0.006777,-0.024756,0.024756,0.051706,-0.051706,-0.028049,0.028049,-0.030321,0.032622,-0.025871,0.063118,-0.037247,0.029293,0.004075,-0.033367,-0.025825,-0.032424,0.058249,-0.030074,-0.017659,0.078289,-0.006252,-0.003706,0.007414,-0.038454,0.035979,0.005019,-0.017953,0.017953,0.091357,0.009507,-0.018403,0.018403,-0.007572,0.019746,-0.012174,0.024046,-0.034262,-0.139521,-0.111932,-0.038056,0.022499,-0.04539,0.028108,0.027714,0.005117,-0.002529,0.002535
42.684211,0.036216,0.216169,0.116926,0.072211,0.072477,0.028251,0.059526,0.320105,-0.01143,0.04002,0.067121,0.086538,0.033241,0.016293,0.003251,-0.020193,0.135157,-0.027723,0.063834,0.088924,-0.054831,0.054831,-0.003672,0.064375,-0.050032,-0.010671,0.007,-0.007,-0.006819,0.006819,-0.02443,0.02443,0.050724,-0.050724,-0.02868,0.02868,-0.030084,0.032301,-0.025444,0.061132,-0.035688,0.028911,0.003593,-0.032504,-0.025588,-0.031609,0.057197,-0.028662,-0.01456,0.077575,-0.005344,-0.003321,0.008613,-0.035665,0.030302,0.005416,-0.018254,0.018254,0.087679,0.010052,-0.019175,0.019175,-0.005354,0.016528,-0.011173,0.024317,-0.033594,-0.134529,-0.108925,-0.033481,0.021739,-0.045238,0.02677,0.027434,0.004075,-0.003029,0.00173
47.894737,0.037214,0.215527,0.115674,0.072003,0.072604,0.028954,0.059887,0.314271,-0.011583,0.042386,0.067896,0.086032,0.033505,0.016665,0.003448,-0.020087,0.13495,-0.027361,0.064362,0.086323,-0.054437,0.054437,-0.003776,0.06073,-0.045557,-0.011397,0.006459,-0.006459,-0.00685,0.00685,-0.024097,0.024097,0.049791,-0.049791,-0.029243,0.029243,-0.029814,0.031981,-0.025117,0.059338,-0.034221,0.028498,0.003217,-0.031716,-0.025355,-0.030848,0.056203,-0.027628,-0.011928,0.076865,-0.004615,-0.003076,0.009628,-0.033258,0.025656,0.005665,-0.018531,0.018531,0.084355,0.010498,-0.019902,0.019902,-0.003483,0.013915,-0.010432,0.02453,-0.032958,-0.12982,-0.106083,-0.029871,0.021082,-0.044964,0.025591,0.027229,0.003232,-0.003409,0.001111


In [24]:
# Next I will try Lasso with some different tuning alphas
alphas = np.linspace(0.01, 0.2, 20)
lasso.set_params(normalize=False)
coefs_lasso = []

for alpha in alphas:
    lasso.set_params(alpha=alpha)
    lasso.fit(X, target_log)
    print('Alpha is: %.2f' % (alpha))
    print('The Lasso intercept is: %.2f' % (lasso.intercept_))
    print('The Lasso R2 is: %.2f' % (lasso.score(X, target_log)))
    coefs_lasso.append(lasso.coef_)

Alpha is: 0.01
The Lasso intercept is: 0.03
The Lasso R2 is: 0.89
Alpha is: 0.02
The Lasso intercept is: 0.02
The Lasso R2 is: 0.88
Alpha is: 0.03
The Lasso intercept is: -0.00
The Lasso R2 is: 0.87
Alpha is: 0.04
The Lasso intercept is: -0.00
The Lasso R2 is: 0.87
Alpha is: 0.05
The Lasso intercept is: -0.00
The Lasso R2 is: 0.86
Alpha is: 0.06
The Lasso intercept is: -0.00
The Lasso R2 is: 0.85
Alpha is: 0.07
The Lasso intercept is: -0.00
The Lasso R2 is: 0.85
Alpha is: 0.08
The Lasso intercept is: -0.00
The Lasso R2 is: 0.84
Alpha is: 0.09
The Lasso intercept is: -0.00
The Lasso R2 is: 0.84
Alpha is: 0.10
The Lasso intercept is: -0.00
The Lasso R2 is: 0.83
Alpha is: 0.11
The Lasso intercept is: -0.00
The Lasso R2 is: 0.82
Alpha is: 0.12
The Lasso intercept is: -0.00
The Lasso R2 is: 0.81
Alpha is: 0.13
The Lasso intercept is: -0.00
The Lasso R2 is: 0.81
Alpha is: 0.14
The Lasso intercept is: -0.00
The Lasso R2 is: 0.80
Alpha is: 0.15
The Lasso intercept is: -0.00
The Lasso R2 is: 0.

In [275]:
coefs_lasso = pd.DataFrame(coefs_lasso, index = alphas, columns = \
                          X.columns)

In [276]:
coefs_lasso

Unnamed: 0,LotFrontage,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,TotalBsmtSF,GrLivArea,BedroomAbvGr,TotRmsAbvGrd,Fireplaces,GarageArea,WoodDeckSF,OpenPorchSF,MoSold,YrSold,NeighType,Age,Baths,BsmtFinSF,MSZoning_Oth,MSZoning_RL,LotShape_IR1,LotShape_IR2,LotShape_IR3,LotShape_Reg,LandContour_Lvl,LandContour_Oth,LotConfig_Inside,LotConfig_Oth,LandSlope_Gtl,LandSlope_Oth,BldgType_1Fam,BldgType_Oth,RoofStyle_Gable,RoofStyle_Oth,MasVnrType_Brick,MasVnrType_Oth,ExterQual_Avg,ExterQual_Ex,ExterQual_Gd,ExterCond_Avg,ExterCond_Ex,ExterCond_Gd,Foundation_CBlock,Foundation_Oth,Foundation_PConc,BsmtQual_Avg,BsmtQual_Gd,BsmtExposure_Yes,BsmtFinType1_ALQ,BsmtFinType1_BLQ,BsmtFinType1_GLQ,BsmtFinType1_LwQ,BsmtFinType1_None,BsmtFinType1_Rec,HeatingQC_Avg,HeatingQC_Gd,CentralAir_Yes,Electrical_Yes,KitchenQual_Avg,KitchenQual_Gd,GarageType_Attchd,GarageType_None,GarageType_Oth,PavedDrive_Yes,Fence_Yes,2Story_Y,RoadRail_Yes,Exterior_BrkComm,Exterior_CemntBd,Exterior_HdBoard,Exterior_MetalSd,Exterior_Other,Exterior_VinylSd,Exterior_Wd Sdng,Exterior_WdShing
0.01,0.045516,0.251862,0.103985,0.130961,0.065595,0.011655,0.057619,0.369693,-0.0,0.0,0.065495,0.080642,0.031483,0.005864,0.0,-0.009879,0.14517,-0.0,0.049943,0.078323,-0.073731,7.882989e-18,0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.04055,-3.7284250000000004e-17,-0.009753,2.9851310000000005e-17,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,-0.0,0.030986,-0.0,0.0,0.060126,0.0,-0.0,0.0,-0.0,-0.0,0.0,-0.015351,0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,-0.0,-0.135817,-0.048148,-0.0,0.0,-0.0,0.0,0.0,0.0,-0.0,0.0
0.02,0.062201,0.265204,0.080296,0.129182,0.074556,0.006237,0.060946,0.336361,-0.0,0.0,0.068408,0.079433,0.03252,0.0,0.0,-0.00089,0.14689,-0.0,0.048228,0.080561,-0.028787,0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,0.029993,0.0,-0.0,0.0,-0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,-0.0,-0.066541,-0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0,-0.0,-0.0
0.03,0.070371,0.271813,0.0574,0.119318,0.077885,0.003197,0.060654,0.303644,-0.0,0.0,0.070442,0.077432,0.032579,0.0,0.0,-0.0,0.145181,-0.0,0.050066,0.083703,-0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0,-0.0,-0.0
0.04,0.064778,0.274013,0.036616,0.100723,0.08127,0.0,0.0575,0.298205,-0.0,0.0,0.067072,0.075453,0.028507,0.0,0.0,-0.0,0.143784,-0.0,0.053328,0.078289,-0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0,-0.0,-0.0
0.05,0.059153,0.275455,0.015777,0.081513,0.085091,0.0,0.054335,0.29201,-0.0,0.0,0.063637,0.073478,0.024373,0.0,0.0,-0.0,0.142264,-0.0,0.056847,0.072453,-0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0,-0.0,-0.0
0.06,0.053436,0.276966,0.0,0.06625,0.086455,0.0,0.051149,0.286425,-0.0,0.0,0.06005,0.071091,0.019914,0.0,0.0,-0.0,0.140963,-0.0,0.060546,0.066073,-0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0,-0.0,-0.0
0.07,0.047439,0.278746,0.0,0.063287,0.080167,0.0,0.047894,0.282729,-0.0,0.0,0.055988,0.067423,0.014443,0.0,0.0,-0.0,0.140308,-0.0,0.064805,0.058,-0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0,-0.0,-0.0
0.08,0.041443,0.280527,0.0,0.060324,0.07388,0.0,0.04464,0.279033,-0.0,0.0,0.051925,0.063755,0.008971,0.0,0.0,-0.0,0.139653,-0.0,0.069063,0.049927,-0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0,-0.0,-0.0
0.09,0.035448,0.282308,0.0,0.05736,0.067592,0.0,0.041385,0.275337,-0.0,0.0,0.047862,0.060087,0.003499,0.0,0.0,-0.0,0.138998,-0.0,0.073322,0.041853,-0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0,-0.0,-0.0
0.1,0.029466,0.284108,0.0,0.054243,0.06118,0.0,0.038045,0.271453,0.0,0.0,0.043629,0.056354,0.0,0.0,0.0,-0.0,0.138377,-0.0,0.077351,0.033711,-0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0,-0.0,-0.0


In [277]:
elasticnet = ElasticNet(alpha=0.1, l1_ratio=0.5, normalize=False)
elasticnet.fit(X, target_log)
print(r'The R^2 is %.2f' %(elasticnet.score(X, target_log)))
print('The intercept is: %.2f' % (elasticnet.intercept_))
coefs_elasticnet = pd.DataFrame(elasticnet.coef_, index=X.columns.T)

The R^2 is 0.86


In [278]:
coefs_elasticnet = coefs_elasticnet.rename(columns={0 : 'Coef'})
coefs_elasticnet.Coef.sort_values(ascending = False)[:15]

GrLivArea       0.271137
OverallQual     0.261563
NeighType       0.142273
YearRemodAdd    0.088494
YearBuilt       0.075806
GarageArea      0.074909
Baths           0.071381
Fireplaces      0.069405
BsmtFinSF       0.062503
LotFrontage     0.061844
TotalBsmtSF     0.057195
WoodDeckSF      0.026114
OverallCond     0.010997
MasVnrArea      0.003994
TotRmsAbvGrd    0.000000
Name: Coef, dtype: float64

In [279]:
# This makes more sense - I will go through a few alphas for tuning
from sklearn.model_selection import GridSearchCV

parametersGrid = {"alpha" : [0.001, 0.01, 0.1, 1],
                 "l1_ratio" : np.arange(0.1, 1.0, 0.1)}
enet = ElasticNet()
grid = GridSearchCV(enet, parametersGrid, cv=10)
grid.fit(X, target_log)

enet_grid = pd.DataFrame(grid.cv_results_)

In [280]:
enet_grid.sort_values(by=['rank_test_score'])[:10]
# winner is alpha 0.01, l1 ratio 0.2
# scores 90-91

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_l1_ratio,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
10,0.005394,0.000221,0.001872,0.000138,0.01,0.2,"{'alpha': 0.01, 'l1_ratio': 0.2}",0.911232,0.912001,0.90233,0.859999,0.903652,0.88633,0.889216,0.907514,0.763733,0.889886,0.882589,0.042295,1
11,0.005953,0.001751,0.001835,0.000113,0.01,0.3,"{'alpha': 0.01, 'l1_ratio': 0.30000000000000004}",0.911068,0.911174,0.902496,0.861379,0.902607,0.884496,0.888262,0.907604,0.765355,0.889808,0.882425,0.041632,2
9,0.008112,0.001044,0.001982,0.000145,0.01,0.1,"{'alpha': 0.01, 'l1_ratio': 0.1}",0.911355,0.912457,0.901586,0.857757,0.904445,0.888327,0.889908,0.906993,0.760621,0.888838,0.882229,0.043325,3
8,0.021487,0.001895,0.00207,0.000143,0.001,0.9,"{'alpha': 0.001, 'l1_ratio': 0.9}",0.9118,0.912842,0.898107,0.855726,0.906509,0.888876,0.891641,0.905646,0.759064,0.888935,0.881915,0.043875,4
12,0.005454,0.00026,0.001922,0.000156,0.01,0.4,"{'alpha': 0.01, 'l1_ratio': 0.4}",0.910424,0.91065,0.901997,0.862511,0.901726,0.882785,0.886657,0.907207,0.765311,0.889094,0.881836,0.041378,5
7,0.021674,0.001369,0.002041,9.1e-05,0.001,0.8,"{'alpha': 0.001, 'l1_ratio': 0.8}",0.911816,0.912815,0.897912,0.855289,0.90644,0.889046,0.891396,0.905556,0.758642,0.888684,0.88176,0.043996,6
6,0.019365,0.001148,0.002157,0.000173,0.001,0.7,"{'alpha': 0.001, 'l1_ratio': 0.7000000000000001}",0.911829,0.912786,0.897717,0.854838,0.906373,0.889197,0.891073,0.905454,0.758194,0.888417,0.881588,0.044123,7
5,0.017519,0.001219,0.002149,0.00021,0.001,0.6,"{'alpha': 0.001, 'l1_ratio': 0.6}",0.911895,0.912791,0.897499,0.854374,0.906304,0.889311,0.890729,0.905398,0.757591,0.888197,0.881409,0.044301,8
4,0.01665,0.001435,0.002206,0.000247,0.001,0.5,"{'alpha': 0.001, 'l1_ratio': 0.5}",0.911931,0.912905,0.897272,0.854051,0.906248,0.889536,0.890364,0.905342,0.755874,0.887995,0.881152,0.04479,9
13,0.00587,0.000559,0.001969,0.000182,0.01,0.5,"{'alpha': 0.01, 'l1_ratio': 0.5}",0.909634,0.910176,0.901878,0.863279,0.90067,0.881065,0.884787,0.90689,0.76471,0.887872,0.881096,0.041308,10


In [281]:
# Forgot to remove categorical (dummy)variables that are redundant - mostly 'Nones'
cols_to_drop = ['RoadRail_None', '2Story_N', 'Fence_None', 'PavedDrive_None', 'Electrical_None', \
               'CentralAir_None', 'BsmtExposure_None', 'BsmtQual_None', 'MasVnrType_None']

test = df_train.copy()

In [246]:
path_to_write = '~/Desktop/Ames_ML_Project/Data/train_dummied_df_01_18_8p.csv'
test.to_csv(path_to_write)

In [287]:
# just noticed that NeighType should be a dummy var
# df_train['NeighType'] = df_train.NeighType.astype(str)
# I adjusted for the stored data - to rerun this will have to adjust

In [319]:
path_to_write = '~/Desktop/Ames_ML_Project/Data/final_df_01_18_9p.csv'
test.to_csv(path_to_write)