In [5]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.style as style
import helper
style.use('fivethirtyeight')

In [6]:
colors = ["#FF0B04", "#F1BE48",
           "#B9975B", "#8B5B29",
           "#524727",
         ]
sns.set_palette(sns.color_palette(colors))

## Data processing

In [19]:
housing = pd.read_csv('Ames_Housing_Price_Data.csv', index_col=0,low_memory = False)


train_, test_ = helper.data_processing_wrapper(housing)

In [20]:
train = pd.read_csv('train_clean.csv', index_col=0)
test = pd.read_csv('test_clean.csv', index_col=0)

In [21]:
cat_feats_ = train_.select_dtypes(['object','bool']).columns.to_list()
cat_feats_ = cat_feats + ['YearBuilt','KitchenQual','GarageCars', 'BedroomAbvGr']

In [22]:
num_cols_ = train_.select_dtypes(['float64','int64']).columns.to_list()
num_cols_.remove('SalePrice')


In [23]:
cat_feats = train.select_dtypes(['object','bool']).columns.to_list()
cat_feats = cat_feats + ['YearBuilt','KitchenQual','GarageCars', 'BedroomAbvGr']

In [24]:
num_cols = train.select_dtypes(['float64','int64']).columns.to_list()
num_cols.remove('SalePrice')
num_cols.remove('PID')

## Simple linear model

In [25]:
def simple_linear_model_score(train, test, cols, target):
    ols = linear_model.LinearRegression()
    X = train[cols]
    cat_feats = X.select_dtypes(['object','bool']).columns.to_list()
    transformer = ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), cat_feats)], remainder='passthrough')
    X = transformer.fit_transform(X)
    y = np.log(train[target])
    
    ols.fit(X, y)
    train_score = ols.score(X, y)
    
    X_tst = test[cols]
    X_tst = transformer.transform(X_tst)
    y_tst = np.log(test[target])
    test_score = ols.score(X_tst, y_tst)
    
    return train_score, test_score

The kitchen sink model of every feature over learns. And has a negative test score

In [26]:
simple_linear_model_score(train, test, cat_feats+num_cols, 'SalePrice')

(0.9577940526767014, -50366487498.91418)

The numerical kitchen sink model is very successful

In [13]:
simple_linear_model_score(train, test, num_cols, 'SalePrice')

(0.9290445015300175, 0.8849212267144508)

In [28]:
simple_linear_model_score(train_, test_, num_cols, 'SalePrice')

(0.9301860929679749, 0.8859787117907402)

The numerical kitchen sink model gets better when we selectively pick our favorite categorical features

In [9]:
fav_cat = ['YearBuilt', 'ExterQual', 'Neighborhood', 'KitchenQual']
simple_linear_model_score(train, test, num_cols+fav_cat, 'SalePrice')

(0.9418781919970982, 0.8991659160993932)

In [29]:
fav_cat = ['YearBuilt', 'ExterQual', 'Neighborhood', 'KitchenQual']
simple_linear_model_score(train_, test_, num_cols+fav_cat, 'SalePrice')

(0.9429838134276152, 0.9000142390219559)

We can see that as we add more and more numerical features our model gets better.

In [10]:
k=100
cols = train.corr().nlargest(k, 'SalePrice')['SalePrice'].index
cols = cols[1:]
for num_features in range(1,len(cols)):
    print(cols[num_features])
    print(simple_linear_model_score(train, test, cols[1:1+num_features], 'SalePrice'))

GrLivArea
(0.5361242929283243, 0.5052486795731859)
ExterQual
(0.7034962809602388, 0.6644905975921633)
TotalBsmtSF
(0.7792598241555013, 0.7229657933399387)
KitchenQual
(0.7974247390402703, 0.7489550650148744)
1stFlrSF
(0.7975069805128663, 0.7497354661616471)
GarageArea
(0.8182544807598195, 0.779205489561503)
GarageCars
(0.8229613212881246, 0.7862240548493213)
BsmtQual
(0.8367316241545875, 0.8093688234853456)
YearBuilt
(0.8496799637760847, 0.8201252110298075)
FullBath
(0.8515427358211811, 0.8194532379214836)
GarageFinish
(0.8546721816101726, 0.8239315804336104)
FireplaceQu
(0.862791589110423, 0.8355690508933211)
MasVnrArea
(0.863014924192381, 0.8357178472479138)
TotRmsAbvGrd
(0.8634271653730479, 0.8341419606401707)
YearRemodAdd
(0.8695869148482882, 0.8386413512424243)
Fireplaces
(0.8735302259811796, 0.8403365999459429)
BsmtFinSF1
(0.8805143918287985, 0.8482424745087166)
HeatingQC
(0.8819825209355477, 0.8512038129217325)
BsmtExposure
(0.8837381955399547, 0.8524812209245907)
LotFrontage
(0

In [30]:
k=100
cols = train_.corr().nlargest(k, 'SalePrice')['SalePrice'].index
cols = cols[1:]
for num_features in range(1,len(cols)):
    print(cols[num_features])
    print(simple_linear_model_score(train_, test_, cols[1:1+num_features], 'SalePrice'))

GrLivArea
(0.5361242929283243, 0.5052486795731859)
ExterQual
(0.7034962809602388, 0.6644905975921633)
TotalBsmtSF
(0.7792598241555013, 0.7229657933399387)
KitchenQual
(0.7974247390402703, 0.7489550650148744)
1stFlrSF
(0.7975069805128663, 0.7497354661616471)
GarageArea
(0.8182544807598195, 0.779205489561503)
BsmtQual
(0.8342641233283596, 0.8060471284296409)
YearBuilt
(0.8489363096053784, 0.8185587963092968)
FullBath
(0.8505082900397859, 0.8176308304541391)
GarageFinish
(0.8542220076535905, 0.8228187288247628)
FireplaceQu
(0.8626195006512688, 0.8350800445975372)
MasVnrArea
(0.8628483702101578, 0.8352495571126983)
TotRmsAbvGrd
(0.8632383769769689, 0.8336224655514265)
YearRemodAdd
(0.8694099127364687, 0.838164057448497)
Fireplaces
(0.8733877848875367, 0.8399358974204559)
BsmtFinSF1
(0.880251013075894, 0.8476901670373542)
HeatingQC
(0.8817333279495017, 0.850659889034522)
BsmtExposure
(0.88347050795087, 0.8519796400713437)
LotFrontage
(0.8863651852994867, 0.8544557162923521)
WoodDeckSF
(0.88

## Problem section

Can do lasso feature selction. Train a linear model with those selected features. But I get a dimension mismatch when trying to score the test dataset. 

In [31]:
scaler = StandardScaler(with_mean=False)

In [32]:
ols = linear_model.LinearRegression()

In [85]:
train_

Unnamed: 0,GrLivArea,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1296,90000,90,RL,72.000000,10791,Pave,0,Reg,Lvl,...,0,0,0,,Shed,500,10,2006,WD,Normal
1,1229,137000,160,RM,24.000000,1488,Pave,0,Reg,Lvl,...,0,0,0,GdPrv,,0,10,2009,WD,Normal
2,948,89000,160,RM,21.000000,1680,Pave,0,Reg,Lvl,...,0,0,0,,,0,7,2006,WD,Normal
3,1040,123900,50,RM,52.000000,6240,Pave,0,Reg,Lvl,...,0,0,0,,,0,5,2010,WD,Normal
4,912,156000,85,RL,61.990202,7540,Pave,0,IR1,Lvl,...,192,0,0,MnPrv,,0,6,2007,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1866,1200,151000,160,FV,30.000000,3182,Pave,2,Reg,Lvl,...,0,0,0,,,0,6,2010,WD,Normal
1867,2400,290000,60,RL,96.000000,13262,Pave,0,IR1,Lvl,...,0,0,0,,,0,6,2009,WD,Normal
1868,1346,112000,50,RL,70.000000,11767,Pave,0,Reg,Lvl,...,0,0,0,GdWo,,0,5,2007,WD,Normal
1869,2031,237000,60,RL,75.000000,9473,Pave,0,Reg,Lvl,...,0,0,0,,,0,3,2008,WD,Normal


In [86]:
test_

Unnamed: 0,GrLivArea,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1820,184000,50,RL,60.000000,9120,Pave,2,Reg,Lvl,...,0,0,0,GdPrv,,0,6,2008,WD,Normal
1,1296,104000,30,RL,67.000000,4853,Pave,0,Reg,Bnk,...,0,0,0,MnPrv,,0,5,2010,WD,Normal
2,1434,189000,20,RL,65.000000,8529,Pave,0,IR1,Lvl,...,0,0,0,,,0,4,2009,WD,Normal
3,1499,187000,60,RL,65.000000,12438,Pave,0,IR1,Lvl,...,0,0,0,,,0,8,2006,WD,Normal
4,1797,231000,60,RL,74.000000,12961,Pave,0,Reg,Lvl,...,0,0,0,,,0,3,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
619,1501,244000,120,RL,46.000000,4054,Pave,0,IR1,Lvl,...,0,0,0,,,0,10,2007,WD,Normal
620,2087,187500,60,RL,78.868989,12205,Pave,0,IR1,Low,...,0,0,0,,,0,7,2007,WD,Normal
621,1160,152500,20,RL,95.723848,17979,Pave,0,IR1,Lvl,...,0,0,0,GdWo,Shed,500,2,2008,WD,Normal
622,1865,235000,60,RL,72.509189,10316,Pave,0,IR1,Lvl,...,0,0,0,,,0,6,2008,WD,Normal


In [87]:
cat_feats

['MSSubClass',
 'MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'Foundation',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'CentralAir',
 'Electrical',
 'Functional',
 'GarageType',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition',
 'YearBuilt',
 'KitchenQual',
 'GarageCars',
 'BedroomAbvGr']

lasso with all columns 

In [108]:
X = train_.drop(['SalePrice', 'TotalBsmtSF'],axis=1)

transformer = ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), cat_feats)], remainder='passthrough')
X = transformer.fit_transform(X)

X = scaler.fit_transform(X)

y = np.log(train_['SalePrice'])

lasso = linear_model.Lasso(alpha=0.1)

selector = SelectFromModel(estimator=lasso)

X = selector.fit_transform(X, y)

ols.fit(X,y)
ols.score(X,y)


0.8596901489956692

In [109]:
X_tst = test_.drop(['SalePrice', 'TotalBsmtSF'],axis=1)
X_tst = transformer.transform(X_tst)
X_tst = scaler.transform(X_tst)
y_tst = np.log(test_['SalePrice'])
X_tst = selector.transform(X_tst)


In [110]:
ols.score(X_tst,y_tst)

0.8185227162036243

In [112]:
feat_names = transformer.get_feature_names()


mask = selector.get_support()

lasso_feats = [a for a, b in zip(feat_names, mask) if b]
print(lasso_feats)

['GrLivArea', 'OverallQual', 'BsmtQual', '1stFlrSF', 'GarageArea']


lasso remove grlivearea

In [113]:
X = train_.drop(['SalePrice', 'TotalBsmtSF','GrLivArea'],axis=1)

transformer = ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), cat_feats)], remainder='passthrough')
X = transformer.fit_transform(X)

X = scaler.fit_transform(X)

y = np.log(train_['SalePrice'])

lasso = linear_model.Lasso(alpha=0.1)

selector = SelectFromModel(estimator=lasso)

X = selector.fit_transform(X, y)

ols.fit(X,y)
ols.score(X,y)

0.8176024805155407

In [114]:
X_tst = test_.drop(['SalePrice', 'TotalBsmtSF','GrLivArea'],axis=1)
X_tst = transformer.transform(X_tst)
X_tst = scaler.transform(X_tst)
y_tst = np.log(test_['SalePrice'])
X_tst = selector.transform(X_tst)



In [115]:
ols.score(X_tst,y_tst)

0.779610763366811

In [116]:
feat_names = transformer.get_feature_names()


mask = selector.get_support()

lasso_feats = [a for a, b in zip(feat_names, mask) if b]
print(lasso_feats)

['OverallQual', '1stFlrSF', 'TotRmsAbvGrd', 'GarageArea']


lasso with numerical columns only

In [118]:
X = train_[num_cols_]
X = X.drop(['GrLivArea','TotalBsmtSF'],axis=1)

# transformer = ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), cat_feats)], remainder='passthrough')
# X = transformer.fit_transform(X)

X = scaler.fit_transform(X)

y = np.log(train_['SalePrice'])

lasso = linear_model.Lasso(alpha=0.1)

selector = SelectFromModel(estimator=lasso)

X = selector.fit_transform(X, y)

ols.fit(X,y)
ols.score(X,y)



0.8279153692167691

In [119]:
X_tst = test_[num_cols_]
X_tst = X_tst.drop(['GrLivArea','TotalBsmtSF'],axis=1)
X_tst = scaler.transform(X_tst)

# X_tst = transformer.transform(X_tst)
y_tst = np.log(test_['SalePrice'])
X_tst = selector.transform(X_tst)

In [120]:
ols.score(X_tst,y_tst)

0.7925941540357762

In [121]:
feat_names = transformer.get_feature_names()


mask = selector.get_support()

lasso_feats = [a for a, b in zip(feat_names, mask) if b]
print(lasso_feats)

['Cat__x0_180', 'Cat__x1_C (all)', 'Cat__x2_Pave', 'Cat__x3_IR1', 'Cat__x4_HLS']


model had slight improvement when GarageCars was float.

In [127]:
train_['GarageCars'] = train_['GarageCars'].astype('float')

In [128]:
test_['GarageCars'] = test_['GarageCars'].astype('float')

In [130]:
cat_feats.remove('GarageCars')

In [131]:
X = train_.drop(['SalePrice', 'TotalBsmtSF'],axis=1)

transformer = ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), cat_feats)], remainder='passthrough')
X = transformer.fit_transform(X)

X = scaler.fit_transform(X)

y = np.log(train_['SalePrice'])

lasso = linear_model.Lasso(alpha=0.1)

selector = SelectFromModel(estimator=lasso)

X = selector.fit_transform(X, y)

ols.fit(X,y)
ols.score(X,y)



0.8604843360147527

In [132]:
X_tst = test_.drop(['SalePrice', 'TotalBsmtSF'],axis=1)
X_tst = transformer.transform(X_tst)
X_tst = scaler.transform(X_tst)
y_tst = np.log(test_['SalePrice'])
X_tst = selector.transform(X_tst)



In [133]:
ols.score(X_tst,y_tst)

0.8197985544116193

We can see that the top 6 lasso features are different from what happens when selecting just the top 6 correlated features. Good job lasso!

In [134]:
feat_names = transformer.get_feature_names()


mask = selector.get_support()

lasso_feats = [a for a, b in zip(feat_names, mask) if b]
print(lasso_feats)

['GrLivArea', 'OverallQual', 'BsmtQual', '1stFlrSF', 'GarageCars', 'GarageArea']


GrLivArea
(0.5361242929283243, 0.5052486795731859)

ExterQual
(0.7034962809602388, 0.6644905975921633)

TotalBsmtSF
(0.7792598241555013, 0.7229657933399387)

KitchenQual
(0.7974247390402703, 0.7489550650148744)

1stFlrSF
(0.7975069805128663, 0.7497354661616471)

GarageArea
(0.8182544807598195, 0.779205489561503)