In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.style as style
style.use('fivethirtyeight')
import matplotlib.cm as cm
import helper

In [2]:
colors = ["#FF0B04", "#F1BE48",
           "#B9975B", "#8B5B29",
           "#524727",
         ]
sns.set_palette(sns.color_palette(colors))

In [3]:
df = pd.read_csv('Ames_Housing_Price_Data.csv', 
                             index_col=0,low_memory = False)

In [4]:
train, test = helper.data_processing_wrapper(df, num_to_cat_list=[], remove_PID=False)

In [5]:
train['LogSalePrice'] = np.log(train['SalePrice'])
test['LogSalePrice'] = np.log(test['SalePrice'])

In [6]:
nhds = train.groupby('Neighborhood').median()[['LogSalePrice', 'GrLivArea']]

In [7]:
weights = train.groupby('Neighborhood').count().apply(lambda x: x['PID']/len(train) ,axis=1).to_list()

In [8]:
scaler = StandardScaler()
_ = scaler.fit_transform(nhds)
clusterer = KMeans(n_clusters=2, random_state=42)
cluster_labels = clusterer.fit_predict(_, sample_weight=weights)
nhds['Cluster'] = cluster_labels

In [9]:
cluster_dict = nhds['Cluster'].to_dict()

In [10]:
train['NhdCluster'] = train.apply(lambda x: cluster_dict[x['Neighborhood']], axis=1)
test['NhdCluster'] = test.apply(lambda x: cluster_dict[x['Neighborhood']], axis=1)

In [11]:
X0_train = train.loc[train['NhdCluster']==0,:].drop(['SalePrice', 'LogSalePrice', 'PID', 'TotalBsmtSF', 'GrLivArea', 'NhdCluster', 'YearBuilt'], axis=1)
y0_train = train.loc[train['NhdCluster']==0, 'LogSalePrice']
X0_test = test.loc[test['NhdCluster']==0,:].drop(['SalePrice', 'LogSalePrice', 'PID', 'TotalBsmtSF', 'GrLivArea', 'NhdCluster', 'YearBuilt'], axis=1)
y0_test = test.loc[test['NhdCluster']==0, 'LogSalePrice']
X1_train = train.loc[train['NhdCluster']==1,:].drop(['SalePrice', 'LogSalePrice', 'PID', 'TotalBsmtSF', 'GrLivArea', 'NhdCluster', 'YearBuilt'], axis=1)
y1_train = train.loc[train['NhdCluster']==1, 'LogSalePrice']
X1_test = test.loc[test['NhdCluster']==1,:].drop(['SalePrice', 'LogSalePrice', 'PID', 'TotalBsmtSF', 'GrLivArea', 'NhdCluster', 'YearBuilt'], axis=1)
y1_test = test.loc[test['NhdCluster']==1, 'LogSalePrice']

In [12]:
print(X0_train.shape)
print(X1_train.shape)
print('\n')
print(y0_train.shape)
print(y1_train.shape)
print('\n')
print(X0_test.shape)
print(X1_test.shape)
print('\n')
print(y0_test.shape)
print(y1_test.shape)

(896, 76)
(975, 76)


(896,)
(975,)


(298, 76)
(326, 76)


(298,)
(326,)


In [13]:
categorical = train.select_dtypes(['object','bool']).columns.to_list() + ['MSSubClass']

## Lasso for selection.

In [14]:
def Lasso_select(X, y, alpha):

    pipe = Pipeline(
        [
            ('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), categorical)], 
                                                remainder='passthrough')),
            ('scaler', StandardScaler(with_mean=False))
        ]
    )
    
    X = pipe.fit_transform(X)

    cv = KFold(n_splits=5, shuffle=True, random_state=42)

    cross = cross_val_score(Lasso(alpha=alpha, max_iter=10000), X, y, scoring='r2', cv=cv, n_jobs=-1)
    
    selector = SelectFromModel(Lasso(alpha=alpha, max_iter=10000))
    selector.fit(X,y)
    num_features = np.sum(selector.get_support())
    
    return cross, num_features

In [15]:
for alpha in np.logspace(-5, -1, 5):
    print(Lasso_select(X0_train,y0_train,alpha))

(array([0.91456043, 0.85695358, 0.85807852, 0.87017137, 0.90303759]), 188)
(array([0.92086387, 0.88282697, 0.88824016, 0.88109821, 0.90630287]), 181)
(array([0.93005267, 0.9223253 , 0.9396218 , 0.91396418, 0.9193932 ]), 125)
(array([0.89515744, 0.91965012, 0.91388371, 0.88961493, 0.88128992]), 40)
(array([0.56186356, 0.60762023, 0.6050676 , 0.63656076, 0.53320597]), 4)


In [16]:
for alpha in [0.004, 0.005, 0.006, 0.007, 0.008]:
    print(Lasso_select(X0_train,y0_train,alpha))

(array([0.91971845, 0.93209612, 0.93511749, 0.90570819, 0.91007141]), 77)
(array([0.91485651, 0.93117017, 0.93224605, 0.9032566 , 0.9062158 ]), 67)
(array([0.91099221, 0.92999069, 0.92907724, 0.90102105, 0.90209648]), 57)
(array([0.90712753, 0.92836059, 0.9257425 , 0.89812176, 0.89791104]), 53)
(array([0.9036009 , 0.92606709, 0.92194752, 0.89561763, 0.89283942]), 49)


In [17]:
for alpha in np.logspace(-5, -1, 5):
    print(Lasso_select(X1_train,y1_train,alpha))

  model = cd_fast.enet_coordinate_descent(


(array([0.82165402, 0.79833136, 0.81808475, 0.82150887, 0.86726067]), 204)
(array([0.8419849 , 0.82084351, 0.82251395, 0.82682604, 0.86896851]), 196)
(array([0.84202444, 0.85919383, 0.83041861, 0.85122252, 0.88224071]), 149)
(array([0.80627468, 0.85006077, 0.7817851 , 0.84684516, 0.8613164 ]), 45)
(array([0.38089434, 0.43264064, 0.38766252, 0.40955108, 0.41074173]), 3)


In [18]:
for alpha in [0.004, 0.005, 0.006, 0.007, 0.008]:
    print(Lasso_select(X0_train,y0_train,alpha))

(array([0.91971845, 0.93209612, 0.93511749, 0.90570819, 0.91007141]), 77)
(array([0.91485651, 0.93117017, 0.93224605, 0.9032566 , 0.9062158 ]), 67)
(array([0.91099221, 0.92999069, 0.92907724, 0.90102105, 0.90209648]), 57)
(array([0.90712753, 0.92836059, 0.9257425 , 0.89812176, 0.89791104]), 53)
(array([0.9036009 , 0.92606709, 0.92194752, 0.89561763, 0.89283942]), 49)


## Will try working with 67 and 78 (one-hot encoded) features respectively.

## Ridge for robustness.

In [19]:
X = X0_train
y = y0_train

pipe = Pipeline([('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown='ignore'), categorical)], 
                                            remainder='passthrough')),
                 ('scaler', StandardScaler()),
                 ('selector', SelectFromModel(Lasso(alpha=0.006, max_iter=10000))),
                 ('ridge', Ridge())])


param_grid = {'ridge__alpha':[0.001, 0.1, 1, 10]}

cv = KFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(pipe, param_grid, scoring='r2', cv=cv, n_jobs=-1)
grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)

{'ridge__alpha': 1}
0.9161875613360577


In [20]:
X = X0_train
y = y0_train

pipe = Pipeline([('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown='ignore'), categorical)], 
                                            remainder='passthrough')),
                 ('scaler', StandardScaler()),
                 ('selector', SelectFromModel(Lasso(alpha=0.006, max_iter=10000))),
                 ('ridge', Ridge())])

param_grid = {'ridge__alpha':np.linspace(0.1, 10, 1000)}

cv = KFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(pipe, param_grid, scoring='r2', cv=cv, n_jobs=-1)
grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)

{'ridge__alpha': 1.338738738738739}
0.9161882335073619


## Trying Lasso on Cluster 1

In [21]:
X = X1_train
y = y1_train

pipe = Pipeline([('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown='ignore'), categorical)], 
                                            remainder='passthrough')),
                 ('scaler', StandardScaler()),
                 ('selector', SelectFromModel(Lasso(alpha=0.008, max_iter=10000))),
                 ('lasso', Lasso())])


param_grid = {'lasso__alpha':[0.001, 0.1, 1, 10]}

cv = KFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(pipe, param_grid, scoring='r2', cv=cv, n_jobs=-1)
grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)

{'lasso__alpha': 0.001}
0.8499852266119621


In [22]:
X = X1_train
y = y1_train

pipe = Pipeline([('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown='ignore'), categorical)], 
                                            remainder='passthrough')),
                 ('scaler', StandardScaler()),
                 ('selector', SelectFromModel(Lasso(alpha=0.008, max_iter=10000))),
                 ('lasso', Lasso())])

param_grid = {'lasso__alpha':np.linspace(0.001, 0.1, 100)}

cv = KFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(pipe, param_grid, scoring='r2', cv=cv, n_jobs=-1)
grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)

{'lasso__alpha': 0.002}
0.8502666856256418


## Now putting it all together.

In [23]:
pipe = Pipeline([('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown='ignore'), categorical)], 
                                            remainder='passthrough')),
                 ('scaler', StandardScaler()),
                 ('selector', SelectFromModel(Lasso(alpha=0.006, max_iter=10000))),
                 ('ridge', Ridge(alpha=1.33))])

pipe.fit(X0_train, y0_train)

cluster0_train_predict = pipe.predict(X0_train)
cluster0_test_predict = pipe.predict(X0_test)

pipe = Pipeline([('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown='ignore'), categorical)], 
                                            remainder='passthrough')),
                 ('scaler', StandardScaler()),
                 ('selector', SelectFromModel(Lasso(alpha=0.008, max_iter=10000))),
                 ('lasso', Lasso(alpha=0.002))])

pipe.fit(X1_train, y1_train)

cluster1_train_predict = pipe.predict(X1_train)
cluster1_test_predict = pipe.predict(X1_test)

In [24]:
cluster0_train_predict = pd.DataFrame(cluster0_train_predict).rename(columns={0:'prediction'})
cluster1_train_predict = pd.DataFrame(cluster1_train_predict).rename(columns={0:'prediction'})
cluster0_test_predict = pd.DataFrame(cluster0_test_predict).rename(columns={0:'prediction'})
cluster1_test_predict = pd.DataFrame(cluster1_test_predict).rename(columns={0:'prediction'})

train_predict = pd.concat([cluster0_train_predict, cluster1_train_predict])
test_predict = pd.concat([cluster0_test_predict, cluster1_test_predict])
train_target = pd.concat([y0_train, y1_train])
test_target = pd.concat([y0_test, y1_test])

In [25]:
print(len(train_predict))
print(len(test_predict))
print(len(train_target))
print(len(test_target))

1871
624
1871
624


In [26]:
print(f'Train score is {r2_score(train_target, train_predict)}')
print(f'Test score is {r2_score(test_target, test_predict)}')

Train score is 0.9500854728304114
Test score is 0.9128517226365968


In [27]:
print(f'Cluster 0 train score is {r2_score(y0_train, cluster0_train_predict)}')
print(f'Cluster 0 test score is {r2_score(y0_test, cluster0_test_predict)}')
print('\n')
print(f'Cluster 1 train score is {r2_score(y1_train, cluster1_train_predict)}')
print(f'Cluster 1 test score is {r2_score(y1_test, cluster1_test_predict)}')

Cluster 0 train score is 0.9447537600032996
Cluster 0 test score is 0.9276351864439013


Cluster 1 train score is 0.8818329225652296
Cluster 1 test score is 0.752404436348818


In [28]:
def RSS(y_true, y_predict):
    y_true = np.array(y_true)
    y_predict = np.array(y_predict)
    return np.sum((y_true - y_predict)**2)

In [29]:
def TSS(y_true):
    y_true = np.array(y_true)
    return np.sum((y_true - np.mean(y_true))**2)

In [30]:
print(RSS(train_target, train_predict))
print(TSS(train_target))
print(RSS(test_target, test_predict))
print(TSS(test_target))
print(RSS(y0_train, cluster0_train_predict))
print(TSS(y0_train))
print(RSS(y1_train, cluster1_train_predict))
print(TSS(y1_train))
print(RSS(y0_test, cluster0_test_predict))
print(TSS(y0_test))
print(RSS(y1_test, cluster1_test_predict))
print(TSS(y1_test))

989221.4501513942
272.6363658517055
108153.47786246188
88.77449859589736
149602.0224633217
85.91293539289033
134835.50123301518
74.99676589766047
15219.931281359679
26.09529960518662
14481.062692645557
23.619821977309215


## Take a closer look at cluster features.

In [31]:
pipe = Pipeline(
        [
            ('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), categorical)], 
                                                remainder='passthrough')),
            ('scaler', StandardScaler())
        ]
    )
    
X = pipe.fit_transform(X0_train)
y = y0_train

selector = SelectFromModel(Lasso(alpha=0.006, max_iter=10000))
selector.fit(X,y)
mask0 = selector.get_support()
feat_names = pipe.named_steps['transformer'].get_feature_names()
names0 = [name for name, boo in zip(feat_names, mask0) if boo]
names0

['Cat__x0_RH',
 'Cat__x3_HLS',
 'Cat__x5_CulDSac',
 'Cat__x5_FR2',
 'Cat__x6_Mod',
 'Cat__x7_CollgCr',
 'Cat__x7_GrnHill',
 'Cat__x7_NWAmes',
 'Cat__x7_NoRidge',
 'Cat__x7_NridgHt',
 'Cat__x7_SawyerW',
 'Cat__x7_Somerst',
 'Cat__x7_StoneBr',
 'Cat__x7_Timber',
 'Cat__x8_Feedr',
 'Cat__x8_Norm',
 'Cat__x10_1Fam',
 'Cat__x10_Twnhs',
 'Cat__x12_Flat',
 'Cat__x14_BrkFace',
 'Cat__x14_HdBoard',
 'Cat__x15_HdBoard',
 'Cat__x15_Wd Shng',
 'Cat__x17_PConc',
 'Cat__x21_N',
 'Cat__x24_Basment',
 'Cat__x27_New',
 'Cat__x29_30',
 'Cat__x29_60',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearRemodAdd',
 'MasVnrArea',
 'ExterQual',
 'ExterCond',
 'BsmtQual',
 'BsmtExposure',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'HeatingQC',
 '1stFlrSF',
 '2ndFlrSF',
 'BsmtFullBath',
 'HalfBath',
 'KitchenQual',
 'TotRmsAbvGrd',
 'Fireplaces',
 'FireplaceQu',
 'GarageCars',
 'GarageArea',
 'PavedDrive',
 'OpenPorchSF',
 'EnclosedPorch',
 'ScreenPorch',
 'PoolQC']

In [32]:
pipe = Pipeline(
        [
            ('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), categorical)], 
                                                remainder='passthrough')),
            ('scaler', StandardScaler())
        ]
    )
    
X = pipe.fit_transform(X1_train)
y = y1_train

selector = SelectFromModel(Lasso(alpha=0.006, max_iter=10000))
selector.fit(X,y)
mask1 = selector.get_support()
feat_names = pipe.named_steps['transformer'].get_feature_names()
names1 = [name for name, boo in zip(feat_names, mask1) if boo]
names1

['Cat__x0_C (all)',
 'Cat__x0_RL',
 'Cat__x2_IR2',
 'Cat__x2_IR3',
 'Cat__x3_Lvl',
 'Cat__x7_BrkSide',
 'Cat__x7_Edwards',
 'Cat__x7_MeadowV',
 'Cat__x7_Mitchel',
 'Cat__x7_OldTown',
 'Cat__x8_Norm',
 'Cat__x10_1Fam',
 'Cat__x10_Twnhs',
 'Cat__x13_WdShngl',
 'Cat__x14_AsbShng',
 'Cat__x14_BrkFace',
 'Cat__x14_PreCast',
 'Cat__x15_CBlock',
 'Cat__x15_PreCast',
 'Cat__x15_VinylSd',
 'Cat__x17_BrkTil',
 'Cat__x17_PConc',
 'Cat__x18_ALQ',
 'Cat__x18_GLQ',
 'Cat__x18_LwQ',
 'Cat__x18_Unf',
 'Cat__x21_N',
 'Cat__x23_Maj2',
 'Cat__x23_Typ',
 'Cat__x24_Attchd',
 'Cat__x27_Con',
 'Cat__x28_Normal',
 'Cat__x29_30',
 'Cat__x29_160',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearRemodAdd',
 'ExterQual',
 'BsmtQual',
 'BsmtExposure',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'HeatingQC',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'BsmtFullBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenQual',
 'TotRmsAbvGrd',
 'Fireplaces',
 'FireplaceQu',
 'GarageFinish',
 'GarageCars',
 'GarageArea',
 'Paved

In [33]:
[name for name in names0 if name not in names1]

['Cat__x0_RH',
 'Cat__x3_HLS',
 'Cat__x5_CulDSac',
 'Cat__x5_FR2',
 'Cat__x6_Mod',
 'Cat__x7_CollgCr',
 'Cat__x7_GrnHill',
 'Cat__x7_NWAmes',
 'Cat__x7_NoRidge',
 'Cat__x7_NridgHt',
 'Cat__x7_SawyerW',
 'Cat__x7_Somerst',
 'Cat__x7_StoneBr',
 'Cat__x7_Timber',
 'Cat__x8_Feedr',
 'Cat__x12_Flat',
 'Cat__x14_HdBoard',
 'Cat__x15_HdBoard',
 'Cat__x15_Wd Shng',
 'Cat__x24_Basment',
 'Cat__x27_New',
 'Cat__x29_60',
 'LotFrontage',
 'MasVnrArea',
 'ExterCond',
 'OpenPorchSF',
 'EnclosedPorch',
 'PoolQC']

In [34]:
[name for name in names1 if name not in names0]

['Cat__x0_C (all)',
 'Cat__x0_RL',
 'Cat__x2_IR2',
 'Cat__x2_IR3',
 'Cat__x3_Lvl',
 'Cat__x7_BrkSide',
 'Cat__x7_Edwards',
 'Cat__x7_MeadowV',
 'Cat__x7_Mitchel',
 'Cat__x7_OldTown',
 'Cat__x13_WdShngl',
 'Cat__x14_AsbShng',
 'Cat__x14_PreCast',
 'Cat__x15_CBlock',
 'Cat__x15_PreCast',
 'Cat__x15_VinylSd',
 'Cat__x17_BrkTil',
 'Cat__x18_ALQ',
 'Cat__x18_GLQ',
 'Cat__x18_LwQ',
 'Cat__x18_Unf',
 'Cat__x23_Maj2',
 'Cat__x23_Typ',
 'Cat__x24_Attchd',
 'Cat__x27_Con',
 'Cat__x28_Normal',
 'Cat__x29_160',
 'LowQualFinSF',
 'FullBath',
 'BedroomAbvGr',
 'GarageFinish',
 'WoodDeckSF',
 '3SsnPorch']