In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.style as style
style.use('fivethirtyeight')
import matplotlib.cm as cm
import helper

In [2]:
colors = ["#FF0B04", "#F1BE48",
           "#B9975B", "#8B5B29",
           "#524727",
         ]
sns.set_palette(sns.color_palette(colors))

In [3]:
df = pd.read_csv('Ames_Housing_Price_Data.csv', 
                             index_col=0,low_memory = False)

In [4]:
train, test = helper.data_processing_wrapper(df, num_to_cat_list=[], remove_PID=False)

In [5]:
train['LogSalePrice'] = np.log(train['SalePrice'])
test['LogSalePrice'] = np.log(test['SalePrice'])

In [6]:
nhds = train.groupby('Neighborhood').median()[['LogSalePrice', 'GrLivArea']]

In [7]:
weights = train.groupby('Neighborhood').count().apply(lambda x: x['PID']/len(train) ,axis=1).to_list()

In [8]:
scaler = StandardScaler()
_ = scaler.fit_transform(nhds)
clusterer = KMeans(n_clusters=2, random_state=42)
cluster_labels = clusterer.fit_predict(_, sample_weight=weights)
nhds['Cluster'] = cluster_labels

In [9]:
cluster_dict = nhds['Cluster'].to_dict()

In [10]:
train['NhdCluster'] = train.apply(lambda x: cluster_dict[x['Neighborhood']], axis=1)
test['NhdCluster'] = test.apply(lambda x: cluster_dict[x['Neighborhood']], axis=1)

In [11]:
X0_train = train.loc[train['NhdCluster']==0,:].drop(['SalePrice', 'LogSalePrice', 'PID', 'TotalBsmtSF', 'GrLivArea', 'NhdCluster'], axis=1)
y0_train = train.loc[train['NhdCluster']==0, 'LogSalePrice']
X0_test = test.loc[test['NhdCluster']==0,:].drop(['SalePrice', 'LogSalePrice', 'PID', 'TotalBsmtSF', 'GrLivArea', 'NhdCluster'], axis=1)
y0_test = test.loc[test['NhdCluster']==0, 'LogSalePrice']
X1_train = train.loc[train['NhdCluster']==1,:].drop(['SalePrice', 'LogSalePrice', 'PID', 'TotalBsmtSF', 'GrLivArea', 'NhdCluster'], axis=1)
y1_train = train.loc[train['NhdCluster']==1, 'LogSalePrice']
X1_test = test.loc[test['NhdCluster']==1,:].drop(['SalePrice', 'LogSalePrice', 'PID', 'TotalBsmtSF', 'GrLivArea', 'NhdCluster'], axis=1)
y1_test = test.loc[test['NhdCluster']==1, 'LogSalePrice']

In [12]:
print(X0_train.shape)
print(X1_train.shape)
print('\n')
print(y0_train.shape)
print(y1_train.shape)
print('\n')
print(X0_test.shape)
print(X1_test.shape)
print('\n')
print(y0_test.shape)
print(y1_test.shape)

(896, 77)
(975, 77)


(896,)
(975,)


(298, 77)
(326, 77)


(298,)
(326,)


In [13]:
categorical = train.select_dtypes(['object','bool']).columns.to_list() + ['MSSubClass']

## Lasso for selection.

In [14]:
def Lasso_select(X, y, alpha):

    pipe = Pipeline(
        [
            ('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), categorical)], 
                                                remainder='passthrough')),
            ('scaler', StandardScaler(with_mean=False))
        ]
    )
    
    X = pipe.fit_transform(X)

    cv = KFold(n_splits=5, shuffle=True, random_state=42)

    cross = cross_val_score(Lasso(alpha=alpha, max_iter=10000), X, y, scoring='r2', cv=cv, n_jobs=-1)
    
    selector = SelectFromModel(Lasso(alpha=alpha, max_iter=10000))
    selector.fit(X,y)
    num_features = np.sum(selector.get_support())
    
    return cross, num_features

In [15]:
for alpha in np.logspace(-5, -1, 5):
    print(Lasso_select(X0_train,y0_train,alpha))

(array([0.91746146, 0.87232791, 0.87758721, 0.88191031, 0.90808885]), 186)
(array([0.92382067, 0.88849718, 0.90373531, 0.88969178, 0.9104986 ]), 181)
(array([0.93148303, 0.9236936 , 0.9401986 , 0.91534401, 0.92077974]), 128)
(array([0.89515769, 0.91965032, 0.9138828 , 0.88961567, 0.88128994]), 40)
(array([0.56186356, 0.60762023, 0.6050676 , 0.63656076, 0.53320597]), 4)


In [16]:
for alpha in [0.004, 0.005, 0.006, 0.007, 0.008]:
    print(Lasso_select(X0_train,y0_train,alpha))

(array([0.91971913, 0.93209643, 0.93525994, 0.90570727, 0.91007427]), 78)
(array([0.91485736, 0.93116944, 0.93224535, 0.90325536, 0.90621718]), 67)
(array([0.91099258, 0.92999009, 0.9290773 , 0.90102034, 0.90209785]), 57)
(array([0.90712796, 0.92836031, 0.92574165, 0.89812118, 0.89791268]), 53)
(array([0.90360156, 0.92606697, 0.92194576, 0.8956178 , 0.89283903]), 49)


In [17]:
for alpha in np.logspace(-5, -1, 5):
    print(Lasso_select(X1_train,y1_train,alpha))

(array([0.8413848 , 0.79027596, 0.81367875, 0.82652539, 0.87304321]), 208)
(array([0.84624072, 0.81674294, 0.82041365, 0.83061305, 0.87453197]), 197)
(array([0.84698813, 0.85629204, 0.83963723, 0.8526459 , 0.88864524]), 150)
(array([0.80954862, 0.85234601, 0.78821007, 0.85062299, 0.86319727]), 44)
(array([0.38089434, 0.43264064, 0.38766252, 0.40955108, 0.41074173]), 3)


In [18]:
for alpha in [0.004, 0.005, 0.006, 0.007, 0.008]:
    print(Lasso_select(X0_train,y0_train,alpha))

(array([0.91971913, 0.93209643, 0.93525994, 0.90570727, 0.91007427]), 78)
(array([0.91485736, 0.93116944, 0.93224535, 0.90325536, 0.90621718]), 67)
(array([0.91099258, 0.92999009, 0.9290773 , 0.90102034, 0.90209785]), 57)
(array([0.90712796, 0.92836031, 0.92574165, 0.89812118, 0.89791268]), 53)
(array([0.90360156, 0.92606697, 0.92194576, 0.8956178 , 0.89283903]), 49)


## Will try working with 67 and 78 (one-hot encoded) features respectively.

## Ridge for robustness.

In [19]:
X = X0_train
y = y0_train

pipe = Pipeline([('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown='ignore'), categorical)], 
                                            remainder='passthrough')),
                 ('scaler', StandardScaler()),
                 ('selector', SelectFromModel(Lasso(alpha=0.006, max_iter=10000))),
                 ('ridge', Ridge())])


param_grid = {'ridge__alpha':[0.001, 0.1, 1, 10]}

cv = KFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(pipe, param_grid, scoring='r2', cv=cv, n_jobs=-1)
grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)

{'ridge__alpha': 1}
0.916187390828459


In [20]:
X = X0_train
y = y0_train

pipe = Pipeline([('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown='ignore'), categorical)], 
                                            remainder='passthrough')),
                 ('scaler', StandardScaler()),
                 ('selector', SelectFromModel(Lasso(alpha=0.006, max_iter=10000))),
                 ('ridge', Ridge())])

param_grid = {'ridge__alpha':np.linspace(0.1, 10, 1000)}

cv = KFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(pipe, param_grid, scoring='r2', cv=cv, n_jobs=-1)
grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)

{'ridge__alpha': 1.328828828828829}
0.9161880066975602


## Trying Lasso on Cluster 1

In [33]:
X = X1_train
y = y1_train

pipe = Pipeline([('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown='ignore'), categorical)], 
                                            remainder='passthrough')),
                 ('scaler', StandardScaler()),
                 ('selector', SelectFromModel(Lasso(alpha=0.008, max_iter=10000))),
                 ('lasso', Lasso())])


param_grid = {'lasso__alpha':[0.001, 0.1, 1, 10]}

cv = KFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(pipe, param_grid, scoring='r2', cv=cv, n_jobs=-1)
grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)

{'lasso__alpha': 0.001}
0.8535325152051705


In [35]:
X = X1_train
y = y1_train

pipe = Pipeline([('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown='ignore'), categorical)], 
                                            remainder='passthrough')),
                 ('scaler', StandardScaler()),
                 ('selector', SelectFromModel(Lasso(alpha=0.008, max_iter=10000))),
                 ('lasso', Lasso())])

param_grid = {'lasso__alpha':np.linspace(0.001, 0.1, 100)}

cv = KFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(pipe, param_grid, scoring='r2', cv=cv, n_jobs=-1)
grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)

{'lasso__alpha': 0.002}
0.8538154501322366


## Now putting it all together.

In [36]:
pipe = Pipeline([('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown='ignore'), categorical)], 
                                            remainder='passthrough')),
                 ('scaler', StandardScaler()),
                 ('selector', SelectFromModel(Lasso(alpha=0.006, max_iter=10000))),
                 ('ridge', Ridge(alpha=1.33))])

pipe.fit(X0_train, y0_train)

cluster0_train_predict = pipe.predict(X0_train)
cluster0_test_predict = pipe.predict(X0_test)

pipe = Pipeline([('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown='ignore'), categorical)], 
                                            remainder='passthrough')),
                 ('scaler', StandardScaler()),
                 ('selector', SelectFromModel(Lasso(alpha=0.008, max_iter=10000))),
                 ('lasso', Lasso(alpha=0.002))])

pipe.fit(X1_train, y1_train)

cluster1_train_predict = pipe.predict(X1_train)
cluster1_test_predict = pipe.predict(X1_test)

In [37]:
cluster0_train_predict = pd.DataFrame(cluster0_train_predict).rename(columns={0:'prediction'})
cluster1_train_predict = pd.DataFrame(cluster1_train_predict).rename(columns={0:'prediction'})
cluster0_test_predict = pd.DataFrame(cluster0_test_predict).rename(columns={0:'prediction'})
cluster1_test_predict = pd.DataFrame(cluster1_test_predict).rename(columns={0:'prediction'})

train_predict = pd.concat([cluster0_train_predict, cluster1_train_predict])
test_predict = pd.concat([cluster0_test_predict, cluster1_test_predict])
train_target = pd.concat([y0_train, y1_train])
test_target = pd.concat([y0_test, y1_test])

In [38]:
print(len(train_predict))
print(len(test_predict))
print(len(train_target))
print(len(test_target))

1871
624
1871
624


In [39]:
print(f'Train score is {r2_score(train_target, train_predict)}')
print(f'Test score is {r2_score(test_target, test_predict)}')

Train score is 0.951445140351519
Test score is 0.9139302135467522


In [40]:
print(f'Cluster 0 train score is {r2_score(y0_train, cluster0_train_predict)}')
print(f'Cluster 0 test score is {r2_score(y0_test, cluster0_test_predict)}')
print('\n')
print(f'Cluster 1 train score is {r2_score(y1_train, cluster1_train_predict)}')
print(f'Cluster 1 test score is {r2_score(y1_test, cluster1_test_predict)}')

Cluster 0 train score is 0.9447537600032996
Cluster 0 test score is 0.9276351864439013


Cluster 1 train score is 0.8867757331955866
Cluster 1 test score is 0.7564579169331025


In [28]:
def RSS(y_true, y_predict):
    y_true = np.array(y_true)
    y_predict = np.array(y_predict)
    return np.sum((y_true - y_predict)**2)

In [29]:
def TSS(y_true):
    y_true = np.array(y_true)
    return np.sum((y_true - np.mean(y_true))**2)

In [30]:
print(RSS(train_target, train_predict))
print(TSS(train_target))
print(RSS(test_target, test_predict))
print(TSS(test_target))
print(RSS(y0_train, cluster0_train_predict))
print(TSS(y0_train))
print(RSS(y1_train, cluster1_train_predict))
print(TSS(y1_train))
print(RSS(y0_test, cluster0_test_predict))
print(TSS(y0_test))
print(RSS(y1_test, cluster1_test_predict))
print(TSS(y1_test))

995299.9938631437
272.6363658517055
109054.76960286847
88.77449859589736
149602.0224633217
85.91293539289033
138003.10151038252
74.99676589766047
15219.931281359679
26.09529960518662
14955.100558760892
23.619821977309215


## Take a closer look at cluster features.

In [14]:
pipe = Pipeline(
        [
            ('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), categorical)], 
                                                remainder='passthrough')),
            ('scaler', StandardScaler())
        ]
    )
    
X = pipe.fit_transform(X0_train)
y = y0_train

selector = SelectFromModel(Lasso(alpha=0.006, max_iter=10000))
selector.fit(X,y)
mask0 = selector.get_support()
feat_names = pipe.named_steps['transformer'].get_feature_names()
names0 = [name for name, boo in zip(feat_names, mask0) if boo]
names0

['Cat__x0_RH',
 'Cat__x3_HLS',
 'Cat__x5_CulDSac',
 'Cat__x5_FR2',
 'Cat__x6_Mod',
 'Cat__x7_CollgCr',
 'Cat__x7_GrnHill',
 'Cat__x7_NWAmes',
 'Cat__x7_NoRidge',
 'Cat__x7_NridgHt',
 'Cat__x7_SawyerW',
 'Cat__x7_Somerst',
 'Cat__x7_StoneBr',
 'Cat__x7_Timber',
 'Cat__x8_Feedr',
 'Cat__x8_Norm',
 'Cat__x10_1Fam',
 'Cat__x10_Twnhs',
 'Cat__x12_Flat',
 'Cat__x14_BrkFace',
 'Cat__x14_HdBoard',
 'Cat__x15_HdBoard',
 'Cat__x15_Wd Shng',
 'Cat__x17_PConc',
 'Cat__x21_N',
 'Cat__x24_Basment',
 'Cat__x27_New',
 'Cat__x29_30',
 'Cat__x29_60',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearRemodAdd',
 'MasVnrArea',
 'ExterQual',
 'ExterCond',
 'BsmtQual',
 'BsmtExposure',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'HeatingQC',
 '1stFlrSF',
 '2ndFlrSF',
 'BsmtFullBath',
 'HalfBath',
 'KitchenQual',
 'TotRmsAbvGrd',
 'Fireplaces',
 'FireplaceQu',
 'GarageCars',
 'GarageArea',
 'PavedDrive',
 'OpenPorchSF',
 'EnclosedPorch',
 'ScreenPorch',
 'PoolQC']

In [19]:
pipe = Pipeline(
        [
            ('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), categorical)], 
                                                remainder='passthrough')),
            ('scaler', StandardScaler())
        ]
    )
    
X = pipe.fit_transform(X1_train)
y = y1_train

selector = SelectFromModel(Lasso(alpha=0.006, max_iter=10000))
selector.fit(X,y)
mask1 = selector.get_support()
feat_names = pipe.named_steps['transformer'].get_feature_names()
names1 = [name for name, boo in zip(feat_names, mask1) if boo]
names1

['Cat__x0_C (all)',
 'Cat__x0_RL',
 'Cat__x2_IR2',
 'Cat__x3_Lvl',
 'Cat__x7_BrkSide',
 'Cat__x7_Edwards',
 'Cat__x7_Greens',
 'Cat__x7_MeadowV',
 'Cat__x7_Mitchel',
 'Cat__x8_Norm',
 'Cat__x10_1Fam',
 'Cat__x10_Twnhs',
 'Cat__x13_WdShngl',
 'Cat__x14_AsbShng',
 'Cat__x14_BrkFace',
 'Cat__x14_PreCast',
 'Cat__x15_AsbShng',
 'Cat__x15_CBlock',
 'Cat__x15_PreCast',
 'Cat__x17_BrkTil',
 'Cat__x17_PConc',
 'Cat__x18_ALQ',
 'Cat__x18_GLQ',
 'Cat__x18_LwQ',
 'Cat__x18_Unf',
 'Cat__x21_N',
 'Cat__x23_Maj2',
 'Cat__x23_Typ',
 'Cat__x24_Attchd',
 'Cat__x27_Con',
 'Cat__x28_Normal',
 'Cat__x29_30',
 'Cat__x29_160',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'ExterQual',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'HeatingQC',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'BsmtFullBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenQual',
 'TotRmsAbvGrd',
 'Fireplaces',
 'FireplaceQu',
 'GarageFinish',
 'GarageCars',
 'GarageArea

In [20]:
[name for name in names0 if name not in names1]

['Cat__x0_RH',
 'Cat__x3_HLS',
 'Cat__x5_CulDSac',
 'Cat__x5_FR2',
 'Cat__x6_Mod',
 'Cat__x7_CollgCr',
 'Cat__x7_GrnHill',
 'Cat__x7_NWAmes',
 'Cat__x7_NoRidge',
 'Cat__x7_NridgHt',
 'Cat__x7_SawyerW',
 'Cat__x7_Somerst',
 'Cat__x7_StoneBr',
 'Cat__x7_Timber',
 'Cat__x8_Feedr',
 'Cat__x12_Flat',
 'Cat__x14_HdBoard',
 'Cat__x15_HdBoard',
 'Cat__x15_Wd Shng',
 'Cat__x24_Basment',
 'Cat__x27_New',
 'Cat__x29_60',
 'LotFrontage',
 'MasVnrArea',
 'ExterCond',
 'OpenPorchSF',
 'EnclosedPorch',
 'PoolQC']

In [21]:
[name for name in names1 if name not in names0]

['Cat__x0_C (all)',
 'Cat__x0_RL',
 'Cat__x2_IR2',
 'Cat__x3_Lvl',
 'Cat__x7_BrkSide',
 'Cat__x7_Edwards',
 'Cat__x7_Greens',
 'Cat__x7_MeadowV',
 'Cat__x7_Mitchel',
 'Cat__x13_WdShngl',
 'Cat__x14_AsbShng',
 'Cat__x14_PreCast',
 'Cat__x15_AsbShng',
 'Cat__x15_CBlock',
 'Cat__x15_PreCast',
 'Cat__x17_BrkTil',
 'Cat__x18_ALQ',
 'Cat__x18_GLQ',
 'Cat__x18_LwQ',
 'Cat__x18_Unf',
 'Cat__x23_Maj2',
 'Cat__x23_Typ',
 'Cat__x24_Attchd',
 'Cat__x27_Con',
 'Cat__x28_Normal',
 'Cat__x29_160',
 'YearBuilt',
 'BsmtCond',
 'LowQualFinSF',
 'FullBath',
 'BedroomAbvGr',
 'GarageFinish',
 'WoodDeckSF']