# Clustering + Random Forests

In [54]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.style as style
style.use('fivethirtyeight')
import matplotlib.cm as cm
import helper

In [55]:
colors = ["#FF0B04", "#F1BE48",
           "#B9975B", "#8B5B29",
           "#524727",
         ]
sns.set_palette(sns.color_palette(colors))

In [56]:
df = pd.read_csv('Ames_Housing_Price_Data.csv', 
                             index_col=0,low_memory = False)

In [57]:
train, test = helper.data_processing_wrapper(df, num_to_cat_list=[], remove_PID=False)

In [58]:
train['LogSalePrice'] = np.log(train['SalePrice'])

In [59]:
test['LogSalePrice'] = np.log(test['SalePrice'])

In [60]:
nhds = train.loc[:,['Neighborhood', 'LogSalePrice', 'GrLivArea']]

In [61]:
nhds = nhds.groupby('Neighborhood').agg(
    Sqft_med=pd.NamedAgg('GrLivArea',np.median),
    LogPrice_med=pd.NamedAgg('LogSalePrice',np.median)
).fillna(0)

In [62]:
weights = train.groupby('Neighborhood').count().apply(lambda x: x['PID']/len(train) ,axis=1).to_list()

In [63]:
scaler = StandardScaler()
_ = scaler.fit_transform(nhds)
clusterer = KMeans(n_clusters=2, random_state=42)
cluster_labels = clusterer.fit_predict(_, sample_weight=weights)
nhds['Cluster'] = cluster_labels

In [64]:
cluster_dict = nhds['Cluster'].to_dict()

In [65]:
train['NhdCluster'] = train.apply(lambda x: cluster_dict[x['Neighborhood']], axis=1)

In [66]:
test['NhdCluster'] = test.apply(lambda x: cluster_dict[x['Neighborhood']], axis=1)

In [67]:
comp_dict = train.groupby(['Neighborhood', 'BedroomAbvGr', 'BldgType',
               'OverallQual', 'FullBath', 'KitchenQual', 'GarageCars']).mean()['LogSalePrice'].to_dict()

In [68]:
train['Comp'] = train.apply(lambda x: comp_dict[(x['Neighborhood'], x['BedroomAbvGr'], x['BldgType'],
               x['OverallQual'], x['FullBath'], x['KitchenQual'], x['GarageCars'])], axis=1)

In [69]:
alt_dict = train.groupby('Neighborhood').mean()['LogSalePrice'].to_dict()

In [70]:
def test_comp(x):
    if (x['Neighborhood'], x['BedroomAbvGr'], x['BldgType'],
               x['OverallQual'], x['FullBath'], x['KitchenQual'], x['GarageCars']) in comp_dict.keys():
        return comp_dict[(x['Neighborhood'], x['BedroomAbvGr'], x['BldgType'],
               x['OverallQual'], x['FullBath'], x['KitchenQual'], x['GarageCars'])]
    else:
        return alt_dict[x['Neighborhood']]    

In [71]:
test['Comp'] = test.apply(lambda x: test_comp(x), axis=1)

In [72]:
X0_train = train.loc[train['NhdCluster']==0,:].drop(['SalePrice', 'LogSalePrice', 'PID', 'TotalBsmtSF', 'GrLivArea', 'NhdCluster'], axis=1)
y0_train = train.loc[train['NhdCluster']==0, 'LogSalePrice']
X0_test = test.loc[test['NhdCluster']==0,:].drop(['SalePrice', 'LogSalePrice', 'PID', 'TotalBsmtSF', 'GrLivArea', 'NhdCluster'], axis=1)
y0_test = test.loc[test['NhdCluster']==0, 'LogSalePrice']

X1_train = train.loc[train['NhdCluster']==1,:].drop(['SalePrice', 'LogSalePrice', 'PID', 'TotalBsmtSF', 'GrLivArea', 'NhdCluster'], axis=1)
y1_train = train.loc[train['NhdCluster']==1, 'LogSalePrice']
X1_test = test.loc[test['NhdCluster']==1,:].drop(['SalePrice', 'LogSalePrice', 'PID', 'TotalBsmtSF', 'GrLivArea', 'NhdCluster'], axis=1)
y1_test = test.loc[test['NhdCluster']==1, 'LogSalePrice']

In [73]:
print(X0_train.shape)
print(X1_train.shape)
print('\n')
print(y0_train.shape)
print(y1_train.shape)
print('\n')
print(X0_test.shape)
print(X1_test.shape)
print('\n')
print(y0_test.shape)
print(y1_test.shape)

(896, 78)
(975, 78)


(896,)
(975,)


(298, 78)
(326, 78)


(298,)
(326,)


# Preprocessing

In [74]:
cat_feats = X0_train.select_dtypes(['object']).columns.to_list()
num_feats = X0_train.select_dtypes(['int','float']).columns.to_list()

In [75]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(transformers=[
    ('tf1',OneHotEncoder(sparse=False, handle_unknown='ignore'), cat_feats)],remainder='passthrough')
     #('tf1',OneHotEncoder(sparse=False, drop='first'), cat_feats)],remainder='passthrough')


In [None]:
#Cluster 0 preprocessing
X0_train_transformed = preprocessor.fit_transform(X0_train)    

In [76]:
columns_transformed = preprocessor.named_transformers_['tf1'].get_feature_names(input_features= cat_feats)
new_columns = list(columns_transformed)+num_feats

In [77]:
X0_train = pd.DataFrame(X0_train_transformed,columns=new_columns)

In [78]:
X0_test_transformed = preprocessor.transform(X0_test)
X0_test = pd.DataFrame(X0_test_transformed,columns=new_columns)

In [80]:
X0_train.shape

(896, 208)

In [83]:
#Cluster 1 preprocessing
X1_train_transformed = preprocessor.fit_transform(X1_train)
columns_transformed = preprocessor.named_transformers_['tf1'].get_feature_names(input_features= cat_feats)
new_columns = list(columns_transformed)+num_feats

In [84]:
X1_train = pd.DataFrame(X1_train_transformed,columns=new_columns)

In [86]:
X1_train.shape

(975, 225)

In [87]:
X1_test_transformed = preprocessor.transform(X1_test)
X1_test = pd.DataFrame(X1_test_transformed,columns=new_columns)

In [88]:
X1_test.shape

(326, 225)

# No Tuning 

In [122]:
#CLUSTER 0 RF 
from sklearn.ensemble import RandomForestRegressor
rf_c0 = RandomForestRegressor(random_state=0)
rf_c0.fit(X0_train,y0_train)
print('Cross Val score: ', cross_val_score(rf_c0, X0_train, y0_train, cv=3))
print('Cross Val score mean: ', cross_val_score(rf_c0, X0_train, y0_train, cv=3).mean())
print('Train score: ',rf_c0.score(X0_train,y0_train))
print('Test score: ',rf_c0.score(X0_test,y0_test))

Cross Val score:  [0.91135597 0.90686604 0.91201843]
Cross Val score mean:  0.9100801483304278
Train score:  0.9886767465140318
Test score:  0.7038484492433126


In [124]:
#CLUSTER 1 RF 
from sklearn.ensemble import RandomForestRegressor
rf_c1 = RandomForestRegressor(random_state=0)
rf_c1.fit(X1_train,y1_train)
print('Cross Val score: ', cross_val_score(rf_c1, X1_train, y1_train, cv=3))
print('Cross Val score mean: ', cross_val_score(rf_c1, X1_train, y1_train, cv=3).mean())
print('Train score: ',rf_c1.score(X1_train,y1_train))
print('Test score: ',rf_c1.score(X1_test,y1_test))

Cross Val score:  [0.89529721 0.88283896 0.91964325]
Cross Val score mean:  0.8992598054996055
Train score:  0.9871021983272925
Test score:  0.43275485814176873


# No tuning (using optimal parameters from yesterday (on full data set))

In [128]:
#CLUSTER 0 RF 
from sklearn.ensemble import RandomForestRegressor
randomForest_c0 = RandomForestRegressor(n_estimators=400, 
                                        min_samples_leaf = 1,
                                        min_samples_split=5,
                                        max_features=100,
                                        max_depth = 40,
                                        bootstrap=False,
                                        random_state=0)
randomForest_c0.fit(X0_train,y0_train)

RandomForestRegressor(bootstrap=False, max_depth=40, max_features=100,
                      min_samples_split=5, n_estimators=400, random_state=0)

In [129]:
print('Cross Val score: ', cross_val_score(randomForest_c0, X0_train, y0_train, cv=3))
print('Cross Val score mean: ', cross_val_score(randomForest_c0, X0_train, y0_train, cv=3).mean())
print('Train score: ',randomForest_c0.score(X0_train,y0_train))
print('Test score: ',randomForest_c0.score(X0_test,y0_test))

Cross Val score:  [0.91507643 0.91009053 0.909882  ]
Cross Val score mean:  0.9116829847745906
Train score:  0.9987439533469034
Test score:  0.7649293759157838


In [130]:
#CLUSTER 1 RF 
randomForest_c1 = RandomForestRegressor(n_estimators=400, 
                                        min_samples_leaf = 1,
                                        min_samples_split=5,
                                        max_features=100,
                                        max_depth = 40,
                                        bootstrap=False,
                                        random_state=0)
randomForest_c1.fit(X1_train,y1_train)

RandomForestRegressor(bootstrap=False, max_depth=40, max_features=100,
                      min_samples_split=5, n_estimators=400, random_state=0)

In [131]:
print('Cross Val score: ', cross_val_score(randomForest_c1, X1_train, y1_train, cv=3))
print('Cross Val score mean: ', cross_val_score(randomForest_c1, X1_train, y1_train, cv=3).mean())
print('Train score: ',randomForest_c1.score(X1_train,y1_train))
print('Test score: ',randomForest_c1.score(X1_test,y1_test))

Cross Val score:  [0.89704976 0.88281301 0.92022543]
Cross Val score mean:  0.9000294008049253
Train score:  0.9982284373505669
Test score:  0.5072849950882926


# No clusters - All data (note the test score is lower than yesterday's results)

In [150]:
X_train_all = train.drop(['SalePrice', 'LogSalePrice', 'PID', 'TotalBsmtSF', 'NhdCluster'], axis=1)
X_test_all =test.drop(['SalePrice', 'LogSalePrice', 'PID', 'TotalBsmtSF', 'NhdCluster'], axis=1)

y_train_all = train['LogSalePrice']
y_test_all = test['LogSalePrice']

In [151]:
cat_feats = X_train_all.select_dtypes(['object']).columns.to_list()
num_feats = X_train_all.select_dtypes(['int','float']).columns.to_list()

In [152]:
preprocessor = ColumnTransformer(transformers=[
    ('tf1',OneHotEncoder(sparse=False, handle_unknown='ignore'), cat_feats)],remainder='passthrough')

In [153]:
X_train_all = preprocessor.fit_transform(X_train_all)    


In [154]:
columns_transformed = preprocessor.named_transformers_['tf1'].get_feature_names(input_features= cat_feats)
new_columns = list(columns_transformed)+num_feats

In [155]:
X_train_all = pd.DataFrame(X_train_all,columns=new_columns)
X_test_all = preprocessor.transform(X_test_all)
X_test_all = pd.DataFrame(X_test_all,columns=new_columns)

In [160]:
#old hyperparameters
randomForest_all = RandomForestRegressor(n_estimators=400, 
                                        min_samples_leaf = 1,
                                        min_samples_split=5,
                                        max_features=100,
                                        max_depth = 40,
                                        bootstrap=False,
                                        random_state=0)
randomForest_all.fit(X_train_all,y_train_all)

RandomForestRegressor(bootstrap=False, max_depth=40, max_features=100,
                      min_samples_split=5, n_estimators=400, random_state=0)

In [161]:
print('Cross Val score: ', cross_val_score(randomForest_all, X_train_all, y_train_all, cv=3))
print('Cross Val score mean: ', cross_val_score(randomForest_all, X_train_all, y_train_all, cv=3).mean())
print('Train score: ',randomForest_all.score(X_train_all,y_train_all))
print('Test score: ',randomForest_all.score(X_test_all,y_test_all))

Cross Val score:  [0.95391799 0.94991247 0.95734066]
Cross Val score mean:  0.9537237055917892
Train score:  0.9993544122255908
Test score:  0.8220244146227278


In [183]:
#tuning hyperparameters again 
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, 
                               param_distributions = random_grid, 
                               n_iter = 300, 
                               cv = 3,
                               verbose=2, 
                               random_state=42, 
                               n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train_all, y_train_all)


Fitting 3 folds for each of 300 candidates, totalling 900 fits


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=300,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [20, 40, 60, 80, 100,
                                                      None],
                                        'max_features': ['auto', 'sqrt', 10, 50,
                                                         100, 150],
                                        'min_samples_leaf': [1, 10],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [100, 200, 400, 600,
                                                         1000]},
                   random_state=42, verbose=2)

In [184]:
best_rf = rf_random.best_estimator_
rf_random.best_params_

{'n_estimators': 400,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 150,
 'max_depth': 40,
 'bootstrap': True}

In [None]:
# {'n_estimators': 400,
#  'min_samples_split': 2,
#  'min_samples_leaf': 1,
#  'max_features': 150,
#  'max_depth': 40,
#  'bootstrap': True}

In [185]:
print('Cross Val score: ', cross_val_score(best_rf, X_train_all, y_train_all, cv=3))
print('Cross Val score mean: ', cross_val_score(best_rf, X_train_all, y_train_all, cv=3).mean())
print('Train score: ',best_rf.score(X_train_all,y_train_all))
print('Test score: ',best_rf.score(X_test_all,y_test_all))

Cross Val score:  [0.95454597 0.95029594 0.95802828]
Cross Val score mean:  0.9541598078236002
Train score:  0.9939164170010225
Test score:  0.794824385021973


In [None]:
importances = pd.Series(
    best_rf.feature_importances_,
    index = X1_train.columns
).sort_values(ascending=False)
importances[:25]

# Tuning for each cluster (using RandomizedSearchCV)

In [169]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [100,200,400,600,1000]
# Number of features to consider at every split
max_features = ['auto', 'sqrt',10,50,100,150]
# Maximum number of levels in tree
max_depth = list(range(20,101,20))
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 10]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}


In [188]:
random_grid

{'n_estimators': [100, 200, 400, 600, 1000],
 'max_features': ['auto', 'sqrt', 10, 50, 100, 150],
 'max_depth': [20, 40, 60, 80, 100, None],
 'min_samples_split': [2, 5, 10],
 'min_samples_leaf': [1, 10],
 'bootstrap': [True, False]}

In [189]:
#CLUSTER 0 - Tuning 
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random_0 = RandomizedSearchCV(estimator = rf, 
                               param_distributions = random_grid, 
                               n_iter = 300, 
                               cv = 3,
                               verbose=2, 
                               random_state=42, 
                               n_jobs = -1)
# Fit the random search model
rf_random_0.fit(X0_train, y0_train)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=300,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [20, 40, 60, 80, 100,
                                                      None],
                                        'max_features': ['auto', 'sqrt', 10, 50,
                                                         100, 150],
                                        'min_samples_leaf': [1, 10],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [100, 200, 400, 600,
                                                         1000]},
                   random_state=42, verbose=2)

In [190]:
best_rf_0 = rf_random_0.best_estimator_
rf_random_0.best_params_


{'n_estimators': 400,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 100,
 'max_depth': 60,
 'bootstrap': True}

In [None]:
# {'n_estimators': 400,
#  'min_samples_split': 2,
#  'min_samples_leaf': 1,
#  'max_features': 100,
#  'max_depth': 60,
#  'bootstrap': True}

In [191]:
# CLUSTER 0 RF Tuned 
print('Cross Val score: ', cross_val_score(best_rf_0, X0_train, y0_train, cv=3))
print('Cross Val score mean: ', cross_val_score(best_rf_0, X0_train, y0_train, cv=3).mean())
print('Train score: ',best_rf_0.score(X0_train,y0_train))
print('Test score: ',best_rf_0.score(X0_test,y0_test))

Cross Val score:  [0.92304873 0.90986668 0.90849926]
Cross Val score mean:  0.9130197371034696
Train score:  0.9895515899446137
Test score:  0.770188525880735


In [193]:
importances = pd.Series(
    best_rf_0.feature_importances_,
    index = X0_train.columns
).sort_values(ascending=False)
importances[:25]

Comp            0.568271
OverallQual     0.135660
GarageCars      0.038797
GarageArea      0.038508
1stFlrSF        0.035959
BsmtFinSF1      0.017290
FullBath        0.016184
MasVnrArea      0.015961
LotArea         0.012244
BsmtQual        0.012208
2ndFlrSF        0.012036
TotRmsAbvGrd    0.006915
KitchenQual     0.005216
OpenPorchSF     0.004949
BsmtUnfSF       0.004792
LotFrontage     0.004624
FireplaceQu     0.003523
YearBuilt       0.003365
YearRemodAdd    0.003306
WoodDeckSF      0.003243
MSZoning_RH     0.002849
Fireplaces      0.002700
GarageYrBlt     0.002625
ExterQual       0.002603
MoSold          0.002549
dtype: float64

In [180]:
#CLUSTER 1 - Tuning 
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random_1 = RandomizedSearchCV(estimator = rf, 
                               param_distributions = random_grid, 
                               n_iter = 300, 
                               cv = 3,
                               verbose=2, 
                               random_state=42, 
                               n_jobs = -1)
# Fit the random search model
rf_random_1.fit(X1_train, y1_train)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=300,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [20, 40, 60, 80, 100,
                                                      None],
                                        'max_features': ['auto', 'sqrt', 10, 50,
                                                         100, 150],
                                        'min_samples_leaf': [1, 10],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [100, 200, 400, 600,
                                                         1000]},
                   random_state=42, verbose=2)

In [181]:
best_rf_1 = rf_random_1.best_estimator_
rf_random_1.best_params_

{'n_estimators': 600,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 150,
 'max_depth': None,
 'bootstrap': True}

In [None]:
# {'n_estimators': 600,
#  'min_samples_split': 5,
#  'min_samples_leaf': 1,
#  'max_features': 150,
#  'max_depth': None,
#  'bootstrap': True}

In [182]:
# CLUSTER 1 RF Tuned 
print('Cross Val score: ', cross_val_score(best_rf_1, X1_train, y1_train, cv=3))
print('Cross Val score mean: ', cross_val_score(best_rf_1, X1_train, y1_train, cv=3).mean())
print('Train score: ',best_rf_1.score(X1_train,y1_train))
print('Test score: ',best_rf_1.score(X1_test,y1_test))

Cross Val score:  [0.89904027 0.88949657 0.92366882]
Cross Val score mean:  0.9037284485695353
Train score:  0.9820587344125724
Test score:  0.45283460285413246


In [186]:
importances = pd.Series(
    best_rf_1.feature_importances_,
    index = X1_train.columns
).sort_values(ascending=False)
importances[:25]

Comp             0.738687
OverallQual      0.081096
1stFlrSF         0.046685
GarageArea       0.017748
LotArea          0.008262
BsmtFinSF1       0.005886
OverallCond      0.005084
YearRemodAdd     0.004806
GarageCars       0.004368
2ndFlrSF         0.004291
FireplaceQu      0.003422
GarageYrBlt      0.003349
BsmtUnfSF        0.003339
LotFrontage      0.003205
MoSold           0.003114
TotRmsAbvGrd     0.003099
EnclosedPorch    0.003064
YearBuilt        0.002962
ExterQual        0.002400
WoodDeckSF       0.002211
MasVnrArea       0.002144
PavedDrive       0.001927
GarageCond       0.001810
BsmtCond         0.001746
OpenPorchSF      0.001734
dtype: float64