In [16]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from scipy import stats
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.style as style
style.use('fivethirtyeight')
import matplotlib.cm as cm
import helper

In [2]:
colors = ["#FF0B04", "#F1BE48",
           "#B9975B", "#8B5B29",
           "#524727",
         ]
sns.set_palette(sns.color_palette(colors))

In [3]:
df = pd.read_csv('Ames_Housing_Price_Data.csv', 
                             index_col=0,low_memory = False)

In [4]:
train, test = helper.data_processing_wrapper(df, num_to_cat_list=[], remove_PID=False)

In [5]:
train.head()

Unnamed: 0,PID,GrLivArea,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,905101330,1296,90000,90,RL,72.0,10791,Pave,0,Reg,...,0,0,0,,Shed,500,10,2006,WD,Normal
1,909451100,1229,137000,160,RM,24.0,1488,Pave,0,Reg,...,0,0,0,GdPrv,,0,10,2009,WD,Normal
2,527451450,948,89000,160,RM,21.0,1680,Pave,0,Reg,...,0,0,0,,,0,7,2006,WD,Normal
3,903232190,1040,123900,50,RM,52.0,6240,Pave,0,Reg,...,0,0,0,,,0,5,2010,WD,Normal
4,914452120,912,156000,85,RL,61.990202,7540,Pave,0,IR1,...,192,0,0,MnPrv,,0,6,2007,WD,Normal


In [6]:
categorical = ['MSZoning', 'MSSubClass','Street','Alley','LotShape','LandContour','LotConfig',
               'LandSlope','Neighborhood','Condition1','Condition2','BldgType','HouseStyle',
               'RoofStyle','Foundation', 'BsmtFinType1','BsmtFinType2','Heating','GarageType',
               'GarageFinish','PavedDrive','MiscFeature','SaleType','SaleCondition',
               'BedroomAbvGr', 'CentralAir', 'Utilities', 'RoofMatl', 'Exterior1st',
               'Exterior2nd', 'MasVnrType', 'Electrical', 'Functional', 'Fence','KitchenAbvGr',
               'MoSold','YrSold'
              ]

In [7]:
[col for col in train.select_dtypes(['object','bool']).columns.to_list() if col not in categorical]

[]

In [8]:
cols_na = train.loc[:,train.isna().any(axis=0)].columns.to_list()
cols_na

[]

In [10]:
train['LogSalePrice'] = np.log(train['SalePrice'])

In [18]:
nhds = train.groupby('Neighborhood').median()[['LogSalePrice']]
nhds['LogSalePrice'] = stats.zscore(nhds['LogSalePrice'])

In [25]:
def segment(y):
    if round(y,2) < -0.75:
        return 0
    elif (round(y,2) >= -0.75) and (round(y,2) < 0.75):
        return 1
    else:
        return 2

In [30]:
nhds['Segment'] = nhds.apply(lambda x: segment(x['LogSalePrice']),axis = 1)
nhds.sort_values('LogSalePrice')
seg_dict = nhds.drop('LogSalePrice', axis=1).to_dict()['Segment']

In [31]:
train['Segment'] = train.apply(lambda x: seg_dict[x['Neighborhood']], axis=1)

In [33]:
X_train = train.drop(['SalePrice', 'PID', 'TotalBsmtSF', 'GrLivArea', 'LogSalePrice'], axis=1)
y_train = train.loc[:, ['LogSalePrice', 'Segment']]

In [35]:
def stacked_selector(X, y, selector_params):
    
    score_dict = {'score_0':0, 'score_1':0, 'score_2':0}
    select_dict = {'select_0':[], 'select_1':[], 'select_2':[]}
    
    for j in range(3):
              
            X_levj = pd.DataFrame(X.loc[X['Segment']==j, :].drop('Segment', axis=1))
            y_levj = y.loc[y['Segment']==j, :].drop('Segment', axis=1)

            transformer = ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), categorical)], 
                                            remainder='passthrough')
            X_levj = transformer.fit_transform(X_levj)
            scaler = StandardScaler(with_mean=False)
            X_levj = scaler.fit_transform(X_levj)
                         
            lasso = Lasso(alpha=selector_params[j])
            selector = SelectFromModel(estimator=lasso)

            X_levj = selector.fit_transform(X_levj, y_levj)

            ols = LinearRegression()
            ols.fit(X_levj,y_levj)
            if ols.score(X_levj,y_levj) > score_dict[f'score_{j}']:
                score_dict[f'score_{j}'] = ols.score(X_levj,y_levj)
                mask = selector.get_support()
                feat_names = transformer.get_feature_names()
                select_dict[f'select_{j}'] = [name for name, boo in zip(feat_names, mask) if boo]

    return score_dict, select_dict

In [38]:
selections = stacked_selector(X_train, y_train, [0.01, 0.01, 0.01])
selections

({'score_0': 0.9055574207191642,
  'score_1': 0.9111792398515649,
  'score_2': 0.9411753647092863},
 {'select_0': ['Cat__x0_C (all)',
   'Cat__x0_RL',
   'Cat__x1_30',
   'Cat__x1_50',
   'Cat__x1_160',
   'Cat__x1_190',
   'Cat__x3_2',
   'Cat__x4_IR2',
   'Cat__x4_IR3',
   'Cat__x8_BrkSide',
   'Cat__x8_MeadowV',
   'Cat__x9_Feedr',
   'Cat__x9_Norm',
   'Cat__x11_1Fam',
   'Cat__x11_Twnhs',
   'Cat__x14_BrkTil',
   'Cat__x14_PConc',
   'Cat__x15_BLQ',
   'Cat__x19_3',
   'Cat__x20_0',
   'Cat__x20_2',
   'Cat__x21_Othr',
   'Cat__x22_ConLI',
   'Cat__x23_Normal',
   'Cat__x24_1',
   'Cat__x24_4',
   'Cat__x24_6',
   'Cat__x25_N',
   'Cat__x27_CompShg',
   'Cat__x28_AsbShng',
   'Cat__x28_BrkFace',
   'Cat__x28_PreCast',
   'Cat__x32_Maj2',
   'Cat__x32_Min2',
   'Cat__x32_Typ',
   'Cat__x35_3',
   'Cat__x35_11',
   'LotArea',
   'OverallQual',
   'OverallCond',
   'YearBuilt',
   'YearRemodAdd',
   'ExterQual',
   'BsmtQual',
   'BsmtExposure',
   'BsmtFinSF1',
   'HeatingQC',
   '1

In [21]:
selections

({'score_0': 0.9041310088000026,
  'score_1': 0.9166570198150846,
  'score_2': 0.8978075326008347},
 {'select_0': ['Cat__x0_RH',
   'Cat__x7_Gtl',
   'Cat__x22_New',
   'Cat__x25_N',
   'LotArea',
   'OverallQual',
   'YearRemodAdd',
   'ExterQual',
   'BsmtQual',
   'BsmtExposure',
   'BsmtFinSF1',
   'HeatingQC',
   '1stFlrSF',
   '2ndFlrSF',
   'HalfBath',
   'KitchenQual',
   'TotRmsAbvGrd',
   'Fireplaces',
   'FireplaceQu',
   'GarageCars',
   'GarageArea'],
  'select_1': ['Cat__x0_A (agr)',
   'Cat__x0_C (all)',
   'Cat__x0_I (all)',
   'Cat__x0_RH',
   'Cat__x0_RL',
   'Cat__x1_20',
   'Cat__x1_30',
   'Cat__x1_40',
   'Cat__x1_60',
   'Cat__x1_70',
   'Cat__x1_90',
   'Cat__x1_160',
   'Cat__x2_Grvl',
   'Cat__x3_1',
   'Cat__x3_2',
   'Cat__x4_IR2',
   'Cat__x4_IR3',
   'Cat__x5_HLS',
   'Cat__x5_Lvl',
   'Cat__x6_FR3',
   'Cat__x7_Sev',
   'Cat__x8_Blueste',
   'Cat__x8_BrDale',
   'Cat__x8_BrkSide',
   'Cat__x8_Edwards',
   'Cat__x8_IDOTRR',
   'Cat__x8_MeadowV',
   'Cat__x

In [22]:
X_train['NhdCluster'].value_counts()

1    969
0    852
2     50
Name: NhdCluster, dtype: int64

In [23]:
train['LogSalePrice'] = np.log(train['SalePrice'])

In [24]:
comp_dict = train.groupby(['Neighborhood', 'BedroomAbvGr', 'BldgType',
               'OverallQual', 'FullBath', 'KitchenQual', 'GarageCars']).mean()['LogSalePrice'].to_dict()

In [25]:
train['Comp'] = train.apply(lambda x: comp_dict[(x['Neighborhood'], x['BedroomAbvGr'], x['BldgType'],
               x['OverallQual'], x['FullBath'], x['KitchenQual'], x['GarageCars'])],axis=1)

In [26]:
X_train = train.drop(['SalePrice', 'LogSalePrice', 'PID', 'TotalBsmtSF', 'GrLivArea'], axis=1)
y_train = pd.DataFrame({'LogSalePrice':train['LogSalePrice'], 'NhdCluster':train['NhdCluster']})

In [27]:
selections = multilev_selector(X_train, y_train, [0.01, 0.01, 0.02])

In [28]:
selections[1]['select_0']

['Cat__x1_60',
 'Cat__x22_New',
 'OverallQual',
 'YearRemodAdd',
 'ExterQual',
 'BsmtFinSF1',
 'HeatingQC',
 '1stFlrSF',
 '2ndFlrSF',
 'BsmtFullBath',
 'HalfBath',
 'TotRmsAbvGrd',
 'Fireplaces',
 'FireplaceQu',
 'OpenPorchSF',
 'Comp']

In [29]:
dict(enumerate(categorical))

{0: 'MSZoning',
 1: 'MSSubClass',
 2: 'Street',
 3: 'Alley',
 4: 'LotShape',
 5: 'LandContour',
 6: 'LotConfig',
 7: 'LandSlope',
 8: 'Neighborhood',
 9: 'Condition1',
 10: 'Condition2',
 11: 'BldgType',
 12: 'HouseStyle',
 13: 'RoofStyle',
 14: 'Foundation',
 15: 'BsmtFinType1',
 16: 'BsmtFinType2',
 17: 'Heating',
 18: 'GarageType',
 19: 'GarageFinish',
 20: 'PavedDrive',
 21: 'MiscFeature',
 22: 'SaleType',
 23: 'SaleCondition',
 24: 'BedroomAbvGr',
 25: 'CentralAir',
 26: 'Utilities',
 27: 'RoofMatl',
 28: 'Exterior1st',
 29: 'Exterior2nd',
 30: 'MasVnrType',
 31: 'Electrical',
 32: 'Functional',
 33: 'Fence',
 34: 'KitchenAbvGr',
 35: 'MoSold',
 36: 'YrSold'}

In [30]:
select_0 = ['MSSubClass',
 'SaleType',
 'OverallQual',
 'YearRemodAdd',
 'ExterQual',
 'BsmtFinSF1',
 'HeatingQC',
 '1stFlrSF',
 '2ndFlrSF',
 'BsmtFullBath',
 'HalfBath',
 'TotRmsAbvGrd',
 'Fireplaces',
 'FireplaceQu',
 'OpenPorchSF',
 'Comp']

In [31]:
cats_0 = [col for col in select_0 if col in categorical]

In [32]:
X = X_train
X = X.loc[X['NhdCluster']==0, :].drop('NhdCluster', axis=1)
X = X[select_0]

y = y_train
y = y.loc[y['NhdCluster']==0, :].drop('NhdCluster', axis=1)


pipe = Pipeline([('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown='ignore'), cats_0)], 
                                            remainder='passthrough')),
                 ('scaler', StandardScaler()),
                 ('ridge', Ridge())])

param_grid = {'ridge__alpha':[0.001, 0.1, 1, 10]}
cv = RepeatedKFold(n_splits=5)

grid = GridSearchCV(pipe, param_grid, scoring='r2', cv=cv, n_jobs=-1)

grid.fit(X,y)

GridSearchCV(cv=RepeatedKFold(n_repeats=10, n_splits=5, random_state=None),
             estimator=Pipeline(steps=[('transformer',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('Cat',
                                                                         OneHotEncoder(handle_unknown='ignore'),
                                                                         ['MSSubClass',
                                                                          'SaleType'])])),
                                       ('scaler', StandardScaler()),
                                       ('ridge', Ridge())]),
             n_jobs=-1, param_grid={'ridge__alpha': [0.001, 0.1, 1, 10]},
             scoring='r2')

In [33]:
print(grid.cv_results_['mean_test_score'])
print(grid.best_params_)
print(grid.best_score_)

[0.93834824 0.9383504  0.93836607 0.93820107]
{'ridge__alpha': 1}
0.9383660677864154


In [34]:
selections[1]['select_1']

['Cat__x1_30',
 'Cat__x15_None',
 'OverallCond',
 'YearRemodAdd',
 'BsmtQual',
 'BsmtExposure',
 'BsmtFinSF1',
 '1stFlrSF',
 'HalfBath',
 'TotRmsAbvGrd',
 'Fireplaces',
 'WoodDeckSF',
 'ScreenPorch',
 'Comp']

In [35]:
dict(enumerate(categorical))

{0: 'MSZoning',
 1: 'MSSubClass',
 2: 'Street',
 3: 'Alley',
 4: 'LotShape',
 5: 'LandContour',
 6: 'LotConfig',
 7: 'LandSlope',
 8: 'Neighborhood',
 9: 'Condition1',
 10: 'Condition2',
 11: 'BldgType',
 12: 'HouseStyle',
 13: 'RoofStyle',
 14: 'Foundation',
 15: 'BsmtFinType1',
 16: 'BsmtFinType2',
 17: 'Heating',
 18: 'GarageType',
 19: 'GarageFinish',
 20: 'PavedDrive',
 21: 'MiscFeature',
 22: 'SaleType',
 23: 'SaleCondition',
 24: 'BedroomAbvGr',
 25: 'CentralAir',
 26: 'Utilities',
 27: 'RoofMatl',
 28: 'Exterior1st',
 29: 'Exterior2nd',
 30: 'MasVnrType',
 31: 'Electrical',
 32: 'Functional',
 33: 'Fence',
 34: 'KitchenAbvGr',
 35: 'MoSold',
 36: 'YrSold'}

In [36]:
select_1 = ['MSSubClass',
 'BsmtFinType1',
 'OverallCond',
 'YearRemodAdd',
 'BsmtQual',
 'BsmtExposure',
 'BsmtFinSF1',
 '1stFlrSF',
 'HalfBath',
 'TotRmsAbvGrd',
 'Fireplaces',
 'WoodDeckSF',
 'ScreenPorch',
 'Comp']

In [37]:
cats_1 = [col for col in select_1 if col in categorical]

In [38]:
X = X_train
X = X.loc[X['NhdCluster']==1, :].drop('NhdCluster', axis=1)
X = X[select_1]

y = y_train
y = y.loc[y['NhdCluster']==1, :].drop('NhdCluster', axis=1)


pipe = Pipeline([('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown='ignore'), cats_1)], 
                                            remainder='passthrough')),
                 ('scaler', StandardScaler()),
                 ('ridge', Ridge())])

param_grid = {'ridge__alpha':[0.001, 0.1, 1, 10]}
cv = RepeatedKFold(n_splits=5)

grid = GridSearchCV(pipe, param_grid, scoring='r2', cv=cv, n_jobs=-1)

grid.fit(X,y)

GridSearchCV(cv=RepeatedKFold(n_repeats=10, n_splits=5, random_state=None),
             estimator=Pipeline(steps=[('transformer',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('Cat',
                                                                         OneHotEncoder(handle_unknown='ignore'),
                                                                         ['MSSubClass',
                                                                          'BsmtFinType1'])])),
                                       ('scaler', StandardScaler()),
                                       ('ridge', Ridge())]),
             n_jobs=-1, param_grid={'ridge__alpha': [0.001, 0.1, 1, 10]},
             scoring='r2')

In [39]:
print(grid.cv_results_['mean_test_score'])
print(grid.best_params_)
print(grid.best_score_)

[0.92405171 0.92405249 0.92405724 0.92389174]
{'ridge__alpha': 1}
0.924057236954064


In [40]:
selections[1]['select_2']

['Cat__x15_Unf',
 'LotArea',
 'MasVnrArea',
 'BsmtFinSF1',
 '1stFlrSF',
 '2ndFlrSF',
 'TotRmsAbvGrd',
 'GarageArea',
 'Comp']

In [41]:
dict(enumerate(categorical))

{0: 'MSZoning',
 1: 'MSSubClass',
 2: 'Street',
 3: 'Alley',
 4: 'LotShape',
 5: 'LandContour',
 6: 'LotConfig',
 7: 'LandSlope',
 8: 'Neighborhood',
 9: 'Condition1',
 10: 'Condition2',
 11: 'BldgType',
 12: 'HouseStyle',
 13: 'RoofStyle',
 14: 'Foundation',
 15: 'BsmtFinType1',
 16: 'BsmtFinType2',
 17: 'Heating',
 18: 'GarageType',
 19: 'GarageFinish',
 20: 'PavedDrive',
 21: 'MiscFeature',
 22: 'SaleType',
 23: 'SaleCondition',
 24: 'BedroomAbvGr',
 25: 'CentralAir',
 26: 'Utilities',
 27: 'RoofMatl',
 28: 'Exterior1st',
 29: 'Exterior2nd',
 30: 'MasVnrType',
 31: 'Electrical',
 32: 'Functional',
 33: 'Fence',
 34: 'KitchenAbvGr',
 35: 'MoSold',
 36: 'YrSold'}

In [42]:
select_2 = ['BsmtFinType1',
 'LotArea',
 'MasVnrArea',
 'BsmtFinSF1',
 '1stFlrSF',
 '2ndFlrSF',
 'TotRmsAbvGrd',
 'GarageArea',
 'Comp']

In [43]:
cats_2 = [col for col in select_2 if col in categorical]

In [44]:
X = X_train
X = X.loc[X['NhdCluster']==2, :].drop('NhdCluster', axis=1)
X = X[select_1]

y = y_train
y = y.loc[y['NhdCluster']==2, :].drop('NhdCluster', axis=1)


pipe = Pipeline([('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown='ignore'), cats_2)], 
                                            remainder='passthrough')),
                 ('scaler', StandardScaler()),
                 ('ridge', Ridge())])

param_grid = {'ridge__alpha':[0.001, 0.1, 1, 10]}
cv = RepeatedKFold(n_splits=5)

grid = GridSearchCV(pipe, param_grid, scoring='r2', cv=cv, n_jobs=-1)

grid.fit(X,y)

GridSearchCV(cv=RepeatedKFold(n_repeats=10, n_splits=5, random_state=None),
             estimator=Pipeline(steps=[('transformer',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('Cat',
                                                                         OneHotEncoder(handle_unknown='ignore'),
                                                                         ['BsmtFinType1'])])),
                                       ('scaler', StandardScaler()),
                                       ('ridge', Ridge())]),
             n_jobs=-1, param_grid={'ridge__alpha': [0.001, 0.1, 1, 10]},
             scoring='r2')

In [45]:
print(grid.cv_results_['mean_test_score'])
print(grid.best_params_)
print(grid.best_score_)

[0.70917849 0.71308191 0.73281402 0.72374198]
{'ridge__alpha': 1}
0.7328140180714782


# Below we start again with only two clusters :(

In [46]:
train, test = helper.data_processing_wrapper(df, num_to_cat_list=[], remove_PID=False)

In [47]:
nhds = train.loc[:,['Neighborhood', 'GrLivArea']]
nhds['LogSalePrice'] = np.log(train['SalePrice'])

In [48]:
nhds = nhds.groupby('Neighborhood').agg(
    Sqft_med=pd.NamedAgg('GrLivArea',np.median),
    LogPrice_med=pd.NamedAgg('LogSalePrice',np.median)
).fillna(0)

In [49]:
weights = train.groupby('Neighborhood').count().apply(lambda x: x['PID']/len(train) ,axis=1).to_list()

In [50]:
X = nhds
scaler = StandardScaler()
X = scaler.fit_transform(X)

clusterer = KMeans(n_clusters=2, random_state=42)
cluster_labels = clusterer.fit_predict(X)
nhds['Cluster'] = cluster_labels
nhds.sort_values('Cluster')

Unnamed: 0_level_0,Sqft_med,LogPrice_med,Cluster
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MeadowV,1092.0,11.449986,0
NAmes,1203.0,11.858285,0
Mitchel,1286.0,11.973188,0
OldTown,1378.0,11.695247,0
Landmrk,1320.0,11.827736,0
IDOTRR,1330.0,11.691908,0
SWISU,1516.0,11.816352,0
Greens,1226.0,12.248806,0
Sawyer,1040.0,11.81303,0
Edwards,1198.0,11.7464,0


In [51]:
cluster_dict = pd.DataFrame(data = {'Nhd_cluster':nhds.Cluster}, index=nhds.index).to_dict()['Nhd_cluster']

In [52]:
train['NhdCluster'] = train.apply(lambda x: cluster_dict[x['Neighborhood']], axis=1)

In [53]:
X_train = train.drop(['SalePrice', 'PID', 'TotalBsmtSF', 'GrLivArea'],axis=1)
y_train = pd.DataFrame({'LogSalePrice':np.log(train['SalePrice']), 'NhdCluster':train['NhdCluster']})

In [54]:
def multilev_selector_2(X, y, selector_params):
    
    score_dict = {'score_0':0, 'score_1':0}
    select_dict = {'select_0':[], 'select_1':[]}
    
    for j in range(2):
              
            X_levj = pd.DataFrame(X.loc[X['NhdCluster']==j, :].drop('NhdCluster', axis=1))
            y_levj = y.loc[y['NhdCluster']==j, :].drop('NhdCluster', axis=1)

            transformer = ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), categorical)], 
                                            remainder='passthrough')
            X_levj = transformer.fit_transform(X_levj)
            scaler = StandardScaler(with_mean=False)
            X_levj = scaler.fit_transform(X_levj)
                         
            lasso = Lasso(alpha=selector_params[j])
            selector = SelectFromModel(estimator=lasso)

            X_levj = selector.fit_transform(X_levj, y_levj)

            ols = LinearRegression()
            ols.fit(X_levj,y_levj)
            if ols.score(X_levj,y_levj) > score_dict[f'score_{j}']:
                score_dict[f'score_{j}'] = ols.score(X_levj,y_levj)
                mask = selector.get_support()
                feat_names = transformer.get_feature_names()
                select_dict[f'select_{j}'] = [name for name, boo in zip(feat_names, mask) if boo]

    return score_dict, select_dict

In [55]:
X_train['NhdCluster'].value_counts()

0    975
1    896
Name: NhdCluster, dtype: int64

In [56]:
train['LogSalePrice'] = np.log(train['SalePrice'])

In [57]:
comp_dict = train.groupby(['Neighborhood', 'BedroomAbvGr', 'BldgType',
               'OverallQual', 'FullBath', 'KitchenQual', 'GarageCars']).mean()['LogSalePrice'].to_dict()

In [58]:
train['Comp'] = train.apply(lambda x: comp_dict[(x['Neighborhood'], x['BedroomAbvGr'], x['BldgType'],
               x['OverallQual'], x['FullBath'], x['KitchenQual'], x['GarageCars'])],axis=1)

In [59]:
X_train = train.drop(['SalePrice', 'LogSalePrice', 'PID', 'TotalBsmtSF', 'GrLivArea'], axis=1)
y_train = pd.DataFrame({'LogSalePrice':train['LogSalePrice'], 'NhdCluster':train['NhdCluster']})

In [60]:
selections = multilev_selector_2(X_train, y_train, [0.02, 0.009])
selections

({'score_0': 0.9144695812365655, 'score_1': 0.9487388452922789},
 {'select_0': ['BsmtFinSF1', '1stFlrSF', 'Fireplaces', 'Comp'],
  'select_1': ['Cat__x1_60',
   'Cat__x9_Feedr',
   'Cat__x22_New',
   'OverallQual',
   'YearRemodAdd',
   'MasVnrArea',
   'ExterQual',
   'BsmtFinSF1',
   'HeatingQC',
   '1stFlrSF',
   '2ndFlrSF',
   'BsmtFullBath',
   'HalfBath',
   'TotRmsAbvGrd',
   'Fireplaces',
   'FireplaceQu',
   'GarageArea',
   'OpenPorchSF',
   'Comp']})

In [61]:
select_0 = selections[1]['select_0']
select_0

['BsmtFinSF1', '1stFlrSF', 'Fireplaces', 'Comp']

In [62]:
cats_0 = [col for col in select_0 if col in categorical]

In [63]:
X = X_train
X = X.loc[X['NhdCluster']==0, :].drop('NhdCluster', axis=1)
X = X[select_0]

y = y_train
y = y.loc[y['NhdCluster']==0, :].drop('NhdCluster', axis=1)


pipe = Pipeline([('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown='ignore'), cats_0)], 
                                            remainder='passthrough')),
                 ('scaler', StandardScaler()),
                 ('ridge', Ridge())])

param_grid = {'ridge__alpha':[0.001, 0.1, 1, 10]}
cv = RepeatedKFold(n_splits=5)

grid = GridSearchCV(pipe, param_grid, scoring='r2', cv=cv, n_jobs=-1)

grid.fit(X,y)

GridSearchCV(cv=RepeatedKFold(n_repeats=10, n_splits=5, random_state=None),
             estimator=Pipeline(steps=[('transformer',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('Cat',
                                                                         OneHotEncoder(handle_unknown='ignore'),
                                                                         [])])),
                                       ('scaler', StandardScaler()),
                                       ('ridge', Ridge())]),
             n_jobs=-1, param_grid={'ridge__alpha': [0.001, 0.1, 1, 10]},
             scoring='r2')

In [64]:
print(grid.cv_results_['mean_test_score'])
print(grid.best_params_)
print(grid.best_score_)

[0.91178927 0.9117895  0.91178976 0.91161645]
{'ridge__alpha': 1}
0.9117897563749553


In [65]:
selections[1]['select_1']

['Cat__x1_60',
 'Cat__x9_Feedr',
 'Cat__x22_New',
 'OverallQual',
 'YearRemodAdd',
 'MasVnrArea',
 'ExterQual',
 'BsmtFinSF1',
 'HeatingQC',
 '1stFlrSF',
 '2ndFlrSF',
 'BsmtFullBath',
 'HalfBath',
 'TotRmsAbvGrd',
 'Fireplaces',
 'FireplaceQu',
 'GarageArea',
 'OpenPorchSF',
 'Comp']

In [66]:
dict(enumerate(categorical))

{0: 'MSZoning',
 1: 'MSSubClass',
 2: 'Street',
 3: 'Alley',
 4: 'LotShape',
 5: 'LandContour',
 6: 'LotConfig',
 7: 'LandSlope',
 8: 'Neighborhood',
 9: 'Condition1',
 10: 'Condition2',
 11: 'BldgType',
 12: 'HouseStyle',
 13: 'RoofStyle',
 14: 'Foundation',
 15: 'BsmtFinType1',
 16: 'BsmtFinType2',
 17: 'Heating',
 18: 'GarageType',
 19: 'GarageFinish',
 20: 'PavedDrive',
 21: 'MiscFeature',
 22: 'SaleType',
 23: 'SaleCondition',
 24: 'BedroomAbvGr',
 25: 'CentralAir',
 26: 'Utilities',
 27: 'RoofMatl',
 28: 'Exterior1st',
 29: 'Exterior2nd',
 30: 'MasVnrType',
 31: 'Electrical',
 32: 'Functional',
 33: 'Fence',
 34: 'KitchenAbvGr',
 35: 'MoSold',
 36: 'YrSold'}

In [67]:
select_1 = ['MasVnrType',
 'BsmtFinType1',
 'OverallCond',
 'YearRemodAdd',
 'BsmtQual',
 'BsmtExposure',
 'BsmtFinSF1',
 '1stFlrSF',
 'HalfBath',
 'TotRmsAbvGrd',
 'Fireplaces',
 'WoodDeckSF',
 'ScreenPorch',
 'Comp']

In [68]:
cats_1 = [col for col in select_1 if col in categorical]

In [69]:
X = X_train
X = X.loc[X['NhdCluster']==1, :].drop('NhdCluster', axis=1)
X = X[select_1]

y = y_train
y = y.loc[y['NhdCluster']==1, :].drop('NhdCluster', axis=1)


pipe = Pipeline([('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown='ignore'), cats_1)], 
                                            remainder='passthrough')),
                 ('scaler', StandardScaler()),
                 ('ridge', Ridge())])

param_grid = {'ridge__alpha':[0.001, 0.1, 1, 10]}
cv = RepeatedKFold(n_splits=5)

grid = GridSearchCV(pipe, param_grid, scoring='r2', cv=cv, n_jobs=-1)

grid.fit(X,y)

GridSearchCV(cv=RepeatedKFold(n_repeats=10, n_splits=5, random_state=None),
             estimator=Pipeline(steps=[('transformer',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('Cat',
                                                                         OneHotEncoder(handle_unknown='ignore'),
                                                                         ['MasVnrType',
                                                                          'BsmtFinType1'])])),
                                       ('scaler', StandardScaler()),
                                       ('ridge', Ridge())]),
             n_jobs=-1, param_grid={'ridge__alpha': [0.001, 0.1, 1, 10]},
             scoring='r2')

In [70]:
print(grid.cv_results_['mean_test_score'])
print(grid.best_params_)
print(grid.best_score_)

[0.93541279 0.93541267 0.93540913 0.9351601 ]
{'ridge__alpha': 0.001}
0.9354127890041376


# Two clusters based on price only (moving Greens)

In [71]:
train, test = helper.data_processing_wrapper(df, num_to_cat_list=[], remove_PID=False)

In [72]:
nhds = train.loc[:,['Neighborhood', 'GrLivArea']]
nhds['LogSalePrice'] = np.log(train['SalePrice'])

In [73]:
nhds = nhds.groupby('Neighborhood').agg(
    Sqft_med=pd.NamedAgg('GrLivArea',np.median),
    LogPrice_med=pd.NamedAgg('LogSalePrice',np.median)
).fillna(0)

In [74]:
weights = train.groupby('Neighborhood').count().apply(lambda x: x['PID']/len(train) ,axis=1).to_list()

In [75]:
X = nhds
scaler = StandardScaler()
X = scaler.fit_transform(X)

clusterer = KMeans(n_clusters=2, random_state=42)
cluster_labels = clusterer.fit_predict(X)
nhds['Cluster'] = cluster_labels
nhds.sort_values('Cluster')

Unnamed: 0_level_0,Sqft_med,LogPrice_med,Cluster
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MeadowV,1092.0,11.449986,0
NAmes,1203.0,11.858285,0
Mitchel,1286.0,11.973188,0
OldTown,1378.0,11.695247,0
Landmrk,1320.0,11.827736,0
IDOTRR,1330.0,11.691908,0
SWISU,1516.0,11.816352,0
Greens,1226.0,12.248806,0
Sawyer,1040.0,11.81303,0
Edwards,1198.0,11.7464,0


In [80]:
scaler = StandardScaler()
nhds_scaled = nhds.sort_values('Cluster') 
nhds_scaled = scaler.fit_transform(nhds_scaled)
nhds_scaled = pd.DataFrame(nhds_scaled, index=nhds.sort_values('Cluster').index).rename(columns={0:'Sqft_z', 1:'Price_z', 2:'Cluster_z'})
nhds_scaled.sort_values('Price_z')

Unnamed: 0_level_0,Sqft_z,Price_z,Cluster_z
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MeadowV,-1.180127,-1.852867,-1.0
BrDale,-1.392581,-1.499827,-1.0
IDOTRR,-0.364575,-1.119626,-1.0
OldTown,-0.200094,-1.109506,-1.0
Blueste,-1.091033,-1.011346,-1.0
Edwards,-0.816898,-0.954467,-1.0
BrkSide,-0.799764,-0.949625,-1.0
Sawyer,-1.358315,-0.752519,-1.0
SWISU,0.272789,-0.742449,-1.0
Landmrk,-0.398842,-0.707946,-1.0


In [82]:
cluster_dict = pd.DataFrame(data = {'Nhd_cluster':nhds.Cluster}, index=nhds.index).to_dict()['Nhd_cluster']
cluster_dict['Greens'] = 1
cluster_dict

{'Blmngtn': 1,
 'Blueste': 0,
 'BrDale': 0,
 'BrkSide': 0,
 'ClearCr': 1,
 'CollgCr': 1,
 'Crawfor': 1,
 'Edwards': 0,
 'Gilbert': 1,
 'Greens': 1,
 'GrnHill': 1,
 'IDOTRR': 0,
 'Landmrk': 0,
 'MeadowV': 0,
 'Mitchel': 0,
 'NAmes': 0,
 'NPkVill': 0,
 'NWAmes': 1,
 'NoRidge': 1,
 'NridgHt': 1,
 'OldTown': 0,
 'SWISU': 0,
 'Sawyer': 0,
 'SawyerW': 1,
 'Somerst': 1,
 'StoneBr': 1,
 'Timber': 1,
 'Veenker': 1}

In [83]:
train['NhdCluster'] = train.apply(lambda x: cluster_dict[x['Neighborhood']], axis=1)

In [84]:
X_train = train.drop(['SalePrice', 'PID', 'TotalBsmtSF', 'GrLivArea'],axis=1)
y_train = pd.DataFrame({'LogSalePrice':np.log(train['SalePrice']), 'NhdCluster':train['NhdCluster']})

In [85]:
X_train['NhdCluster'].value_counts()

0    969
1    902
Name: NhdCluster, dtype: int64

In [86]:
train['LogSalePrice'] = np.log(train['SalePrice'])

In [87]:
comp_dict = train.groupby(['Neighborhood', 'BedroomAbvGr', 'BldgType',
               'OverallQual', 'FullBath', 'KitchenQual', 'GarageCars']).mean()['LogSalePrice'].to_dict()

In [88]:
train['Comp'] = train.apply(lambda x: comp_dict[(x['Neighborhood'], x['BedroomAbvGr'], x['BldgType'],
               x['OverallQual'], x['FullBath'], x['KitchenQual'], x['GarageCars'])],axis=1)

In [89]:
X_train = train.drop(['SalePrice', 'LogSalePrice', 'PID', 'TotalBsmtSF', 'GrLivArea'], axis=1)
y_train = pd.DataFrame({'LogSalePrice':train['LogSalePrice'], 'NhdCluster':train['NhdCluster']})

In [94]:
selections = multilev_selector_2(X_train, y_train, [0.007, 0.01])
selections

({'score_0': 0.9306472083554148, 'score_1': 0.9475777585703508},
 {'select_0': ['Cat__x1_30',
   'Cat__x15_None',
   'Cat__x28_BrkFace',
   'OverallCond',
   'YearRemodAdd',
   'BsmtQual',
   'BsmtExposure',
   'BsmtFinSF1',
   '1stFlrSF',
   '2ndFlrSF',
   'BsmtFullBath',
   'HalfBath',
   'TotRmsAbvGrd',
   'Fireplaces',
   'FireplaceQu',
   'GarageArea',
   'WoodDeckSF',
   'ScreenPorch',
   'Comp'],
  'select_1': ['Cat__x1_60',
   'Cat__x22_New',
   'OverallQual',
   'YearRemodAdd',
   'MasVnrArea',
   'ExterQual',
   'BsmtFinSF1',
   'HeatingQC',
   '1stFlrSF',
   '2ndFlrSF',
   'BsmtFullBath',
   'HalfBath',
   'TotRmsAbvGrd',
   'Fireplaces',
   'FireplaceQu',
   'GarageArea',
   'OpenPorchSF',
   'Comp']})

In [95]:
select_0 = selections[1]['select_0']
select_0

['Cat__x1_30',
 'Cat__x15_None',
 'Cat__x28_BrkFace',
 'OverallCond',
 'YearRemodAdd',
 'BsmtQual',
 'BsmtExposure',
 'BsmtFinSF1',
 '1stFlrSF',
 '2ndFlrSF',
 'BsmtFullBath',
 'HalfBath',
 'TotRmsAbvGrd',
 'Fireplaces',
 'FireplaceQu',
 'GarageArea',
 'WoodDeckSF',
 'ScreenPorch',
 'Comp']

In [96]:
select_0 = ['MSSubClass',
 'BsmtFinType1',
 'Exterior1st',
 'OverallCond',
 'YearRemodAdd',
 'BsmtQual',
 'BsmtExposure',
 'BsmtFinSF1',
 '1stFlrSF',
 '2ndFlrSF',
 'BsmtFullBath',
 'HalfBath',
 'TotRmsAbvGrd',
 'Fireplaces',
 'FireplaceQu',
 'GarageArea',
 'WoodDeckSF',
 'ScreenPorch',
 'Comp']

In [66]:
dict(enumerate(categorical))

{0: 'MSZoning',
 1: 'MSSubClass',
 2: 'Street',
 3: 'Alley',
 4: 'LotShape',
 5: 'LandContour',
 6: 'LotConfig',
 7: 'LandSlope',
 8: 'Neighborhood',
 9: 'Condition1',
 10: 'Condition2',
 11: 'BldgType',
 12: 'HouseStyle',
 13: 'RoofStyle',
 14: 'Foundation',
 15: 'BsmtFinType1',
 16: 'BsmtFinType2',
 17: 'Heating',
 18: 'GarageType',
 19: 'GarageFinish',
 20: 'PavedDrive',
 21: 'MiscFeature',
 22: 'SaleType',
 23: 'SaleCondition',
 24: 'BedroomAbvGr',
 25: 'CentralAir',
 26: 'Utilities',
 27: 'RoofMatl',
 28: 'Exterior1st',
 29: 'Exterior2nd',
 30: 'MasVnrType',
 31: 'Electrical',
 32: 'Functional',
 33: 'Fence',
 34: 'KitchenAbvGr',
 35: 'MoSold',
 36: 'YrSold'}

In [97]:
cats_0 = [col for col in select_0 if col in categorical]

In [98]:
X = X_train
X = X.loc[X['NhdCluster']==0, :].drop('NhdCluster', axis=1)
X = X[select_0]

y = y_train
y = y.loc[y['NhdCluster']==0, :].drop('NhdCluster', axis=1)


pipe = Pipeline([('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown='ignore'), cats_0)], 
                                            remainder='passthrough')),
                 ('scaler', StandardScaler()),
                 ('ridge', Ridge())])

param_grid = {'ridge__alpha':[0.001, 0.1, 1, 10]}
cv = RepeatedKFold(n_splits=5)

grid = GridSearchCV(pipe, param_grid, scoring='r2', cv=cv, n_jobs=-1)

grid.fit(X,y)

GridSearchCV(cv=RepeatedKFold(n_repeats=10, n_splits=5, random_state=None),
             estimator=Pipeline(steps=[('transformer',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('Cat',
                                                                         OneHotEncoder(handle_unknown='ignore'),
                                                                         ['MSSubClass',
                                                                          'BsmtFinType1',
                                                                          'Exterior1st'])])),
                                       ('scaler', StandardScaler()),
                                       ('ridge', Ridge())]),
             n_jobs=-1, param_grid={'ridge__alpha': [0.001, 0.1, 1, 10]},
             scoring='r2')

In [99]:
print(grid.cv_results_['mean_test_score'])
print(grid.best_params_)
print(grid.best_score_)

[0.92497007 0.92497124 0.92497909 0.92481744]
{'ridge__alpha': 1}
0.9249790899345172


In [100]:
selections[1]['select_1']

['Cat__x1_60',
 'Cat__x22_New',
 'OverallQual',
 'YearRemodAdd',
 'MasVnrArea',
 'ExterQual',
 'BsmtFinSF1',
 'HeatingQC',
 '1stFlrSF',
 '2ndFlrSF',
 'BsmtFullBath',
 'HalfBath',
 'TotRmsAbvGrd',
 'Fireplaces',
 'FireplaceQu',
 'GarageArea',
 'OpenPorchSF',
 'Comp']

In [101]:
select_1 = ['MSSubClass',
 'SaleType',
 'OverallQual',
 'YearRemodAdd',
 'MasVnrArea',
 'ExterQual',
 'BsmtFinSF1',
 'HeatingQC',
 '1stFlrSF',
 '2ndFlrSF',
 'BsmtFullBath',
 'HalfBath',
 'TotRmsAbvGrd',
 'Fireplaces',
 'FireplaceQu',
 'GarageArea',
 'OpenPorchSF',
 'Comp']

In [102]:
dict(enumerate(categorical))

{0: 'MSZoning',
 1: 'MSSubClass',
 2: 'Street',
 3: 'Alley',
 4: 'LotShape',
 5: 'LandContour',
 6: 'LotConfig',
 7: 'LandSlope',
 8: 'Neighborhood',
 9: 'Condition1',
 10: 'Condition2',
 11: 'BldgType',
 12: 'HouseStyle',
 13: 'RoofStyle',
 14: 'Foundation',
 15: 'BsmtFinType1',
 16: 'BsmtFinType2',
 17: 'Heating',
 18: 'GarageType',
 19: 'GarageFinish',
 20: 'PavedDrive',
 21: 'MiscFeature',
 22: 'SaleType',
 23: 'SaleCondition',
 24: 'BedroomAbvGr',
 25: 'CentralAir',
 26: 'Utilities',
 27: 'RoofMatl',
 28: 'Exterior1st',
 29: 'Exterior2nd',
 30: 'MasVnrType',
 31: 'Electrical',
 32: 'Functional',
 33: 'Fence',
 34: 'KitchenAbvGr',
 35: 'MoSold',
 36: 'YrSold'}

In [103]:
cats_1 = [col for col in select_1 if col in categorical]

In [104]:
X = X_train
X = X.loc[X['NhdCluster']==1, :].drop('NhdCluster', axis=1)
X = X[select_1]

y = y_train
y = y.loc[y['NhdCluster']==1, :].drop('NhdCluster', axis=1)


pipe = Pipeline([('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown='ignore'), cats_1)], 
                                            remainder='passthrough')),
                 ('scaler', StandardScaler()),
                 ('ridge', Ridge())])

param_grid = {'ridge__alpha':[0.001, 0.1, 1, 10]}
cv = RepeatedKFold(n_splits=5)

grid = GridSearchCV(pipe, param_grid, scoring='r2', cv=cv, n_jobs=-1)

grid.fit(X,y)

GridSearchCV(cv=RepeatedKFold(n_repeats=10, n_splits=5, random_state=None),
             estimator=Pipeline(steps=[('transformer',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('Cat',
                                                                         OneHotEncoder(handle_unknown='ignore'),
                                                                         ['MSSubClass',
                                                                          'SaleType'])])),
                                       ('scaler', StandardScaler()),
                                       ('ridge', Ridge())]),
             n_jobs=-1, param_grid={'ridge__alpha': [0.001, 0.1, 1, 10]},
             scoring='r2')

In [105]:
print(grid.cv_results_['mean_test_score'])
print(grid.best_params_)
print(grid.best_score_)

[0.9432292  0.94323119 0.94324557 0.94308128]
{'ridge__alpha': 1}
0.9432455705290981
