In [3]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
import xgboost as xgb
from bayes_opt import BayesianOptimization
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import matplotlib.style as style
style.use('fivethirtyeight')
import helper

In [4]:
colors = ["#FF0B04", "#F1BE48",
           "#B9975B", "#8B5B29",
           "#524727",
         ]
sns.set_palette(sns.color_palette(colors))

In [5]:
df = pd.read_csv('Ames_Housing_Price_Data.csv', 
                             index_col=0,low_memory = False)

In [6]:
train, test = helper.data_processing_wrapper(df, num_to_cat_list=[], remove_PID=False)

In [9]:
train['LogSalePrice'] = np.log(train['SalePrice'])
test['LogSalePrice'] = np.log(test['SalePrice'])

In [7]:
categorical = train.select_dtypes(['object','bool']).columns.to_list() + ['MSSubClass']

In [8]:
# # categorical = ['MSZoning', 'MSSubClass','Street','Alley','LotShape','LandContour','LotConfig',
#                'LandSlope','Neighborhood','Condition1','Condition2','BldgType','HouseStyle',
#                'RoofStyle','Foundation', 'BsmtFinType1','BsmtFinType2','Heating','GarageType',
#                'GarageFinish','PavedDrive','MiscFeature','SaleType','SaleCondition',
#                'BedroomAbvGr', 'CentralAir', 'Utilities', 'RoofMatl', 'Exterior1st',
#                'Exterior2nd', 'MasVnrType', 'Electrical', 'Functional', 'Fence','KitchenAbvGr',
#                'MoSold','YrSold'
#               ]

In [10]:
X_train = train.drop(['SalePrice', 'LogSalePrice', 'PID', 'TotalBsmtSF', 'GrLivArea'], axis=1)
y_train = train['LogSalePrice']
X_test = test.drop(['SalePrice', 'LogSalePrice', 'PID', 'TotalBsmtSF', 'GrLivArea'], axis=1)
y_test = test['LogSalePrice']

# XGBoost without Bayesian optimization (OneHotEncoder)

In [20]:
transformer = ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), categorical)], 
                                remainder='passthrough')

X = transformer.fit_transform(X_train)
y = y_train

boost = xgb.XGBRegressor()

cv = RepeatedKFold(n_splits=4, n_repeats=5, shuffle=True, random_state=42)

scores0 = cross_val_score(boost, X, y, scoring='r2', cv=cv, n_jobs=-1)
scores1 = cross_val_score(boost, X, y, scoring='neg_root_mean_squared_error', cv=cv, n_jobs=-1)

scores1 = abs(scores1)

print('R2 mean: %.3f (R2 sd: %.3f)' % (scores0.mean(), scores0.std()))
print('RMSE mean: %.3f (RMSE sd: %.3f)' % (scores1.mean(), scores1.std()))

R2 mean: 0.886 (R2 sd: 0.025)
RMSE mean: 0.103 (RMSE sd: 0.011)


In [21]:
X_lev1 = pd.DataFrame(X_train.loc[X_train['NhdCluster']==1, :].drop('NhdCluster', axis=1))
y_lev1 = y_train.loc[y_train['NhdCluster']==1, :].drop('NhdCluster', axis=1)

transformer = ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), categorical)], 
                                remainder='passthrough')

X_lev1 = transformer.fit_transform(X_lev1)

X, y = X_lev1, y_lev1

boost = xgb.XGBRegressor()

param_grid = {
    'n_estimators':[1,10,100],
    'max_depth':[20,50],
    'eta':[0.1,0.3,0.5]
}

cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)

grid = GridSearchCV(boost, param_grid, scoring='r2', cv=cv, n_jobs=-1)

grid.fit(X,y)

GridSearchCV(cv=RepeatedKFold(n_repeats=3, n_splits=10, random_state=42),
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=None, reg_lambda=None,
                                    scale_pos_weight=None, subsample=None,
            

In [22]:
grid.cv_results_['mean_test_score']

array([-1.20317321e+03, -1.80953274e+02,  8.93387790e-01, -1.20317321e+03,
       -1.80953274e+02,  8.92286597e-01, -7.28372498e+02, -4.65068418e-01,
        8.70241491e-01, -7.28372498e+02, -4.65068418e-01,  8.71306179e-01,
       -3.72103737e+02,  8.15080429e-01,  8.27188642e-01, -3.72103737e+02,
        8.15717915e-01,  8.27299076e-01])

In [23]:
print(grid.best_score_)
print(grid.best_params_)

0.8933877902308144
{'eta': 0.1, 'max_depth': 20, 'n_estimators': 100}


In [24]:
grid.cv_results_

{'mean_fit_time': array([0.04962801, 0.10109054, 5.32231897, 0.01510062, 0.04586535,
        5.57365414, 0.016142  , 0.09476926, 2.39631334, 0.01562136,
        0.11872209, 2.58376935, 0.01614227, 0.39782348, 1.58660729,
        0.01510052, 0.41761043, 1.62201571]),
 'std_fit_time': array([0.01732489, 0.03774339, 0.58303551, 0.00280412, 0.00803278,
        0.64019871, 0.00491245, 0.00560824, 0.18839121, 0.0040334 ,
        0.01182822, 0.07879107, 0.00280402, 0.04767848, 0.08213524,
        0.0028041 , 0.04754189, 0.16364342]),
 'mean_score_time': array([0.02093448, 0.00944078, 0.01061532, 0.00833135, 0.00724804,
        0.00983827, 0.00885224, 0.00885205, 0.00729   , 0.00729   ,
        0.00833157, 0.00885198, 0.00676916, 0.00624846, 0.00781074,
        0.00676935, 0.00624853, 0.00676908]),
 'std_score_time': array([0.00866257, 0.00454361, 0.00550777, 0.00779326, 0.00624794,
        0.00749159, 0.00774106, 0.0077409 , 0.00779334, 0.00779334,
        0.00779347, 0.00774083, 0.00774083, 

In [25]:
split_scores_best_params = [grid.cv_results_[f'split{j}_test_score'][grid.best_index_] for j in range(30)]
split_scores_best_params

[0.9021354414324422,
 0.9203052466148268,
 0.9295534858545904,
 0.8953794757357484,
 0.8818877456904464,
 0.8945884409252188,
 0.8636138733271731,
 0.842926151280365,
 0.9139152505367595,
 0.8797688538274939,
 0.87972850852065,
 0.9063915834946734,
 0.897371822474615,
 0.8851732050329469,
 0.9194485236155613,
 0.8706247508185866,
 0.888529731772314,
 0.9030983567546818,
 0.9192208551335039,
 0.8745036468331083,
 0.911497572601394,
 0.8723004518991375,
 0.8812523687833839,
 0.9014205136817683,
 0.8786505210926323,
 0.8714364692813373,
 0.8959356550278209,
 0.9315991186487997,
 0.9031452305247738,
 0.8862308557076761]

In [26]:
np.mean(split_scores_best_params)

0.8933877902308144

# XGBoost without Bayesian optimization (OrdinalEncoder)

In [27]:
X_lev1 = pd.DataFrame(X_train.loc[X_train['NhdCluster']==1, :].drop('NhdCluster', axis=1))
y_lev1 = y_train.loc[y_train['NhdCluster']==1, :].drop('NhdCluster', axis=1)

transformer = ColumnTransformer([("Cat", OrdinalEncoder(), categorical)], 
                                remainder='passthrough')

X_lev1 = transformer.fit_transform(X_lev1)

X, y = X_lev1, y_lev1

boost = xgb.XGBRegressor()

cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)

scores0 = cross_val_score(boost, X, y, scoring='r2', cv=cv, n_jobs=-1)
scores1 = cross_val_score(boost, X, y, scoring='neg_root_mean_squared_error', cv=cv, n_jobs=-1)

scores1 = abs(scores1)

print('R2 mean: %.3f (R2 sd: %.3f)' % (scores0.mean(), scores0.std()))
print('RMSE mean: %.3f (RMSE sd: %.3f)' % (scores1.mean(), scores1.std()))

R2 mean: 0.879 (R2 sd: 0.024)
RMSE mean: 0.106 (RMSE sd: 0.010)


In [28]:
X_lev1 = pd.DataFrame(X_train.loc[X_train['NhdCluster']==1, :].drop('NhdCluster', axis=1))
y_lev1 = y_train.loc[y_train['NhdCluster']==1, :].drop('NhdCluster', axis=1)

transformer = ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), categorical)], 
                                remainder='passthrough')

X_lev1 = transformer.fit_transform(X_lev1)

X, y = X_lev1, y_lev1

boost = xgb.XGBRegressor()

param_grid = {
    'n_estimators':[1,10,100],
    'max_depth':[20,50],
    'eta':[0.1,0.3,0.5]
}

cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)

grid = GridSearchCV(boost, param_grid, scoring='r2', cv=cv, n_jobs=-1)

grid.fit(X,y)

GridSearchCV(cv=RepeatedKFold(n_repeats=3, n_splits=10, random_state=42),
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=None, reg_lambda=None,
                                    scale_pos_weight=None, subsample=None,
            

In [29]:
grid.cv_results_['mean_test_score']

array([-1.20317321e+03, -1.80953274e+02,  8.93387790e-01, -1.20317321e+03,
       -1.80953274e+02,  8.92286597e-01, -7.28372498e+02, -4.65068418e-01,
        8.70241491e-01, -7.28372498e+02, -4.65068418e-01,  8.71306179e-01,
       -3.72103737e+02,  8.15080429e-01,  8.27188642e-01, -3.72103737e+02,
        8.15717915e-01,  8.27299076e-01])

In [30]:
print(grid.best_score_)
print(grid.best_params_)

0.8933877902308144
{'eta': 0.1, 'max_depth': 20, 'n_estimators': 100}


In [31]:
grid.cv_results_

{'mean_fit_time': array([0.06035566, 0.15774478, 5.85250514, 0.01484338, 0.05261375,
        5.2116305 , 0.01770441, 0.10116136, 2.83343139, 0.01540083,
        0.15192143, 3.63477524, 0.02573116, 0.55442383, 1.75195542,
        0.01562139, 0.44342097, 1.69985882]),
 'std_fit_time': array([1.37104039e-02, 1.94574322e-02, 8.06813872e-01, 4.75432750e-03,
        8.89469746e-03, 3.67879311e-01, 5.31016637e-03, 1.28637997e-02,
        6.79530020e-01, 3.38222489e-03, 3.64777784e-02, 4.46863623e-01,
        6.47711148e-03, 9.64703025e-02, 1.93762953e-01, 5.06635585e-07,
        2.62948980e-02, 1.82325145e-01]),
 'mean_score_time': array([0.02430034, 0.01472876, 0.01100345, 0.00697989, 0.00758309,
        0.00781061, 0.00624837, 0.0087527 , 0.01040379, 0.00904829,
        0.00697018, 0.01124968, 0.01053765, 0.01253215, 0.00797727,
        0.00781074, 0.00758976, 0.0087647 ]),
 'std_score_time': array([0.01124602, 0.0028283 , 0.00845624, 0.00736632, 0.00472902,
        0.00781061, 0.00765266, 

In [32]:
split_scores_best_params = [grid.cv_results_[f'split{j}_test_score'][grid.best_index_] for j in range(30)]
split_scores_best_params

[0.9021354414324422,
 0.9203052466148268,
 0.9295534858545904,
 0.8953794757357484,
 0.8818877456904464,
 0.8945884409252188,
 0.8636138733271731,
 0.842926151280365,
 0.9139152505367595,
 0.8797688538274939,
 0.87972850852065,
 0.9063915834946734,
 0.897371822474615,
 0.8851732050329469,
 0.9194485236155613,
 0.8706247508185866,
 0.888529731772314,
 0.9030983567546818,
 0.9192208551335039,
 0.8745036468331083,
 0.911497572601394,
 0.8723004518991375,
 0.8812523687833839,
 0.9014205136817683,
 0.8786505210926323,
 0.8714364692813373,
 0.8959356550278209,
 0.9315991186487997,
 0.9031452305247738,
 0.8862308557076761]

In [33]:
np.mean(split_scores_best_params)

0.8933877902308144

# XGBoost with Bayesian Optimization (OneHotEncoder)

In [13]:
xgb.set_config(verbosity=0)

In [14]:
def bo_tune_xgb(max_depth, gamma, n_estimators, learning_rate):
    params = {'max_depth':int(max_depth), 
              'gamma':gamma, 
             'n_estimators':int(n_estimators), 
             'learning_rate':learning_rate,
             'subsample': 0.8,
             'eta':0.1,
             'eval_metric':'rmse'}
    cv_result = xgb.cv(params, dtrain, nfold=5, seed=42)
    return -1.0 * cv_result['test-rmse-mean'].iloc[-1]

In [23]:
transformer = ColumnTransformer([('Cat',OneHotEncoder(handle_unknown='ignore'), categorical)], remainder='passthrough')   
X = transformer.fit_transform(X_train)

y = y_train

dtrain = xgb.DMatrix(X, y, enable_categorical=True)



xgb_bo = BayesianOptimization(bo_tune_xgb, {'max_depth':(3,10), 
                                           'gamma':(0,1),
                                           'learning_rate':(0,1),
                                           'n_estimators':(100,120)},
                             random_state = 42)

xgb_bo.maximize(n_iter=5, init_points=8, acq='ei')

|   iter    |  target   |   gamma   | learni... | max_depth | n_esti... |
-------------------------------------------------------------------------
| [0m 1       [0m | [0m-0.1602  [0m | [0m 0.3745  [0m | [0m 0.9507  [0m | [0m 8.124   [0m | [0m 112.0   [0m |
| [0m 2       [0m | [0m-2.136   [0m | [0m 0.156   [0m | [0m 0.156   [0m | [0m 3.407   [0m | [0m 117.3   [0m |
| [95m 3       [0m | [95m-0.1535  [0m | [95m 0.6011  [0m | [95m 0.7081  [0m | [95m 3.144   [0m | [95m 119.4   [0m |
| [0m 4       [0m | [0m-1.084   [0m | [0m 0.8324  [0m | [0m 0.2123  [0m | [0m 4.273   [0m | [0m 103.7   [0m |
| [95m 5       [0m | [95m-0.1411  [0m | [95m 0.3042  [0m | [95m 0.5248  [0m | [95m 6.024   [0m | [95m 105.8   [0m |
| [0m 6       [0m | [0m-2.586   [0m | [0m 0.6119  [0m | [0m 0.1395  [0m | [0m 5.045   [0m | [0m 107.3   [0m |
| [0m 7       [0m | [0m-0.1499  [0m | [0m 0.4561  [0m | [0m 0.7852  [0m | [0m 4.398   [0m | [0m 1

In [24]:
params = xgb_bo.max['params']
print(params)

{'gamma': 0.26217155174161544, 'learning_rate': 0.49115317431045047, 'max_depth': 5.93465859048152, 'n_estimators': 105.82458656462529}


In [25]:
params['max_depth']= int(params['max_depth'])
params['n_estimators']= int(params['n_estimators'])

In [26]:
boost = xgb.XGBRegressor(**params).fit(X,y)
predicts = boost.predict(X)

In [27]:
r2_score(y,predicts)

0.9209476517322885

In [28]:
X_tst = transformer.transform(X_test)
predicts_tst = boost.predict(X_tst)
r2_score(y_test,predicts_tst)

0.8682456181880124

# XGBoost with Bayesian Optimization (OrdinalEncoder)

In [41]:
X = pd.DataFrame(X_train.loc[X_train['NhdCluster']==1, :].drop('NhdCluster', axis=1))
for col in X.loc[:,X.dtypes=='object'].columns:
    X[col] = X[col].astype("category")
cats = X.loc[:,X.dtypes=='category'].columns.to_list()
transformer = ColumnTransformer([('Cat',OrdinalEncoder(),cats)], remainder='passthrough')   
X = transformer.fit_transform(X)

In [42]:
xgb.set_config(verbosity=0)

In [43]:
y = y_train.loc[y_train['NhdCluster']==1, :].drop('NhdCluster', axis=1)

dtrain = xgb.DMatrix(X, y, enable_categorical=True)

def bo_tune_xgb(max_depth, gamma, n_estimators, learning_rate):
    params = {'max_depth':int(max_depth), 
              'gamma':gamma, 
             'n_estimators':int(n_estimators), 
             'learning_rate':learning_rate,
             'subsample': 0.8,
             'eta':0.1,
             'eval_metric':'rmse'}
    cv_result = xgb.cv(params, dtrain, nfold=5, seed=42)
    return -1.0 * cv_result['test-rmse-mean'].iloc[-1]

xgb_bo = BayesianOptimization(bo_tune_xgb, {'max_depth':(3,10), 
                                           'gamma':(0,1),
                                           'learning_rate':(0,1),
                                           'n_estimators':(100,120)},
                             random_state = 42)

xgb_bo.maximize(n_iter=5, init_points=8, acq='ei')

|   iter    |  target   |   gamma   | learni... | max_depth | n_esti... |
-------------------------------------------------------------------------
| [0m 1       [0m | [0m-0.153   [0m | [0m 0.3745  [0m | [0m 0.9507  [0m | [0m 8.124   [0m | [0m 112.0   [0m |
| [0m 2       [0m | [0m-2.185   [0m | [0m 0.156   [0m | [0m 0.156   [0m | [0m 3.407   [0m | [0m 117.3   [0m |
| [95m 3       [0m | [95m-0.1513  [0m | [95m 0.6011  [0m | [95m 0.7081  [0m | [95m 3.144   [0m | [95m 119.4   [0m |
| [0m 4       [0m | [0m-1.112   [0m | [0m 0.8324  [0m | [0m 0.2123  [0m | [0m 4.273   [0m | [0m 103.7   [0m |
| [95m 5       [0m | [95m-0.1432  [0m | [95m 0.3042  [0m | [95m 0.5248  [0m | [95m 6.024   [0m | [95m 105.8   [0m |
| [0m 6       [0m | [0m-2.644   [0m | [0m 0.6119  [0m | [0m 0.1395  [0m | [0m 5.045   [0m | [0m 107.3   [0m |
| [0m 7       [0m | [0m-0.149   [0m | [0m 0.4561  [0m | [0m 0.7852  [0m | [0m 4.398   [0m | [0m 1

In [44]:
params = xgb_bo.max['params']
print(params)

{'gamma': 0.3042422429595377, 'learning_rate': 0.5247564316322378, 'max_depth': 6.023615130494811, 'n_estimators': 105.82458280396084}


In [45]:
params['max_depth']= int(params['max_depth'])
params['n_estimators']= int(params['n_estimators'])

In [46]:
boost = xgb.XGBRegressor(**params).fit(X,y)
predicts = boost.predict(X)

In [47]:
r2_score(y,predicts)

0.8766893502385911