In [1]:
import pandas as pd
import seaborn as sns
import time
import os
import xgboost
import numpy as np
import matplotlib.pyplot as plt
import xgboost
import datetime

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import matthews_corrcoef, accuracy_score
from sklearn.neighbors import NearestNeighbors

from src.modelling import train_model

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# Seed constant for reproducible random states.
SEED: int = 42

# Plot styling: white-grid theme first, then the "Paired" colour cycle on top.
sns.set_theme(style="whitegrid")
sns.set_palette("Paired")

# Load Data

In [2]:
# Features and labels come from two CSV files, both indexed by building_id.
X_train = pd.read_csv(
    "../../data/processed/train_data_cleaned.csv",
    index_col="building_id",
)
# Only the id and the target column are read from the labels file.
y_train = pd.read_csv(
    "../../data/processed/train_labels.csv",
    usecols=["building_id", "damage_grade"],
    index_col="building_id",
)
# NOTE(review): nothing here verifies that both frames list the buildings in
# the same row order - confirm before relying on positional alignment.

print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of y_train: {y_train.shape}")

X_train.head()

Shape of X_train: (260601, 18)
Shape of y_train: (260601, 1)


Unnamed: 0_level_0,count_families,count_floors_pre_eq,geo_level_1_id,geo_level_2_id,geo_level_3_id,has_secondary_use,has_superstructure_cement_mortar_brick,has_superstructure_mud_mortar_stone,age,area_percentage,superstructure_quality,foundation_type_r,ground_floor_type_f,ground_floor_type_v,land_surface_condition_t,other_floor_type_q,position_s,roof_type_n
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
802906,0.111111,0.125,0.2,0.341275,0.970637,0.0,0.0,1.0,0.121343,0.247589,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0
28830,0.111111,0.125,0.266667,0.630694,0.223761,0.0,0.0,1.0,0.040448,0.346625,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
94947,0.111111,0.125,0.7,0.25438,0.714013,0.0,0.0,1.0,0.040448,0.198071,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
590882,0.111111,0.125,0.733333,0.292922,0.850959,0.0,0.0,1.0,0.040448,0.247589,0.5,1.0,1.0,0.0,1.0,0.0,1.0,1.0
201944,0.111111,0.25,0.366667,0.091801,0.118405,0.0,0.0,0.0,0.121343,0.346625,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0


In [3]:
# Keep only the first 100,000 rows of features and labels (positional slice)
# so the experiments below work on a reduced data set.
X_train, y_train = X_train.head(100_000), y_train.head(100_000)

# Split Data into Train and Test Sets

In [4]:
# Hold out 20% of the rows as a test set. Stratifying on the labels keeps the
# damage-grade proportions the same in both parts. The seed comes from the
# notebook-level SEED constant instead of a repeated literal.
# NOTE: this rebinds X_train / y_train, so re-running the cell would split the
# already reduced training part again - re-run from the loading cell instead.
X_train, X_test, y_train, y_test = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=SEED,
)

In [40]:
def get_prediction_score(model, X_test, y_test, rename=True):
    """Score a fitted model on held-out data and print the results.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Features handed to ``model.predict``.
        y_test: Ground-truth labels, passed unchanged to the metric functions.
        rename: When true and ``model`` is an ``xgboost.XGBClassifier``, the
            predicted classes 0/1/2 are mapped to 1/2/3 before scoring. Pass
            ``False`` when ``y_test`` is already encoded as 0/1/2.

    Returns:
        Tuple ``(acc, mcc)``: accuracy and Matthews correlation coefficient.
    """
    predictions = pd.DataFrame(model.predict(X_test))

    # XGBoost emits zero-based class ids; shift them back to the original
    # labels only when the caller asked for it.
    needs_relabel = rename and isinstance(model, xgboost.XGBClassifier)
    if needs_relabel:
        print("XGBoost model was used. Renaming predictions ...")
        predictions = predictions.replace({0: 1, 1: 2, 2: 3})

    acc = accuracy_score(y_true=y_test, y_pred=predictions)
    mcc = matthews_corrcoef(y_true=y_test, y_pred=predictions)

    print(f"ACC on test set: {acc:.4f}")
    print(f"MCC on test set: {mcc:.4f}")
    return acc, mcc

# Train Baseline Model

In [6]:
# Baseline: XGBoost trained through the project helper, scored on ACC and MCC.
# NOTE(review): train_model comes from src.modelling; judging by the recorded
# output it cross-validates and prints train/test scores - confirm in its source.
model_base, cv_results_model_base = train_model(
    model="XGBoost",
    train_data=X_train,
    train_labels=y_train,
    scoring=["accuracy", "matthews_corrcoef"],
)

# Reference results from earlier runs
#
# Full data set:
#   CV Test ACC: 0.7334 +/- 0.0022
#   CV Test MCC: 0.5044 +/- 0.0039
#   ACC on hold-out-set / test set: 0.7413
#   MCC on hold-out-set / test set: 0.5202
#
# First 100k samples:
#   CV Test ACC: 0.7166 +/- 0.0023
#   CV Test MCC: 0.4714 +/- 0.0049
#   ACC on hold-out-set / test set: 0.7272
#   MCC on hold-out-set / test set: 0.4917

# Run this before the label-encoding cell: with the default rename=True the
# predictions are mapped to 1..3, so y_test must still hold the original labels.
acc_base, mcc_base = get_prediction_score(model=model_base, X_test=X_test, y_test=y_test)

Fitting XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=20, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=-1, num_parallel_tree=None,
              predictor=None, random_state=42, ...) ...

CV Training ACC: 0.9829 +/- 0.0006 
CV Test ACC: 0.7166 +/- 0.0023

CV Training MCC: 0.9692 +/- 0.001 
CV Test MCC: 0.4714 +/- 0.0049

XGBoost model was used. Renaming predictions ...
ACC on test set: 0.7272
MCC on test set: 0.4917


# Apply Bayesian Optimization

In [7]:
# XGBoost expects class labels [0, 1, 2] instead of the original [1, 2, 3].
# The encoder learns the mapping from the training labels only and is then
# applied to both label sets.
# NOTE: y_train / y_test are rebound from DataFrames to encoded arrays, so this
# cell cannot simply be re-run without restoring the label frames first.
encoder = LabelEncoder().fit(y_train["damage_grade"].to_numpy())
y_train, y_test = (
    encoder.transform(labels["damage_grade"].to_numpy())
    for labels in (y_train, y_test)
)

In [10]:
# Base estimator whose hyper-parameters are tuned below. The seed is taken from
# the notebook-level SEED constant so every stochastic step shares one value.
model_xgb = xgboost.XGBClassifier(random_state=SEED, n_jobs=-1)

# Search space explored by the Bayesian optimiser; all continuous dimensions
# are sampled with a uniform prior.
param_grid_xgb = {
    "learning_rate": Real(0.01, 0.7, "uniform"),
    "max_depth": Integer(3, 25),
    "subsample": Real(0.1, 1.0, "uniform"),
    "colsample_bytree": Real(0.1, 1.0, "uniform"),  # subsample ratio of columns by tree
    "reg_lambda": Real(1e-9, 20.0, "uniform"),  # L2 regularization
    "reg_alpha": Real(1e-9, 20.0, "uniform"),  # L1 regularization
    "n_estimators": Integer(50, 300),
}

# 5-fold CV, optimising (and refitting on) the Matthews correlation coefficient.
# n_iter is not set, so the library default number of candidates is evaluated.
# NOTE(review): both the estimator and the search request n_jobs=-1; this may
# oversubscribe the CPU - consider limiting one of them.
bayes_opt = BayesSearchCV(
    estimator=model_xgb,
    search_spaces=param_grid_xgb,
    n_jobs=-1,
    cv=5,
    random_state=SEED,
    scoring="matthews_corrcoef",
    return_train_score=True,
    verbose=1,
    refit="matthews_corrcoef",
)

# perf_counter is monotonic, so the measured duration cannot be distorted by
# system clock adjustments during this long-running search.
start_time = time.perf_counter()
bayes_opt.fit(X_train, y_train)
print(f"Took {time.perf_counter() - start_time} seconds")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

In [12]:
# Best params found in the recorded run:
# OrderedDict([('colsample_bytree', 0.27972729119255346), ('learning_rate', 0.1228007619140701), ('max_depth', 23), ('n_estimators', 144), ('reg_alpha', 1e-09), ('reg_lambda', 18.935672936151313), ('subsample', 1.0)])

# Report the winning CV score and configuration, followed by a separator line.
print(
    f"Best Score: {bayes_opt.best_score_}",
    f"Best Params: {bayes_opt.best_params_}",
    "=" * 80,
    sep="\n",
)

# One row per evaluated candidate; displayed as the cell output.
bayes_opt_cv_results = pd.DataFrame(bayes_opt.cv_results_)
bayes_opt_cv_results

Best Score: 0.49694028922869327
Best Params: OrderedDict([('colsample_bytree', 0.27972729119255346), ('learning_rate', 0.1228007619140701), ('max_depth', 23), ('n_estimators', 144), ('reg_alpha', 1e-09), ('reg_lambda', 18.935672936151313), ('subsample', 1.0)])


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bytree,param_learning_rate,param_max_depth,param_n_estimators,param_reg_alpha,param_reg_lambda,...,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score,rank_train_score
0,50.983516,7.824798,0.571137,0.092475,0.469094,0.512131,24,129,13.402959,8.282373,...,0.007432,28,0.535771,0.536246,0.535555,0.532689,0.53758,0.535568,0.001602,33
1,119.522313,23.283821,0.788363,0.012534,0.85365,0.619488,10,288,17.282558,1.246259,...,0.005155,33,0.558645,0.554113,0.556599,0.553951,0.556107,0.555883,0.001735,31
2,27.255652,3.495119,0.307238,0.090248,0.500349,0.643919,5,158,3.759104,9.073307,...,0.007366,36,0.519217,0.516999,0.516026,0.511436,0.517326,0.516201,0.002598,36
3,218.760847,41.164213,1.408336,0.135072,0.831156,0.128591,16,251,10.461047,1.909101,...,0.005797,11,0.646569,0.645144,0.641509,0.643441,0.643313,0.643995,0.001726,23
4,151.428414,26.32403,0.841006,0.064741,0.819598,0.31224,15,229,18.077041,14.340623,...,0.00637,25,0.561976,0.560115,0.557476,0.560575,0.560969,0.560222,0.001504,30
5,36.021952,5.616835,0.197937,0.026889,0.760625,0.658165,7,97,16.010065,7.451596,...,0.007164,32,0.52327,0.520648,0.519688,0.520946,0.520219,0.520954,0.001233,34
6,137.84719,23.364619,0.943583,0.080329,0.655371,0.545001,11,269,11.852394,11.892401,...,0.004162,16,0.663281,0.662938,0.662161,0.664216,0.665166,0.663552,0.001042,22
7,154.864998,29.732869,1.139848,0.104232,0.589063,0.644806,14,259,6.4812,1.022691,...,0.004515,23,0.806349,0.807811,0.806181,0.805749,0.808487,0.806915,0.001048,15
8,67.130025,11.122218,0.537856,0.043183,0.95993,0.493429,22,154,7.621295,4.228286,...,0.006845,45,0.543688,0.5398,0.54037,0.543302,0.545825,0.542597,0.002231,32
9,6.24253,0.931464,0.098183,0.008804,0.103267,0.574303,19,114,11.573287,5.494297,...,0.005969,46,0.426517,0.430655,0.423831,0.426245,0.427025,0.426855,0.002195,48


Best Score: 0.49694028922869327
Best Params: OrderedDict([('colsample_bytree', 0.27972729119255346), ('learning_rate', 0.1228007619140701), ('max_depth', 23), ('n_estimators', 144), ('reg_alpha', 1e-09), ('reg_lambda', 18.935672936151313), ('subsample', 1.0)])


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bytree,param_learning_rate,param_max_depth,param_n_estimators,param_reg_alpha,param_reg_lambda,...,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score,rank_train_score
0,50.983516,7.824798,0.571137,0.092475,0.469094,0.512131,24,129,13.402959,8.282373,...,0.007432,28,0.535771,0.536246,0.535555,0.532689,0.53758,0.535568,0.001602,33
1,119.522313,23.283821,0.788363,0.012534,0.85365,0.619488,10,288,17.282558,1.246259,...,0.005155,33,0.558645,0.554113,0.556599,0.553951,0.556107,0.555883,0.001735,31
2,27.255652,3.495119,0.307238,0.090248,0.500349,0.643919,5,158,3.759104,9.073307,...,0.007366,36,0.519217,0.516999,0.516026,0.511436,0.517326,0.516201,0.002598,36
3,218.760847,41.164213,1.408336,0.135072,0.831156,0.128591,16,251,10.461047,1.909101,...,0.005797,11,0.646569,0.645144,0.641509,0.643441,0.643313,0.643995,0.001726,23
4,151.428414,26.32403,0.841006,0.064741,0.819598,0.31224,15,229,18.077041,14.340623,...,0.00637,25,0.561976,0.560115,0.557476,0.560575,0.560969,0.560222,0.001504,30
5,36.021952,5.616835,0.197937,0.026889,0.760625,0.658165,7,97,16.010065,7.451596,...,0.007164,32,0.52327,0.520648,0.519688,0.520946,0.520219,0.520954,0.001233,34
6,137.84719,23.364619,0.943583,0.080329,0.655371,0.545001,11,269,11.852394,11.892401,...,0.004162,16,0.663281,0.662938,0.662161,0.664216,0.665166,0.663552,0.001042,22
7,154.864998,29.732869,1.139848,0.104232,0.589063,0.644806,14,259,6.4812,1.022691,...,0.004515,23,0.806349,0.807811,0.806181,0.805749,0.808487,0.806915,0.001048,15
8,67.130025,11.122218,0.537856,0.043183,0.95993,0.493429,22,154,7.621295,4.228286,...,0.006845,45,0.543688,0.5398,0.54037,0.543302,0.545825,0.542597,0.002231,32
9,6.24253,0.931464,0.098183,0.008804,0.103267,0.574303,19,114,11.573287,5.494297,...,0.005969,46,0.426517,0.430655,0.423831,0.426245,0.427025,0.426855,0.002195,48


# Test best Params on full data

In [28]:
# Candidates ordered from best (rank 1) to worst by mean CV test score.
bayes_opt_cv_results.sort_values("rank_test_score")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bytree,param_learning_rate,param_max_depth,param_n_estimators,param_reg_alpha,param_reg_lambda,...,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score,rank_train_score
47,65.564004,12.172925,1.157735,0.134294,0.279727,0.122801,23,144,0.0,18.935673,...,0.004985,1,0.685643,0.68068,0.681124,0.681121,0.682675,0.682248,0.001828,21
45,55.914233,9.184229,1.205553,0.119831,0.274247,0.17415,24,158,0.0,2.3608,...,0.00605,2,0.816477,0.820125,0.818764,0.815041,0.812386,0.816558,0.002732,14
49,87.399013,15.962905,1.128523,0.073544,0.352379,0.152314,25,157,3.215634,5.745962,...,0.008364,3,0.705319,0.704107,0.702457,0.706089,0.699884,0.703571,0.002215,18
39,45.595621,10.244614,0.754908,0.054135,0.197118,0.166628,23,160,3.066039,7.412461,...,0.007263,4,0.582617,0.583009,0.578653,0.582105,0.58269,0.581815,0.001607,27
43,66.639861,13.89149,0.877606,0.092823,0.407759,0.169536,13,157,0.0,4.717433,...,0.004301,5,0.781888,0.78119,0.781145,0.78253,0.783638,0.782078,0.000931,16
23,115.873264,22.388497,1.337198,0.103635,0.551614,0.215524,16,181,3.138283,0.722296,...,0.004971,6,0.835803,0.834569,0.835812,0.839009,0.833962,0.835831,0.001743,13
46,132.757759,26.154966,2.094915,0.582019,0.704504,0.133845,13,203,0.0,4.763347,...,0.003614,7,0.844871,0.847308,0.844161,0.841642,0.847513,0.845099,0.002172,11
26,159.964483,27.061746,2.722053,0.296424,0.496448,0.183572,23,197,0.0,16.399958,...,0.003701,8,0.916833,0.914263,0.915332,0.920273,0.91844,0.917028,0.002149,9
25,110.767365,22.318902,0.960787,0.127063,0.756261,0.124444,14,145,4.462944,6.598466,...,0.004353,9,0.691195,0.693655,0.690913,0.690843,0.687755,0.690872,0.001874,19
18,177.687882,29.312316,1.176351,0.106657,0.861563,0.160676,13,244,8.935649,1.0684,...,0.005721,10,0.688855,0.688025,0.68549,0.686485,0.688609,0.687493,0.001297,20


In [29]:
# Hyper-parameters of the best candidate found by the Bayesian search.
best_params = bayes_opt.best_params_

# Fresh, unfitted classifier configured with those parameters. The seed comes
# from the notebook-level SEED constant instead of a repeated literal.
xgb_optimized = xgboost.XGBClassifier(**best_params, random_state=SEED, n_jobs=-1)

In [None]:
# Train the tuned classifier on the current training split.
# NOTE(review): the section heading says "full data", but X_train as built in
# the cells above is the 80% part of the first 100k rows - confirm which data
# was actually used for the recorded scores.
xgb_optimized.fit(X=X_train, y=y_train)

In [41]:
# y_test is already encoded as 0/1/2 at this point, so the predictions must
# not be mapped back to 1/2/3 (rename=False).
acc, mcc = get_prediction_score(
    model=xgb_optimized,
    X_test=X_test,
    y_test=y_test,
    rename=False,
)

ACC on test set: 0.7401
MCC on test set: 0.5134


In [43]:
# Display the best parameters as a plain dict (easier to copy than the
# OrderedDict repr).
{**best_params}

{'colsample_bytree': 0.27972729119255346,
 'learning_rate': 0.1228007619140701,
 'max_depth': 23,
 'n_estimators': 144,
 'reg_alpha': 1e-09,
 'reg_lambda': 18.935672936151313,
 'subsample': 1.0}