# Import packages and data

In [2]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
from sklearn.metrics import classification_report, f1_score, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, \
    AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier, Pool
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier

from sklearn.preprocessing import LabelEncoder
from category_encoders import OrdinalEncoder, OneHotEncoder, TargetEncoder

from imblearn.over_sampling import SMOTE
from sklearn.utils.class_weight import compute_class_weight
import shap

PREPROCESSED_FILE = '../data/preprocessed_train_val_Mar13_0130pm_label_enc.csv'

df = pd.read_csv(PREPROCESSED_FILE, index_col='Reservation-id')

# RandomForest
# Extra Trees
# SVC
# LightGBM

df = df.sample(frac=1)

X_train = df.drop(columns='Reservation_Status')
Y_train = df['Reservation_Status']




In [15]:
kfold = StratifiedKFold(n_splits=5, shuffle=True)
random_state = 2

# LGBM HyperParameter

In [20]:
LGBM = LGBMClassifier()
lgbm_param_grid = {
              'n_estimators' : [800,1000,1200],
              'learning_rate': [0.1],
              'max_depth': [9, 10],
              'min_child_samples': [200, 300],
              'num_leaves': [16, 24] ,
              'class_weight':['balanced']
              }

gsLGBM = GridSearchCV(LGBM,param_grid = lgbm_param_grid, 
                      cv=kfold, 
                      scoring=make_scorer(f1_score, average='macro'), 
                      n_jobs= 8, 
                      verbose = 1)

gsLGBM.fit(X_train,Y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=True),
             estimator=LGBMClassifier(), n_jobs=8,
             param_grid={'class_weight': ['balanced'], 'learning_rate': [0.1],
                         'max_depth': [9, 10], 'min_child_samples': [200, 300],
                         'n_estimators': [800, 1000, 1200],
                         'num_leaves': [16, 24]},
             scoring=make_scorer(f1_score, average=macro), verbose=1)

In [16]:
model = LGBMClassifier(class_weight='balanced', learning_rate=0.1, max_depth=10, min_child_samples=100, n_estimators=800, num_leaves=16, random_state=2)

cv_res = cross_val_score(model,
                      X_train, y=Y_train, 
                      scoring = make_scorer(f1_score, average='macro'), 
                      cv = kfold, n_jobs=-1)

In [22]:
print(cv_res.mean())

0.3886562505045156


In [21]:
gsLGBM.best_score_

0.39248526818195095

In [23]:
gsLGBM.best_params_

{'class_weight': 'balanced',
 'learning_rate': 0.1,
 'max_depth': 9,
 'min_child_samples': 200,
 'n_estimators': 800,
 'num_leaves': 24}

In [25]:
best_param = pd.DataFrame(gsLGBM.cv_results_)

In [27]:
best_param[best_param.param_n_estimators==800]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_learning_rate,param_max_depth,param_min_child_samples,param_n_estimators,param_num_leaves,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,22.348088,2.254007,5.474494,0.625532,balanced,0.1,9,200,800,16,"{'class_weight': 'balanced', 'learning_rate': ...",0.388932,0.392231,0.380957,0.399333,0.381451,0.388581,0.006899,12
1,27.174212,2.494568,5.740676,1.412359,balanced,0.1,9,200,800,24,"{'class_weight': 'balanced', 'learning_rate': ...",0.386974,0.403417,0.386342,0.401107,0.384586,0.392485,0.008054,1
6,25.048284,0.97952,3.988752,1.30634,balanced,0.1,9,300,800,16,"{'class_weight': 'balanced', 'learning_rate': ...",0.387924,0.388828,0.387618,0.387744,0.384227,0.387268,0.001578,20
7,31.125726,2.032388,6.737379,1.29347,balanced,0.1,9,300,800,24,"{'class_weight': 'balanced', 'learning_rate': ...",0.389732,0.393074,0.385135,0.396164,0.383811,0.389583,0.004661,8
12,24.242417,1.332003,2.741201,0.352964,balanced,0.1,10,200,800,16,"{'class_weight': 'balanced', 'learning_rate': ...",0.386933,0.395794,0.383713,0.396691,0.381685,0.388963,0.006181,9
13,26.908103,0.658354,3.934023,0.805634,balanced,0.1,10,200,800,24,"{'class_weight': 'balanced', 'learning_rate': ...",0.384783,0.397315,0.39023,0.394305,0.387409,0.390809,0.004533,2
18,24.377506,0.799697,4.675276,2.721286,balanced,0.1,10,300,800,16,"{'class_weight': 'balanced', 'learning_rate': ...",0.391353,0.396674,0.380645,0.396267,0.38303,0.389594,0.006647,7
19,36.857818,2.33954,7.952104,0.902962,balanced,0.1,10,300,800,24,"{'class_weight': 'balanced', 'learning_rate': ...",0.387718,0.394352,0.394382,0.396544,0.379473,0.390494,0.006257,3


# CatBoost HyperParameter

In [None]:
CB = CatBoostClassifier(auto_class_weights='Balanced')
cb_param_grid = {
              'n_estimators' : [400,600,800],
              'learning_rate': [0.1],
              'max_depth': [4, 8],
              'min_child_samples': [100,200],
              'num_leaves': [8, 16] ,
              }

gsCB = GridSearchCV(CB,param_grid = lgbm_param_grid, 
                      cv=kfold, 
                      scoring=make_scorer(f1_score, average='macro'), 
                      n_jobs= 8, 
                      verbose = 1)

gsLGBM.fit(X_train,Y_train)