# Import packages and data

In [45]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
from sklearn.metrics import classification_report, f1_score, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, \
    AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier, Pool
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier

from sklearn.preprocessing import LabelEncoder, StandardScaler
from category_encoders import OrdinalEncoder, OneHotEncoder, TargetEncoder

from imblearn.over_sampling import SMOTE
from sklearn.utils.class_weight import compute_class_weight
import shap

# Path of the label-encoded train/validation CSV, relative to this notebook.
PREPROCESSED_FILE = '../data/preprocessed_train_val_Mar13_0130pm_label_enc.csv'

# Single seed shared by the row shuffle, the CV splitter and the seeded
# estimators below. Defined before first use so that a fresh
# "Restart Kernel -> Run All" reproduces the same folds and scores.
random_state = 2

df = pd.read_csv(PREPROCESSED_FILE, index_col='Reservation-id')

# RandomForest
# Extra Trees
# SVC
# LightGBM

# Shuffle all rows (frac=1 keeps every row); seeded so the order is repeatable.
df = df.sample(frac=1, random_state=random_state)

# Features are every column except the target; the target is Reservation_Status.
X_train = df.drop(columns='Reservation_Status')
Y_train = df['Reservation_Status']

# Stratified 5-fold splitter reused by every cross_val_score call in this notebook.
# shuffle=True without a seed would produce different folds on every run.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)


In [46]:
# Modeling step: test different algorithms.
# Every candidate that accepts a seed receives the shared `random_state`, so the
# whole comparison is controlled by one configuration value.
classifiers = [
    CatBoostClassifier(max_depth=10, auto_class_weights='Balanced', n_estimators=10,
                       random_state=random_state),
    LGBMClassifier(random_state=random_state, max_depth=10, class_weight='balanced'),
    SVC(random_state=random_state, class_weight='balanced'),
    DecisionTreeClassifier(random_state=random_state, max_depth=10, class_weight='balanced'),
    # AdaBoostClassifier(DecisionTreeClassifier(random_state=random_state),random_state=random_state,learning_rate=0.1),
    # RandomForestClassifier(random_state=random_state),
    # ExtraTreesClassifier(random_state=random_state),
    # GradientBoostingClassifier(random_state=random_state),
    # MLPClassifier(random_state=random_state),
    # KNeighborsClassifier(),
    # LogisticRegression(random_state=random_state),
    # LinearDiscriminantAnalysis(),
    # XGBClassifier(random_state=random_state),
]

# Macro-averaged F1 scorer; built once because it does not change between candidates.
macro_f1_scorer = make_scorer(f1_score, average='macro')

# cv_results[i] holds the per-fold scores of classifiers[i] (same order as the list).
cv_results = []
for classifier in classifiers:
    print(classifier)
    fold_scores = cross_val_score(classifier,
                                  X_train, y=Y_train,
                                  scoring=macro_f1_scorer,
                                  cv=kfold, n_jobs=-1)
    cv_results.append(fold_scores)

<catboost.core.CatBoostClassifier object at 0x00000203169D0470>
LGBMClassifier(class_weight='balanced', max_depth=10, random_state=2)
SVC(class_weight='balanced', random_state=2)
DecisionTreeClassifier(class_weight='balanced', max_depth=10, random_state=2)


In [47]:
# Mean cross-validated score per candidate, in the same order as `classifiers`.
[fold_scores.mean() for fold_scores in cv_results]

[0.35051249363101056,
 0.3883923540305332,
 0.16915370866021967,
 0.34192403320489706]

In [14]:
# Spread of the per-fold scores for each candidate, in the same order as `classifiers`.
[fold_scores.std() for fold_scores in cv_results]

[0.02923751581925276,
 0.031138680758617115,
 2.1614624256138803e-05,
 0.035678579965796424]

In [3]:
# Standardize every feature column with StandardScaler (imported in the top
# import cell together with LabelEncoder).
# fit_transform returns a NumPy array, so X_train is no longer a DataFrame
# after this cell and its column labels are gone.
# NOTE(review): the scaler is fitted on all rows before cross-validation, so each
# validation fold contributes to the scaling statistics - confirm this is acceptable.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)

In [6]:
# Display the feature matrix as it stands after the StandardScaler cell.
X_train

array([[-0.2613606 , -0.28358421,  0.3550371 , ...,  0.57383819,
         0.70870286,  0.63336308],
       [ 0.32708497,  0.56579247,  1.73966347, ..., -1.74265154,
         0.70870286,  0.63336308],
       [-0.13059492,  0.56579247,  1.73966347, ..., -1.74265154,
         0.70870286,  0.63336308],
       ...,
       [-1.6997831 , -0.28358421,  1.73966347, ...,  0.57383819,
         0.70870286, -1.57887322],
       [-0.13059492,  2.26454583, -1.02958927, ...,  0.57383819,
        -1.41102859,  0.63336308],
       [ 0.58861633,  1.41516915, -1.02958927, ...,  0.57383819,
         0.70870286, -1.57887322]])

In [11]:
# Base estimators referenced by the VotingClassifier cell below.
# Every estimator that accepts a seed gets the shared `random_state`; previously
# only AdaBoost was seeded, so the ExtraTrees and MLP members changed between runs.
cb = CatBoostClassifier(max_depth=10, auto_class_weights='Balanced', n_estimators=10,
                        random_state=random_state)
# svc = SVC(class_weight='balanced')
knn = KNeighborsClassifier()  # takes no seed
et = ExtraTreesClassifier(max_depth=10, n_estimators=10, class_weight='balanced',
                          random_state=random_state)
# AdaBoost over depth-10, class-balanced decision trees.
ab = AdaBoostClassifier(DecisionTreeClassifier(random_state=random_state, max_depth=10,
                                               class_weight='balanced'),
                        random_state=random_state, learning_rate=0.1)
mlp = MLPClassifier(random_state=random_state)

In [32]:
# Voting ensemble over the base estimators defined in the previous cell.
# The SVC member is left out: `svc` is never assigned (its definition is
# commented out), so listing it here raised NameError on a fresh kernel.
model = VotingClassifier([('catboost', cb),
                          ('KNN', knn),
                          ('Extra Trees', et),
                          ('AdaBoost', ab),
                          ('MLP', mlp)], n_jobs=-1, verbose=True)

In [41]:
# LightGBM configured as many small trees with a low learning rate.
# Rebinds `model`, replacing whatever the name referred to before this cell.
# Uses the shared `random_state` instead of repeating the literal seed value.
model = LGBMClassifier(
    n_estimators=5000,
    learning_rate=0.01,
    max_depth=3,
    num_leaves=7,
    min_child_samples=100,
    class_weight='balanced',
    random_state=random_state,
    n_jobs=-1,
)

In [42]:
# Cross-validate the current `model` on the shared `kfold` splitter,
# scoring each fold with macro-averaged F1.
macro_f1_scorer = make_scorer(f1_score, average='macro')
cv_score = cross_val_score(
    estimator=model,
    X=X_train,
    y=Y_train,
    scoring=macro_f1_scorer,
    cv=kfold,
    n_jobs=-1,
)

In [43]:
# Display the per-fold scores returned by cross_val_score.
cv_score

array([0.37419469, 0.37808239, 0.37826384, 0.37442648, 0.37471291])

In [None]:
# Fit the current `model` on the full training features and labels.
model.fit(X=X_train, y=Y_train)

In [44]:
# Average of the per-fold scores.
np.mean(cv_score)

0.3759360618938099