# Import packages and data

In [132]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
from sklearn.metrics import classification_report, f1_score, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, \
    AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier, Pool
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier

from sklearn.preprocessing import LabelEncoder
from category_encoders import OrdinalEncoder, OneHotEncoder, TargetEncoder

from imblearn.over_sampling import SMOTE
from sklearn.utils.class_weight import compute_class_weight
import shap

PREPROCESSED_FILE = '../data/preprocessed_train_val_Mar13_0130pm_label_enc.csv'
SUBMISSION_SET = '../data/submission_preprocessed_train_val_Mar13_0130pm_label_enc.csv'

df = pd.read_csv(PREPROCESSED_FILE, index_col='Reservation-id')
df_sub = pd.read_csv(SUBMISSION_SET, index_col='Reservation-id')
# RandomForest
# Extra Trees
# SVC
# LightGBM

df = df.sample(frac=1)
# df = df.sample(1000)
X_train = df.drop(columns='Reservation_Status')
Y_train = df['Reservation_Status']

In [133]:
kfold = StratifiedKFold(n_splits=5, shuffle=True)
random_state = 2

# Voting Classifier

In [134]:
classes=[1,2,3]
cl_weights = compute_class_weight('balanced', classes=classes, y=Y_train)
weights = Y_train.map(dict(zip(classes, cl_weights))).values

In [78]:
lgbm = LGBMClassifier(learning_rate=0.1, max_depth=10, min_child_samples=100, n_estimators=800, num_leaves=16, random_state=2, class_weight='balanced')
etc = ExtraTreesClassifier(max_depth=10, class_weight='balanced')
rf = RandomForestClassifier(bootstrap=False, criterion='gini', max_depth=None, max_features=10, min_samples_leaf=10, min_samples_split=2, n_estimators=300, class_weight='balanced')
xgb = XGBClassifier(learning_rate=0.2, max_delta_step=0, max_depth=4, min_child_weight=1, n_estimators=800)
dt = DecisionTreeClassifier(criterion='gini', max_depth=4, class_weight='balanced')
cb = CatBoostClassifier(learning_rate= 0.1, max_depth= 8, min_child_samples= 100, n_estimators= 800, num_leaves= 16, grow_policy="Lossguide", auto_class_weights='Balanced')
ab = AdaBoostClassifier(DecisionTreeClassifier(random_state=random_state, max_depth=5, class_weight='balanced'), random_state=random_state, learning_rate=0.1, n_estimators=200)

In [79]:
voting_class = VotingClassifier([('lgbm', lgbm), ('etc', etc), ('rf', rf), ('dt', dt), ('ab', ab), ('cb', cb)], n_jobs=-1, voting='soft')

In [80]:
cv_res = cross_val_score(voting_class,
              X_train, y=Y_train, 
              scoring = make_scorer(f1_score, average='macro'), 
              cv = kfold, n_jobs=4)

In [81]:
cv_res

array([0.38839767, 0.38952655, 0.40192455, 0.39599607, 0.39886488])

In [82]:
print(cv_res.mean())

0.3949419447500224


In [77]:
print(cv_res.min())

0.38503242074145105


In [65]:
cv_res

array([0.39766923, 0.39944238, 0.40903132, 0.39069876, 0.39487234])

# Voting Classifier with XGB

In [136]:
classes=[1,2,3]
cl_weights = compute_class_weight('balanced', classes=classes, y=Y_train)
weights = Y_train.map(dict(zip(classes, cl_weights))).values

In [145]:
lgbm = LGBMClassifier(learning_rate=0.1, max_depth=10, min_child_samples=100, n_estimators=800, num_leaves=16, random_state=2)
etc = ExtraTreesClassifier(max_depth=10)
rf = RandomForestClassifier(bootstrap=False, criterion='gini', max_depth=None, max_features=10, min_samples_leaf=10, min_samples_split=2, n_estimators=300)
xgb = XGBClassifier(learning_rate=0.2, max_delta_step=0, max_depth=4, min_child_weight=1, n_estimators=800)
dt = DecisionTreeClassifier(criterion='gini', max_depth=4)
cb = CatBoostClassifier(learning_rate= 0.1, max_depth= 8, min_child_samples= 100, n_estimators= 800, num_leaves= 16, grow_policy="Lossguide")
ab = AdaBoostClassifier(DecisionTreeClassifier(random_state=random_state, max_depth=5), random_state=random_state, learning_rate=0.1, n_estimators=200)

In [146]:
voting_class = VotingClassifier([('lgbm', lgbm), ('etc', etc), ('rf', rf), ('dt', dt), ('ab', ab), ('cb', cb), ('xgb', xgb)], n_jobs=-1, voting='soft')

In [101]:
cv_res = cross_val_score(voting_class,
              X_train, y=Y_train, 
              scoring = make_scorer(f1_score, average='macro'), 
              cv = kfold, n_jobs=4, fit_params={'sample_weight':weights})

In [139]:
weights

array([0.44125456, 0.44125456, 0.44125456, ..., 0.44125456, 0.44125456,
       0.44125456])

In [140]:
voting_class.fit(X_train, Y_train, sample_weight=weights)

VotingClassifier(estimators=[('lgbm',
                              LGBMClassifier(max_depth=10,
                                             min_child_samples=100,
                                             n_estimators=800, num_leaves=16,
                                             random_state=2)),
                             ('etc', ExtraTreesClassifier(max_depth=10)),
                             ('rf',
                              RandomForestClassifier(bootstrap=False,
                                                     max_features=10,
                                                     min_samples_leaf=10,
                                                     n_estimators=300)),
                             ('dt', DecisionTreeClassifier(max_depth=4)),
                             ('ab',
                              AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=5,
                                                                                       

In [102]:
print(cv_res)

[0.39384897 0.39478015 0.3984265  0.4007296  0.39807413]


In [103]:
print(cv_res.mean())

0.39717186950121397


In [123]:
df_sub.columns

Index(['Gender', 'Age', 'Ethnicity', 'Educational_Level', 'Income',
       'Country_region', 'Hotel_Type', 'Adults', 'Children', 'Babies',
       'Meal_Type', 'Visted_Previously', 'Previous_Cancellations',
       'Deposit_type', 'Booking_channel', 'Required_Car_Parking',
       'Use_Promotion', 'Discount_Rate', 'Room_Rate', 'stay_length',
       'income_amount', 'num_rooms', 'total_cost', 'income_ratio',
       'adults_to_child', 'adults_to_babies', 'total_people', 'checkin_day',
       'checkin_week', 'checkin_dayofweek', 'checkin_month', 'checkin_year',
       'days_to_booking', 'days_to_stay_ratio', 'Reservation_Status'],
      dtype='object')

In [141]:
sub_y = voting_class.predict(df_sub)

In [142]:
sub_y

array([1, 3, 1, ..., 1, 1, 2], dtype=int64)

In [143]:
df_sub['Reservation_Status'] = sub_y

In [127]:
df_sub

Unnamed: 0_level_0,Gender,Age,Ethnicity,Educational_Level,Income,Country_region,Hotel_Type,Adults,Children,Babies,...,adults_to_babies,total_people,checkin_day,checkin_week,checkin_dayofweek,checkin_month,checkin_year,days_to_booking,days_to_stay_ratio,Reservation_Status
Reservation-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
62931593,1,52,1,1,4,3,1,3,3,0,...,0.000000,6,18,46,4,11,2016,21,21.000000,1
70586099,1,47,1,1,4,2,2,2,1,0,...,0.000000,3,18,46,4,11,2016,104,104.000000,1
4230648,1,28,4,1,1,2,1,2,2,0,...,0.000000,4,28,17,4,4,2017,20,6.666667,1
25192322,1,65,2,4,4,3,2,1,3,2,...,2.000000,6,18,46,4,11,2016,182,91.000000,1
80931528,2,45,3,3,4,3,1,3,1,0,...,0.000000,4,18,46,4,11,2016,18,9.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39110574,2,53,1,3,2,3,3,3,3,1,...,0.333333,7,29,13,2,3,2017,199,199.000000,1
5496554,1,63,2,3,2,3,3,3,3,0,...,0.000000,6,29,13,2,3,2017,47,15.666667,1
59004046,1,59,2,1,1,2,2,2,3,0,...,0.000000,5,29,13,2,3,2017,215,215.000000,1
65838682,2,43,3,3,4,4,3,4,2,1,...,0.250000,7,29,13,2,3,2017,25,25.000000,1


In [144]:
df_sub.Reservation_Status.value_counts()#.to_csv('../output/submission_csv_13Mar_0634pm.csv')SUBMISSION_SET

1    3822
3     326
2     170
Name: Reservation_Status, dtype: int64