# Import packages and data

In [24]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
from sklearn.metrics import classification_report, f1_score, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold, PredefinedSplit

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, \
    AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier, Pool
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier

from sklearn.preprocessing import LabelEncoder
from category_encoders import OrdinalEncoder, OneHotEncoder, TargetEncoder

from imblearn.over_sampling import SMOTE
from sklearn.utils.class_weight import compute_class_weight
import shap
import itertools as it


# Input files: separate train / validation sets plus the set to predict for submission.
PREPROCESSED_TRAIN_FILE = '../data/preprocessed_train_only_Mar13_0700pm_label_enc.csv'
PREPROCESSED_VAL_FILE = '../data/preprocessed_val_only_Mar13_0700pm_label_enc.csv'

SUBMISSION_SET = '../data/submission_preprocessed_train_val_Mar13_0130pm_label_enc.csv'

# Seed for the row shuffle below, so that Restart & Run All reproduces the
# same training order (and therefore the same fitted models).
SHUFFLE_SEED = 2

# Every frame is indexed by the reservation id so predictions can be written
# back against it later.
df = pd.read_csv(PREPROCESSED_TRAIN_FILE, index_col='Reservation-id')
df_val = pd.read_csv(PREPROCESSED_VAL_FILE, index_col='Reservation-id')
df_sub = pd.read_csv(SUBMISSION_SET, index_col='Reservation-id')
# RandomForest
# Extra Trees
# SVC
# LightGBM

# Shuffle the training rows; seeded so the order is deterministic across runs.
df = df.sample(frac=1, random_state=SHUFFLE_SEED)
# df = df.sample(1000)
# Split features from the target column for the training and validation sets.
X_train = df.drop(columns='Reservation_Status')
Y_train = df['Reservation_Status']

X_val = df_val.drop(columns='Reservation_Status')
Y_val = df_val['Reservation_Status']

In [10]:
# Stack the training rows on top of the validation rows (train first, then val).
df_train_val = pd.concat(objs=[df, df_val], axis='index')

In [35]:
# Features / target for the combined train+validation frame.
X_train_val = df_train_val.drop('Reservation_Status', axis='columns')
Y_train_val = df_train_val.loc[:, 'Reservation_Status']

In [30]:
# Fold labels for PredefinedSplit, in the same order as df_train_val
# (df rows first, then df_val rows): -1 marks the training rows, 0 marks the
# validation rows, giving a single train/validation fold.
train_indices = [-1] * df.shape[0]
val_indices = [0] * df_val.shape[0]
split = [*train_indices, *val_indices]
pr_split = PredefinedSplit(test_fold=np.asarray(split))

# LGBM

In [None]:
# Exhaustive grid search over LightGBM hyper-parameters, scored by macro-F1
# on the predefined train/validation split (pr_split).
LGBM = LGBMClassifier()
lgbm_param_grid = dict(
    n_estimators=[800, 1200, 1600],
    learning_rate=[0.1, 0.01],
    max_depth=[4, 6, 8, 10],
    min_child_samples=[100, 200, 300],
    num_leaves=[7, 16, 24],
    class_weight=['balanced'],
)

gsLGBM = GridSearchCV(
    estimator=LGBM,
    param_grid=lgbm_param_grid,
    cv=pr_split,
    scoring=make_scorer(f1_score, average='macro'),
    n_jobs=8,
    verbose=1,
)

gsLGBM.fit(X_train_val, Y_train_val)

Fitting 1 folds for each of 216 candidates, totalling 216 fits


In [39]:
# Display the hyper-parameter combination selected by the grid search.
gsLGBM.best_params_

{'class_weight': 'balanced',
 'learning_rate': 0.1,
 'max_depth': 10,
 'min_child_samples': 200,
 'n_estimators': 800,
 'num_leaves': 16}

In [6]:
# Standalone LightGBM classifier with hand-set hyper-parameters.
# NOTE(review): the recorded grid-search result reports min_child_samples=200
# and class_weight='balanced', while this uses min_child_samples=100 and no
# class_weight — confirm the difference is intentional.
lgbm = LGBMClassifier(learning_rate=0.1, max_depth=10, min_child_samples=100, n_estimators=800, num_leaves=16, random_state=2)


In [None]:
# Lazily enumerate every hyper-parameter combination of the LightGBM grid,
# with parameters in alphabetical key order. The original referenced an
# undefined `my_dict`, which raises NameError; the grid defined in this
# notebook is `lgbm_param_grid`.
allNames = sorted(lgbm_param_grid)
combinations = it.product(*(lgbm_param_grid[Name] for Name in allNames))

In [9]:
# Print each hyper-parameter combination of the grid as one tuple per line,
# with values in the grid dict's insertion order.
for combo in it.product(*lgbm_param_grid.values()):
    print(combo)

(800, 0.1, 9, 200, 16, 'balanced')
(800, 0.1, 9, 200, 24, 'balanced')
(800, 0.1, 9, 300, 16, 'balanced')
(800, 0.1, 9, 300, 24, 'balanced')
(800, 0.1, 10, 200, 16, 'balanced')
(800, 0.1, 10, 200, 24, 'balanced')
(800, 0.1, 10, 300, 16, 'balanced')
(800, 0.1, 10, 300, 24, 'balanced')
(1000, 0.1, 9, 200, 16, 'balanced')
(1000, 0.1, 9, 200, 24, 'balanced')
(1000, 0.1, 9, 300, 16, 'balanced')
(1000, 0.1, 9, 300, 24, 'balanced')
(1000, 0.1, 10, 200, 16, 'balanced')
(1000, 0.1, 10, 200, 24, 'balanced')
(1000, 0.1, 10, 300, 16, 'balanced')
(1000, 0.1, 10, 300, 24, 'balanced')
(1200, 0.1, 9, 200, 16, 'balanced')
(1200, 0.1, 9, 200, 24, 'balanced')
(1200, 0.1, 9, 300, 16, 'balanced')
(1200, 0.1, 9, 300, 24, 'balanced')
(1200, 0.1, 10, 200, 16, 'balanced')
(1200, 0.1, 10, 200, 24, 'balanced')
(1200, 0.1, 10, 300, 16, 'balanced')
(1200, 0.1, 10, 300, 24, 'balanced')


In [6]:
# Per-row sample weights that balance the three Reservation_Status labels.
classes = [1, 2, 3]
# compute_class_weight documents `classes` as an ndarray, and recent
# scikit-learn releases validate the type and reject a plain list, so convert
# explicitly. The list is kept for the zip below so the dict keys are Python ints.
cl_weights = compute_class_weight('balanced', classes=np.array(classes), y=Y_train)
# Map each training label to its class weight; one weight per row of Y_train.
weights = Y_train.map(dict(zip(classes, cl_weights))).values

In [7]:
# Base estimators for the voting ensemble.
# `random_state` was used below without being defined anywhere in the notebook
# (NameError on a fresh kernel). Define it here, using the same seed the
# LightGBM model already hard-codes.
random_state = 2

lgbm = LGBMClassifier(learning_rate=0.1, max_depth=10, min_child_samples=100, n_estimators=800, num_leaves=16, random_state=2)
etc = ExtraTreesClassifier(max_depth=10)
rf = RandomForestClassifier(bootstrap=False, criterion='gini', max_depth=None, max_features=10, min_samples_leaf=10, min_samples_split=2, n_estimators=300)
xgb = XGBClassifier(learning_rate=0.2, max_delta_step=0, max_depth=4, min_child_weight=1, n_estimators=800)
dt = DecisionTreeClassifier(criterion='gini', max_depth=4)
cb = CatBoostClassifier(learning_rate= 0.1, max_depth= 8, min_child_samples= 100, n_estimators= 800, num_leaves= 16, grow_policy="Lossguide")
# AdaBoost over depth-5 decision trees; both the base tree and the booster share the seed.
ab = AdaBoostClassifier(DecisionTreeClassifier(random_state=random_state, max_depth=5), random_state=random_state, learning_rate=0.1, n_estimators=200)

In [8]:
# Soft-voting ensemble over all seven base estimators, using every available core.
voting_class = VotingClassifier(
    estimators=[
        ('lgbm', lgbm),
        ('etc', etc),
        ('rf', rf),
        ('dt', dt),
        ('ab', ab),
        ('cb', cb),
        ('xgb', xgb),
    ],
    voting='soft',
    n_jobs=-1,
)

In [20]:
# Fit LightGBM on the training set only, weighting each row by its class weight.
lgbm.fit(X_train, Y_train, sample_weight=weights)

LGBMClassifier(max_depth=10, min_child_samples=100, n_estimators=800,
               num_leaves=16, random_state=2)

In [21]:
# Predict labels for the validation set with the fitted LightGBM model.
pred_y = lgbm.predict(X_val)

In [22]:
# Macro-averaged F1 of the LightGBM predictions on the validation set.
f1_score(y_true=Y_val, y_pred=pred_y, average='macro')

0.31215140664370916

In [123]:
# Inspect the columns of the submission frame.
df_sub.columns

Index(['Gender', 'Age', 'Ethnicity', 'Educational_Level', 'Income',
       'Country_region', 'Hotel_Type', 'Adults', 'Children', 'Babies',
       'Meal_Type', 'Visted_Previously', 'Previous_Cancellations',
       'Deposit_type', 'Booking_channel', 'Required_Car_Parking',
       'Use_Promotion', 'Discount_Rate', 'Room_Rate', 'stay_length',
       'income_amount', 'num_rooms', 'total_cost', 'income_ratio',
       'adults_to_child', 'adults_to_babies', 'total_people', 'checkin_day',
       'checkin_week', 'checkin_dayofweek', 'checkin_month', 'checkin_year',
       'days_to_booking', 'days_to_stay_ratio', 'Reservation_Status'],
      dtype='object')

In [149]:
# Predict the submission labels with the voting ensemble.
# Restrict df_sub to the training feature columns (in training order): a later
# cell adds a 'Reservation_Status' column to df_sub, so re-running this cell
# on the whole frame would pass the target in as a feature.
# NOTE(review): no visible cell calls voting_class.fit(...) — make sure the
# ensemble is fitted before this cell runs on a fresh kernel.
sub_y = voting_class.predict(df_sub[X_train.columns])

In [150]:
# Peek at the predicted labels.
sub_y

array([2, 3, 1, ..., 2, 1, 1], dtype=int64)

In [151]:
# Attach the predictions to the submission frame (mutates df_sub in place).
df_sub['Reservation_Status'] = sub_y

In [152]:
# Display the submission frame, now including the predicted Reservation_Status.
df_sub

Unnamed: 0_level_0,Gender,Age,Ethnicity,Educational_Level,Income,Country_region,Hotel_Type,Adults,Children,Babies,...,adults_to_babies,total_people,checkin_day,checkin_week,checkin_dayofweek,checkin_month,checkin_year,days_to_booking,days_to_stay_ratio,Reservation_Status
Reservation-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
62931593,1,52,1,1,4,3,1,3,3,0,...,0.000000,6,18,46,4,11,2016,21,21.000000,2
70586099,1,47,1,1,4,2,2,2,1,0,...,0.000000,3,18,46,4,11,2016,104,104.000000,3
4230648,1,28,4,1,1,2,1,2,2,0,...,0.000000,4,28,17,4,4,2017,20,6.666667,1
25192322,1,65,2,4,4,3,2,1,3,2,...,2.000000,6,18,46,4,11,2016,182,91.000000,2
80931528,2,45,3,3,4,3,1,3,1,0,...,0.000000,4,18,46,4,11,2016,18,9.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39110574,2,53,1,3,2,3,3,3,3,1,...,0.333333,7,29,13,2,3,2017,199,199.000000,3
5496554,1,63,2,3,2,3,3,3,3,0,...,0.000000,6,29,13,2,3,2017,47,15.666667,1
59004046,1,59,2,1,1,2,2,2,3,0,...,0.000000,5,29,13,2,3,2017,215,215.000000,2
65838682,2,43,3,3,4,4,3,4,2,1,...,0.250000,7,29,13,2,3,2017,25,25.000000,1


In [155]:
# Write the predictions as a one-column CSV, keyed by the frame's index.
df_sub[['Reservation_Status']].to_csv('../output/submission_csv_voting_with_xgb_13Mar_0634pm.csv')