# Import Packages

In [41]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from category_encoders import OrdinalEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [59]:
LABEL_COL = 'Reservation_Status'

In [2]:
TRAIN_SET = '../data/Hotel-A-train.csv'
VAL_SET = '../data/Hotel-A-validation.csv'

In [180]:
df = pd.read_csv(TRAIN_SET, index_col='Reservation-id', parse_dates=['Expected_checkin', 'Expected_checkout'])

In [181]:
df_val = pd.read_csv(VAL_SET, index_col='Reservation-id', parse_dates=['Expected_checkin', 'Expected_checkout'])

# Data Preprocessing

## Drop columns

In [182]:
df = df.drop(columns=['Expected_checkin', 'Expected_checkout', 'Booking_date'])
df_val = df_val.drop(columns=['Expected_checkin', 'Expected_checkout', 'Booking_date'])

## Categorical encoding

In [183]:
categorical_features_indices = np.where((df.dtypes != np.float) & (df.dtypes != np.int64))[0]

In [184]:
cat_cols = list(set(df.columns[categorical_features_indices].to_list()) - set([LABEL_COL]))

In [185]:
le = OneHotEncoder(cols=cat_cols)

In [186]:
le.fit(df)

OneHotEncoder(cols=['Educational_Level', 'Gender', 'Income', 'Hotel_Type',
                    'Meal_Type', 'Use_Promotion', 'Previous_Cancellations',
                    'Country_region', 'Ethnicity', 'Required_Car_Parking',
                    'Visted_Previously', 'Deposit_type', 'Booking_channel'],
              drop_invariant=False, handle_missing='value',
              handle_unknown='value', return_df=True, use_cat_names=False,
              verbose=0)

In [187]:
df = le.transform(df)
df_val = le.transform(df_val)

# Create Train/Test Dataset

In [188]:
X = df.drop(columns='Reservation_Status')
y = df['Reservation_Status']

In [189]:
val_X = df_val.drop(columns='Reservation_Status')
val_y = df_val['Reservation_Status']

In [190]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

In [191]:
categorical_features_indices

array([ 0,  2,  3,  4,  5,  6, 10, 11, 12, 13, 14, 15, 16, 17],
      dtype=int64)

In [192]:
df.dtypes

Gender_1                     int64
Gender_2                     int64
Age                          int64
Ethnicity_1                  int64
Ethnicity_2                  int64
Ethnicity_3                  int64
Ethnicity_4                  int64
Educational_Level_1          int64
Educational_Level_2          int64
Educational_Level_3          int64
Educational_Level_4          int64
Income_1                     int64
Income_2                     int64
Income_3                     int64
Income_4                     int64
Country_region_1             int64
Country_region_2             int64
Country_region_3             int64
Country_region_4             int64
Hotel_Type_1                 int64
Hotel_Type_2                 int64
Hotel_Type_3                 int64
Adults                       int64
Children                     int64
Babies                       int64
Meal_Type_1                  int64
Meal_Type_2                  int64
Meal_Type_3                  int64
Visted_Previously_1 

# Model Training

In [201]:
model = LGBMClassifier(n_estimators=5000, n_jobs=-1, max_depth=4, class_weight='balanced', num_leaves=7)

In [202]:
model.fit(train_X, train_y)

LGBMClassifier(boosting_type='gbdt', class_weight='balanced',
               colsample_bytree=1.0, importance_type='split', learning_rate=0.1,
               max_depth=4, min_child_samples=20, min_child_weight=0.001,
               min_split_gain=0.0, n_estimators=5000, n_jobs=-1, num_leaves=7,
               objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

# Model Evaluation

In [203]:
pred_y = model.predict(test_X)
pred_y_train = model.predict(train_X)
pred_y_val = model.predict(val_X)

In [204]:
print(classification_report(test_y, pred_y))

              precision    recall  f1-score   support

    Canceled       0.19      0.28      0.22       810
    Check-In       0.80      0.69      0.74      4267
     No-Show       0.10      0.14      0.11       423

    accuracy                           0.59      5500
   macro avg       0.36      0.37      0.36      5500
weighted avg       0.66      0.59      0.62      5500



In [205]:
print(f1_score(train_y, pred_y_train, average='macro'))

0.8061936893692124


In [206]:
print(f1_score(test_y, pred_y, average='macro'))

0.36039211468571564


In [207]:
print(f1_score(val_y, pred_y_val, average='macro'))

0.3396155976802225


In [200]:
pd.Series(model.feature_importances_, index=train_X.columns).sort_values(ascending=False)

Room_Rate                   4290
Age                         3095
Discount_Rate               1030
Adults                       931
Children                     623
Babies                       359
Educational_Level_1          292
Required_Car_Parking_1       290
Booking_channel_1            275
Hotel_Type_2                 274
Meal_Type_2                  265
Income_4                     264
Previous_Cancellations_1     262
Country_region_3             257
Educational_Level_2          252
Use_Promotion_1              251
Income_1                     250
Ethnicity_2                  237
Educational_Level_3          235
Educational_Level_4          235
Hotel_Type_1                 234
Gender_1                     233
Meal_Type_1                  230
Meal_Type_3                  227
Deposit_type_3               225
Deposit_type_2               225
Visted_Previously_1          223
Ethnicity_3                  217
Ethnicity_4                  212
Country_region_4             210
Ethnicity_