# Import Packages

In [1075]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from category_encoders import OrdinalEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
from sklearn.utils.class_weight import compute_class_weight

In [210]:
LABEL_COL = 'Reservation_Status'

In [211]:
TRAIN_SET = '../data/Hotel-A-train.csv'
VAL_SET = '../data/Hotel-A-validation.csv'
TEST_SET = '../data/Hotel-A-test.csv'

In [802]:
df = pd.read_csv(TRAIN_SET, index_col='Reservation-id', parse_dates=['Expected_checkin', 'Expected_checkout'])
df_val = pd.read_csv(VAL_SET, index_col='Reservation-id', parse_dates=['Expected_checkin', 'Expected_checkout'])
df_test = pd.read_csv(TEST_SET, index_col='Reservation-id', parse_dates=['Expected_checkin', 'Expected_checkout'])

# Data Preprocessing

In [803]:
df[LABEL_COL] = df[LABEL_COL].map({'Check-In':1, 'Canceled':2, 'No-Show':3}).astype(int)
df_val[LABEL_COL] = df_val[LABEL_COL].map({'Check-In':1, 'Canceled':2, 'No-Show':3}).astype(int)

## Feature Engineering

In [804]:
df['income_amount'] = df.Income.map({'<25K':25, '50K -- 100K':100, '25K --50K':50, '>100K':200})
df_val['income_amount'] = df_val.Income.map({'<25K':25, '50K -- 100K':100, '25K --50K':50, '>100K':200})
df_test['income_amount'] = df_test.Income.map({'<25K':25, '50K -- 100K':100, '25K --50K':50, '>100K':200})

In [805]:
df['num_rooms'] = np.ceil((df.Adults + df.Children)/5).astype(int)
df_val['num_rooms'] = np.ceil((df_val.Adults + df_val.Children)/5).astype(int)
df_test['num_rooms'] = np.ceil((df_test.Adults + df_test.Children)/5).astype(int)

In [806]:
df['total_cost'] = df.num_rooms*df.Room_Rate
df_val['total_cost'] = df_val.num_rooms*df_val.Room_Rate
df_test['total_cost'] = df_test.num_rooms*df_test.Room_Rate*(1 - df_test.Discount_Rate/100)

In [807]:
df['income_ratio'] = df.income_amount/df.total_cost
df_val['income_ratio'] = df_val.income_amount/df_val.total_cost
df_test['income_ratio'] = df_test.income_amount/df_test.total_cost

In [808]:
# df['checkin_month'] = df.Expected_checkin.dt.month
df['checkin_day'] = df.Expected_checkin.dt.day
df['checkin_week'] = df.Expected_checkin.dt.week
df['checkin_dayofweek'] = df.Expected_checkin.dt.dayofweek

# df_val['checkin_month'] = df_val.Expected_checkin.dt.month
df_val['checkin_day'] = df_val.Expected_checkin.dt.day
df_val['checkin_week'] = df_val.Expected_checkin.dt.week
df_val['checkin_dayofweek'] = df_val.Expected_checkin.dt.dayofweek

# df_test['checkin_month'] = df_test.Expected_checkin.dt.month
df_test['checkin_day'] = df_test.Expected_checkin.dt.day
df_test['checkin_week'] = df_test.Expected_checkin.dt.week
df_test['checkin_dayofweek'] = df_test.Expected_checkin.dt.dayofweek

## Drop columns

In [809]:
drop_cols = ['Expected_checkin', 'Expected_checkout', 'Booking_date', 'num_rooms']

In [810]:
df = df.drop(columns=drop_cols)
df_val = df_val.drop(columns=drop_cols)
df_test = df_test.drop(columns=drop_cols)

In [811]:
df_test[LABEL_COL] = 'abc'

## Categorical encoding

In [812]:
categorical_features_indices = np.where((df.dtypes != np.float) & (df.dtypes != np.int64))[0]

In [813]:
cat_cols = list(set(df.columns[categorical_features_indices].to_list()) - set([LABEL_COL]))

In [814]:
le = OrdinalEncoder(cols=cat_cols)

In [815]:
le.fit(df)

OrdinalEncoder(cols=['Required_Car_Parking', 'Hotel_Type', 'Educational_Level',
                     'Gender', 'Visted_Previously', 'Country_region',
                     'Previous_Cancellations', 'Ethnicity', 'Booking_channel',
                     'Deposit_type', 'Income', 'Meal_Type', 'Use_Promotion'],
               mapping=[{'col': 'Required_Car_Parking', 'data_type': dtype('O'),
                         'mapping': Yes    1
No     2
NaN   -2
dtype: int64},
                        {'col': 'Hotel_Type', 'd...
                        {'col': 'Deposit_type', 'data_type': dtype('O'),
                         'mapping': No Deposit        1
Refundable        2
Non-Refundable    3
NaN              -2
dtype: int64},
                        {'col': 'Income', 'data_type': dtype('O'),
                         'mapping': <25K           1
50K -- 100K    2
>100K          3
25K --50K      4
NaN           -2
dtype: int64},
                        {'col': 'Meal_Type', 'data_type': dtype('O'),
     

In [816]:
df = le.transform(df)
df_val = le.transform(df_val)
df_test = le.transform(df_test)

In [817]:
df[cat_cols] = df[cat_cols].astype('category')
df_val[cat_cols] = df_val[cat_cols].astype('category')
df_test[cat_cols] = df_test[cat_cols].astype('category')

# Create Train/Test Dataset

In [818]:
X = df.drop(columns='Reservation_Status')
y = df['Reservation_Status']

In [819]:
val_X = df_val.drop(columns='Reservation_Status')
val_y = df_val['Reservation_Status']

In [820]:
sub_X = df_test.drop(columns='Reservation_Status')

In [1057]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=10)

## Oversample using SMOTE

In [1058]:
categorical_features_indices = categorical_features_indices[:-1]

In [1059]:
categorical_features_indices

array([ 0,  2,  3,  4,  5,  6, 10, 11, 12, 13], dtype=int64)

In [1060]:
df.dtypes

Gender                    category
Age                          int64
Ethnicity                 category
Educational_Level         category
Income                    category
Country_region            category
Hotel_Type                category
Adults                       int64
Children                     int64
Babies                       int64
Meal_Type                 category
Visted_Previously         category
Previous_Cancellations    category
Deposit_type              category
Booking_channel           category
Required_Car_Parking      category
Reservation_Status           int32
Use_Promotion             category
Discount_Rate                int64
Room_Rate                    int64
income_amount                int64
total_cost                   int64
income_ratio               float64
checkin_day                  int64
checkin_week                 int64
checkin_dayofweek            int64
dtype: object

# Model Training

In [1089]:
model = LGBMClassifier(n_estimators=400, n_jobs=-1, max_depth=2, num_leaves=12
                      ,random_state=2, class_weight='balanced', min_child_samples=300, subsample=1)

In [1090]:
model.fit(train_X, train_y,categorical_feature='auto', verbose=0)

LGBMClassifier(class_weight='balanced', max_depth=2, min_child_samples=300,
               n_estimators=400, num_leaves=12, random_state=2, subsample=1)

# Model Evaluation

In [1091]:
pred_y = model.predict(test_X)
pred_y_train = model.predict(train_X)
pred_y_val = model.predict(val_X)

In [1092]:
print(classification_report(test_y, pred_y))

              precision    recall  f1-score   support

           1       0.85      0.58      0.69      4248
           2       0.21      0.33      0.26       811
           3       0.10      0.31      0.16       441

    accuracy                           0.52      5500
   macro avg       0.39      0.41      0.37      5500
weighted avg       0.69      0.52      0.58      5500



In [1093]:
print(f1_score(train_y, pred_y_train, average='macro'))

0.41828177983625453


In [1094]:
print(f1_score(test_y, pred_y, average='macro'))

0.36615657728919254


In [1095]:
print(f1_score(val_y, pred_y_val, average='macro'))

0.3427719159457206


In [1096]:
pd.Series(model.feature_importances_, index=train_X.columns).sort_values(ascending=False)

checkin_week              497
income_ratio              438
total_cost                349
checkin_day               338
Age                       316
Room_Rate                 303
checkin_dayofweek         192
Meal_Type                 147
Discount_Rate             115
Ethnicity                  92
Country_region             84
Educational_Level          80
Adults                     78
Hotel_Type                 59
Income                     54
Deposit_type               51
Babies                     45
Use_Promotion              44
Children                   41
Booking_channel            32
Previous_Cancellations     26
Gender                     21
Visted_Previously          20
Required_Car_Parking       19
income_amount               2
dtype: int32

In [1097]:
df_test

Unnamed: 0_level_0,Gender,Age,Ethnicity,Educational_Level,Income,Country_region,Hotel_Type,Adults,Children,Babies,...,Discount_Rate,Room_Rate,income_amount,total_cost,income_ratio,checkin_day,checkin_week,checkin_dayofweek,Reservation_Status,pred_y
Reservation-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
62931593,1,52,1,1,4,3,1,3,3,0,...,10,153,50,275.40,0.181554,18,46,4,abc,3
70586099,1,47,1,1,4,2,2,2,1,0,...,0,210,50,210.00,0.238095,18,46,4,abc,3
4230648,1,28,4,1,1,2,1,2,2,0,...,5,117,25,111.15,0.224921,28,17,4,abc,1
25192322,1,65,2,4,4,3,2,1,3,2,...,10,107,50,96.30,0.519211,18,46,4,abc,2
80931528,2,45,3,3,4,3,1,3,1,0,...,0,119,50,119.00,0.420168,18,46,4,abc,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39110574,2,53,1,3,2,3,3,3,3,1,...,0,153,100,306.00,0.326797,29,13,2,abc,3
5496554,1,63,2,3,2,3,3,3,3,0,...,40,233,100,279.60,0.357654,29,13,2,abc,3
59004046,1,59,2,1,1,2,2,2,3,0,...,0,242,25,242.00,0.103306,29,13,2,abc,2
65838682,2,43,3,3,4,4,3,4,2,1,...,0,208,50,416.00,0.120192,29,13,2,abc,1


In [1098]:
sub_y = model.predict(sub_X)

In [1099]:
sub_y

array([3, 3, 1, ..., 2, 1, 3])

In [1100]:
df_test['pred_y'] = sub_y

In [1101]:
sub_s = df_test.pred_y

In [1102]:
sub_s.name = 'Reservation_status'

In [1103]:
sub_s.to_frame().to_csv('../output/submission_file_12Mar_01_16_am.csv', header=True)