In [1]:
import joblib
import sqlite3
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
conn = sqlite3.connect('noshow.db')
df = pd.read_sql_query("SELECT * FROM noshow", conn)

In [3]:
df=df.drop(columns=['index'])
df =df.rename(columns={"room_President Suite": "room_President_Suite"})
df['num_children'] =  df['num_children'].astype('int64')

In [4]:
df.set_index('booking_id')

Unnamed: 0_level_0,no_show,first_time,num_adults,num_children,SGD_amount,branch_Changi,branch_Orchard,country_Australia,country_China,country_India,...,room_King,room_President_Suite,room_Queen,room_Single,platform_Agent,platform_Email,platform_Phone,platform_Website,book_duration,stay_duration
booking_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
94113,0,1,1,0,492.980000,1,0,0,0,0,...,0,0,0,1,0,0,0,1,7,2
86543,0,1,2,0,1351.220000,0,1,0,0,0,...,1,0,0,0,0,0,0,1,3,1
75928,0,1,1,0,0.000000,1,0,0,0,1,...,0,0,0,1,1,0,0,0,11,4
66947,1,1,1,0,666.040000,0,1,0,1,0,...,0,0,0,1,0,0,0,1,0,2
106390,0,1,1,0,931.504906,0,1,1,0,0,...,0,0,1,0,0,0,0,1,3,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56220,0,1,2,1,0.000000,0,1,0,1,0,...,1,0,0,0,0,0,0,1,7,1
4823,0,1,1,1,857.039953,1,0,0,1,0,...,1,0,0,0,0,0,0,1,7,2
25062,0,1,2,0,1457.785508,0,1,0,1,0,...,1,0,0,0,0,1,0,0,9,4
81936,1,1,1,2,898.045376,1,0,0,0,0,...,1,0,0,0,0,1,0,0,6,1


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97778 entries, 0 to 97777
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   booking_id            97778 non-null  object 
 1   no_show               97778 non-null  int64  
 2   first_time            97778 non-null  int64  
 3   num_adults            97778 non-null  int64  
 4   num_children          97778 non-null  int64  
 5   SGD_amount            97778 non-null  float64
 6   branch_Changi         97778 non-null  int64  
 7   branch_Orchard        97778 non-null  int64  
 8   country_Australia     97778 non-null  int64  
 9   country_China         97778 non-null  int64  
 10  country_India         97778 non-null  int64  
 11  country_Indonesia     97778 non-null  int64  
 12  country_Japan         97778 non-null  int64  
 13  country_Malaysia      97778 non-null  int64  
 14  country_Singapore     97778 non-null  int64  
 15  room_King          

<h1>Assigning x, y</h1>

In [6]:
x = df.loc[:, ['first_time', 'num_adults', 'num_children', 'SGD_amount', 
               'branch_Changi', 'branch_Orchard', 'country_Australia','country_China','country_India',
               'country_Indonesia','country_Japan','country_Malaysia','country_Singapore',
               'room_King', 'room_President_Suite', 'room_Queen', 'room_Single',
               'platform_Agent', 'platform_Email', 'platform_Phone', 'platform_Website',
               'book_duration', 'stay_duration']]
y = df.loc[:,'no_show']

In [7]:
x.shape, y.shape

((97778, 23), (97778,))

In [8]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.30, random_state=40)

<h2>Model1: XGBoost</h2>

In [9]:
xgb = XGBClassifier(booster = 'gbtree', learning_rate = 0.005,max_depth = 8, n_estimators = 180, random_state=40)
xgb.fit(x_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.005, max_delta_step=0,
              max_depth=8, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=180, n_jobs=4,
              num_parallel_tree=1, predictor='auto', random_state=40,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [10]:
y_pred_xgb = xgb.predict(x_test)

accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
cm_xgb = confusion_matrix(y_test, y_pred_xgb)
clf_report_xgb = classification_report(y_test, y_pred_xgb)
report_xgb = classification_report(y_test, y_pred_xgb,output_dict=True)
f1_xgb = report_xgb['macro avg']['f1-score']

print(f"Accuracy Score of XGBoost is : {accuracy_xgb}")
print(f"F1 Score of XGBoost is : {f1_xgb}")
print(f"\nConfusion Matrix : \n{cm_xgb}")
print(f"\nClassification Report : \n{clf_report_xgb}")

Accuracy Score of XGBoost is : 0.7355628281175428
F1 Score of XGBoost is : 0.7014084983883397

Confusion Matrix : 
[[15749  2606]
 [ 5151  5828]]

Classification Report : 
              precision    recall  f1-score   support

           0       0.75      0.86      0.80     18355
           1       0.69      0.53      0.60     10979

    accuracy                           0.74     29334
   macro avg       0.72      0.69      0.70     29334
weighted avg       0.73      0.74      0.73     29334



In [11]:
joblib.dump(xgb,"xgboost")

['xgboost']

<h2>Model 2: LGBM Classifier</h2>

In [12]:
lgbm = LGBMClassifier(learning_rate = 1)
num_leaves = [31, 51, 71, 91, 111, 200, 500]
max_depth = [-1, 50, 100, 150, 200, 300, 500]
learning_rate = [0.01, 0.05, 0.1, 0.5, 1.0]
reg_alpha = [0.0, 0.25, 0.5, 0.75, 1.0]
reg_lambda = [0.0, 0.5, 1.0]

lgbm_params = {"num_leaves" : num_leaves,
               "max_depth" : max_depth,
               "learning_rate" : learning_rate,
               "reg_alpha" : reg_alpha,
               "reg_lambda" : reg_lambda
               }

rscv_lgbm = RandomizedSearchCV(lgbm, 
                               lgbm_params, 
                               n_iter=10, 
                               scoring='accuracy', 
                               n_jobs=-1, 
                               cv=3, 
                               random_state=40, 
                               verbose=4)

In [13]:
rscv_lgbm.fit(x_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


RandomizedSearchCV(cv=3, estimator=LGBMClassifier(learning_rate=1), n_jobs=-1,
                   param_distributions={'learning_rate': [0.01, 0.05, 0.1, 0.5,
                                                          1.0],
                                        'max_depth': [-1, 50, 100, 150, 200,
                                                      300, 500],
                                        'num_leaves': [31, 51, 71, 91, 111, 200,
                                                       500],
                                        'reg_alpha': [0.0, 0.25, 0.5, 0.75,
                                                      1.0],
                                        'reg_lambda': [0.0, 0.5, 1.0]},
                   random_state=40, scoring='accuracy', verbose=4)

In [14]:
lgbm_clf_best = rscv_lgbm.best_estimator_

In [15]:
lgbm_clf_best.fit(x_train, y_train)

LGBMClassifier(learning_rate=0.05, max_depth=300, num_leaves=111,
               reg_alpha=0.75, reg_lambda=0.5)

In [16]:
lgbm= LGBMClassifier(learning_rate=0.05, max_depth=300, num_leaves=111,
               reg_alpha=0.75, reg_lambda=0.5)

In [17]:
lgbm.fit(x_train, y_train)

LGBMClassifier(learning_rate=0.05, max_depth=300, num_leaves=111,
               reg_alpha=0.75, reg_lambda=0.5)

In [18]:
y_pred_lgbm = lgbm.predict(x_test)

accuracy_lgbm = accuracy_score(y_test, y_pred_lgbm)
cm_lgbm = confusion_matrix(y_test, y_pred_lgbm)
clf_report_lgbm = classification_report(y_test, y_pred_lgbm)
report_lgbm = classification_report(y_test, y_pred_lgbm,output_dict=True)
f1_lgbm = report_lgbm['macro avg']['f1-score']


print(f"Accuracy Score of LGBM is : {accuracy_lgbm}")
print(f"F1 Score of LGBM is : {f1_lgbm}")
print(f"\nConfusion Matrix : \n{cm_lgbm}")
print(f"\nClassification Report : \n{clf_report_lgbm}")

Accuracy Score of LGBM is : 0.7372673348332992
F1 Score of LGBM is : 0.7028207647107683

Confusion Matrix : 
[[15807  2548]
 [ 5159  5820]]

Classification Report : 
              precision    recall  f1-score   support

           0       0.75      0.86      0.80     18355
           1       0.70      0.53      0.60     10979

    accuracy                           0.74     29334
   macro avg       0.72      0.70      0.70     29334
weighted avg       0.73      0.74      0.73     29334



In [19]:
joblib.dump(lgbm,"lgbm")

['lgbm']

<h2>Model 3: Decision Tree</h2>

In [20]:
dt = DecisionTreeClassifier(max_depth=6, random_state=40,criterion='entropy')
dt.fit(x_train,y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=6, random_state=40)

In [21]:
y_pred_dt = dt.predict(x_test)

accuracy_dt = accuracy_score(y_test, y_pred_dt)
cm_dt = confusion_matrix(y_test, y_pred_dt)
clf_report_dt = classification_report(y_test, y_pred_dt)
report_dt = classification_report(y_test, y_pred_dt,output_dict=True)
f1_dt = report_dt['macro avg']['f1-score']

print(f"Accuracy Score of Decision Tree is : {accuracy_dt}")
print(f"F1 Score of Decision Tree is : {f1_dt}")
print(f"\nConfusion Matrix : \n{cm_dt}")
print(f"\nClassification Report : \n{clf_report_dt}")

Accuracy Score of Decision Tree is : 0.7348810254312402
F1 Score of Decision Tree is : 0.6985063888700913

Confusion Matrix : 
[[15873  2482]
 [ 5295  5684]]

Classification Report : 
              precision    recall  f1-score   support

           0       0.75      0.86      0.80     18355
           1       0.70      0.52      0.59     10979

    accuracy                           0.73     29334
   macro avg       0.72      0.69      0.70     29334
weighted avg       0.73      0.73      0.72     29334



In [22]:
joblib.dump(dt,"decisiontree")

['decisiontree']

<h2>Method 4: Random Forest with feature selector</h2>

In [23]:
rf = RandomForestClassifier()
feature_selector = RFE(rf, 20, verbose=3)
feature_selector.fit(x, y)

Fitting estimator with 23 features.




Fitting estimator with 22 features.
Fitting estimator with 21 features.


RFE(estimator=RandomForestClassifier(), n_features_to_select=20, verbose=3)

In [24]:
important_features = x.columns[feature_selector.support_]
x_imp = x[important_features]

In [25]:
x_train, x_test, y_train, y_test = train_test_split(x_imp, y, test_size=0.3, random_state=40)
rf.fit(x_train, y_train)

RandomForestClassifier()

In [26]:
y_pred_rf = rf.predict(x_test)

accuracy_rf = accuracy_score(y_test, y_pred_rf)
cm_rf = confusion_matrix(y_test, y_pred_rf)
clf_report_rf = classification_report(y_test, y_pred_rf)
report_rf = classification_report(y_test, y_pred_rf,output_dict=True)
f1_rf = report_rf['macro avg']['f1-score']

print(f"Accuracy Score of Random Forest is : {accuracy_rf}")
print(f"F1 Score of Random Forest is : {f1_rf}")
print(f"\nConfusion Matrix : \n{cm_rf}")
print(f"\nClassification Report : \n{clf_report_rf}")

Accuracy Score of Random Forest is : 0.6780186813936047
F1 Score of Random Forest is : 0.6451719783574186

Confusion Matrix : 
[[14407  3948]
 [ 5497  5482]]

Classification Report : 
              precision    recall  f1-score   support

           0       0.72      0.78      0.75     18355
           1       0.58      0.50      0.54     10979

    accuracy                           0.68     29334
   macro avg       0.65      0.64      0.65     29334
weighted avg       0.67      0.68      0.67     29334



In [27]:
joblib.dump(rf,"randomforest")

['randomforest']

<h2>Model 5: Neural Network</h2>

In [28]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.30, random_state=40)

In [29]:
nn = MLPClassifier(solver="lbfgs", hidden_layer_sizes = (5,5))
nn.fit(x,y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


MLPClassifier(hidden_layer_sizes=(5, 5), solver='lbfgs')

In [30]:
y_pred_nn = nn.predict(x_test)




In [31]:
accuracy_nn = accuracy_score(y_test, y_pred_nn)
cm_nn = confusion_matrix(y_test, y_pred_nn)
clf_report_nn = classification_report(y_test, y_pred_nn)
report_nn = classification_report(y_test, y_pred_nn,output_dict=True)
f1_nn = report_nn['macro avg']['f1-score']


print(f"Accuracy Score of Neural Network (MLP Classfier) is : {accuracy_nn}")
print(f"F1 Score of Neural Network is : {f1_nn}")
print(f"\n\nConfusion Matrix : \n{cm_nn}")
print(f"\nClassification Report : \n{clf_report_nn}")


Accuracy Score of Neural Network (MLP Classfier) is : 0.6527919820004091
F1 Score of Neural Network is : 0.49865735903937886


Confusion Matrix : 
[[17707   648]
 [ 9537  1442]]

Classification Report : 
              precision    recall  f1-score   support

           0       0.65      0.96      0.78     18355
           1       0.69      0.13      0.22     10979

    accuracy                           0.65     29334
   macro avg       0.67      0.55      0.50     29334
weighted avg       0.66      0.65      0.57     29334



In [32]:
joblib.dump(nn,"neural")

['neural']

<h2>Conclusion</h2>

In [33]:
#Re-posting all 5 models' accuracy:
print(f"1. Accuracy Score of XGBoost is : {accuracy_xgb}")
print(f"2. Accuracy Score of LGBM is : {accuracy_lgbm}")
print(f"3. Accuracy Score of Decision Tree is : {accuracy_dt}")
print(f"4. Accuracy Score of Random Forest is : {accuracy_rf}")
print(f"5. Accuracy Score of Neural Network (MLP Classfier) is : {accuracy_nn}")

#The selected 3 models to use are 1. XGBoost 2. LGBM 3. Decision Tree

1. Accuracy Score of XGBoost is : 0.7355628281175428
2. Accuracy Score of LGBM is : 0.7372673348332992
3. Accuracy Score of Decision Tree is : 0.7348810254312402
4. Accuracy Score of Random Forest is : 0.6780186813936047
5. Accuracy Score of Neural Network (MLP Classfier) is : 0.6527919820004091


In [34]:
x.columns

Index(['first_time', 'num_adults', 'num_children', 'SGD_amount',
       'branch_Changi', 'branch_Orchard', 'country_Australia', 'country_China',
       'country_India', 'country_Indonesia', 'country_Japan',
       'country_Malaysia', 'country_Singapore', 'room_King',
       'room_President_Suite', 'room_Queen', 'room_Single', 'platform_Agent',
       'platform_Email', 'platform_Phone', 'platform_Website', 'book_duration',
       'stay_duration'],
      dtype='object')

In [37]:
models = pd.DataFrame({
    'Model' : ['XGBoost', 'LightGBM', 'Decision Tree','Random Forest', 'Neural Network'],
    'Accuracy' : [accuracy_xgb, accuracy_lgbm, accuracy_dt, accuracy_rf, accuracy_nn],
    'F1 score': [f1_xgb, f1_lgbm, f1_dt, f1_rf, f1_nn]
})


In [38]:
models.sort_values(by = 'Accuracy', ascending = False)

Unnamed: 0,Model,Accuracy,F1 score
1,LightGBM,0.737267,0.702821
0,XGBoost,0.735563,0.701408
2,Decision Tree,0.734881,0.698506
3,Random Forest,0.678019,0.645172
4,Neural Network,0.652792,0.498657


In [41]:
models.sort_values(by = 'F1 score', ascending = False)

Unnamed: 0,Model,Accuracy,F1 score
1,LightGBM,0.737267,0.702821
0,XGBoost,0.735563,0.701408
2,Decision Tree,0.734881,0.698506
3,Random Forest,0.678019,0.645172
4,Neural Network,0.652792,0.498657
