# XGBoost Model

XGBoost --> Extreme Boost Gradient - an implementation of gradient boosted trees designed for speed and accuracy

Gradient boosting is a supervised machine learning algorithm, which tries to predict a target variable by combining the estimates of a set of simpler, weaker models. It only works with numerical values, thus all categorical variables have to be encoded.

In boosting, the trees are built in a *sequential manner* such that each subsequent tree aims to *reduce the errors of the previous tree*. The *misclassified labels are given higher weights*. Each tree learns from its predecessors and tries to reduce the residual errors. So, the tree next in sequence will learn from the previous tree residuals.

In [1]:
# !pip install xgboost
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, precision_score, recall_score

# Data Cleaning / Pre-processing

In [2]:
# Import dataset
data = 'hotelbooking_cleaned.csv'
df = pd.read_csv(data)
df

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,agent_encoded,company_encoded
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,,0,Transient,0.00,0,0,Check-Out,2015-07-01,1,1
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,,0,Transient,0.00,0,0,Check-Out,2015-07-01,1,1
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,,0,Transient,75.00,0,0,Check-Out,2015-07-02,1,1
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,,0,Transient,75.00,0,0,Check-Out,2015-07-02,0,1
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,,0,Transient,98.00,0,1,Check-Out,2015-07-03,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118897,City Hotel,0,23,2017,August,35,30,2,5,2,...,,0,Transient,96.14,0,0,Check-Out,2017-09-06,0,1
118898,City Hotel,0,102,2017,August,35,31,2,5,3,...,,0,Transient,225.43,0,2,Check-Out,2017-09-07,0,1
118899,City Hotel,0,34,2017,August,35,31,2,5,2,...,,0,Transient,157.71,0,4,Check-Out,2017-09-07,0,1
118900,City Hotel,0,109,2017,August,35,31,2,5,2,...,,0,Transient,104.40,0,0,Check-Out,2017-09-07,0,1


In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
# Perform one-hot encoding
transformed_df = df[['lead_time', 'agent_encoded', 'company_encoded']].copy()
transformed_df['arrival_date_week_number'] = df['arrival_date_week_number']
attributes_to_encode = ['country', 'deposit_type', 'market_segment', 'assigned_room_type', 'distribution_channel', 'customer_type']
for attribute in attributes_to_encode:
    onehot_encoder = OneHotEncoder(sparse=False)
    onehot_encoded = onehot_encoder.fit_transform(df[[attribute]])
    onehot_encoded_df = pd.DataFrame(onehot_encoded, columns=onehot_encoder.get_feature_names_out([attribute]))

    # Concatenate the one-hot encoded features with the original dataframe
    transformed_df = pd.concat([transformed_df, onehot_encoded_df], axis=1)
    print(transformed_df.shape)

scaler = StandardScaler()
scaled_data = scaler.fit_transform(transformed_df[['lead_time', 'arrival_date_week_number']])
transformed_df[['lead_time', 'arrival_date_week_number']] = scaled_data

(118902, 181)
(118902, 184)
(118902, 192)
(118902, 204)
(118902, 209)
(118902, 213)


# Split data into separate training & test set

In [4]:
X = transformed_df
y = df['is_canceled']
print(X.shape, y.shape)

# Split your data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

(118902, 213) (118902,)


# Train the XGBoost Classifier Model

This is the base XGBoost Classifier Model

In [5]:
# Base Model without hyperparameter tuning
import xgboost as xgb

xgb_model = xgb.XGBClassifier()

xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)

clf_report = classification_report(y_test, y_pred)

# Calculate performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the performance metrics
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f"Classification Report : \n{clf_report}")

Accuracy: 0.8112752656219337
Precision: 0.8094746716697936
Recall: 0.6473368342085521
F1 Score: 0.7193830762817841
Classification Report : 
              precision    recall  f1-score   support

           0       0.81      0.91      0.86     22341
           1       0.81      0.65      0.72     13330

    accuracy                           0.81     35671
   macro avg       0.81      0.78      0.79     35671
weighted avg       0.81      0.81      0.81     35671



# Hyperparameter Tuning with RandomizedSearchCV

1. Selecting the hyperparameters that should be tuned to obtain better predictive results
2. Perform RandomizedSearchCV to obtain the best hyperparameter values 
3. Fit the hyperparameters identified above to achieve performance metrics results

In [11]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV, KFold
from scipy import stats

xgb_model = xgb.XGBClassifier()

params = {
            'objective':['binary:logistic'],
            'max_depth': stats.randint(3, 10),
            'learning_rate': stats.uniform(0.01,0.5),
            'subsample': stats.uniform(0.01,0.8),
            'n_estimators': stats.randint(100,500)
         }  


random_search=RandomizedSearchCV(xgb_model,param_distributions=params,
                            n_iter=40,scoring='precision',cv=10,
                            random_state=0)

random_search.fit(X, y)

print("Best parameters found: ")
print(random_search.best_params_)
print("---------------------------")

best_model = random_search.best_estimator_

y_pred_best = best_model.predict(X_test)

# Calculate performance metrics
accuracy = accuracy_score(y_test, y_pred_best)
precision = precision_score(y_test, y_pred_best)
recall = recall_score(y_test, y_pred_best)
f1 = f1_score(y_test, y_pred_best)

# Print the performance metrics
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print("---------------------------")
print(f"Classification Report : \n{clf_report}")

Best parameters found: 
{'learning_rate': 0.22664403099102204, 'max_depth': 3, 'n_estimators': 119, 'objective': 'binary:logistic', 'subsample': 0.32687862033868986}
---------------------------
Accuracy: 0.8024052815272696
Precision: 0.8104767510300177
Recall: 0.6175580221997982
F1 Score: 0.7009863188036908
---------------------------
Classification Report : 
              precision    recall  f1-score   support

           0       0.81      0.91      0.86     14862
           1       0.81      0.65      0.72      8919

    accuracy                           0.81     23781
   macro avg       0.81      0.78      0.79     23781
weighted avg       0.81      0.81      0.81     23781

