# Hotel Reservation Data: Predicting Booking Cancellation

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
import pandas as pd

# Location of the reservations workbook (relative path)
file_path = 'Hotel_Reservations_Encoded.xlsx'

# Read the workbook into a DataFrame
data = pd.read_excel(io=file_path)

# Preview the first five rows as the cell output
data.head(n=5)

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,INN00001,2,0,1,2,0,0,224,2017,10,2,3,0,0,0,65.0,0,1
1,INN00002,2,0,2,3,0,0,5,2018,11,6,4,0,0,0,106.68,1,1
2,INN00003,1,0,2,1,0,0,1,2018,2,28,4,0,0,0,60.0,0,0
3,INN00004,2,0,0,2,0,0,211,2018,5,20,4,0,0,0,100.0,0,0
4,INN00005,2,0,1,1,0,0,48,2018,4,11,4,0,0,0,94.5,0,0


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Drop the 'Booking_ID' column: it is only a row identifier, not a feature.
# errors='ignore' keeps this cell idempotent -- `data` is rebound here, so without it
# a second run of the cell would raise KeyError because the column is already gone.
data = data.drop(columns=['Booking_ID'], errors='ignore')

# Count missing values per remaining column (displayed at the end of the cell)
missing_values = data.isnull().sum()

# Separate the predictors (X) from the target label (y)
X = data.drop('booking_status', axis=1)
y = data['booking_status']

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features. The scaler is fit on the training split only and then
# applied to the test split, so test-set statistics do not influence the scaling.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Cell output: missing-value counts and the shapes of the resulting splits
missing_values, X_train_scaled.shape, X_test_scaled.shape, y_train.shape, y_test.shape



(no_of_adults                            0
 no_of_children                          0
 no_of_weekend_nights                    0
 no_of_week_nights                       0
 required_car_parking_space              0
 room_type_reserved                      0
 lead_time                               0
 arrival_year                            0
 arrival_month                           0
 arrival_date                            0
 market_segment_type                     0
 repeated_guest                          0
 no_of_previous_cancellations            0
 no_of_previous_bookings_not_canceled    0
 avg_price_per_room                      0
 no_of_special_requests                  0
 booking_status                          0
 dtype: int64,
 (29020, 16),
 (7255, 16),
 (29020,),
 (7255,))

In [9]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

# Fit a 5-nearest-neighbours classifier on the standardized training features
# (fit() returns the estimator itself, so construction and training are chained)
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)

# Score the held-out (standardized) test split
y_pred_knn = knn.predict(X_test_scaled)
report_knn = classification_report(y_test, y_pred_knn)
accuracy_knn = accuracy_score(y_test, y_pred_knn)

# Cell output: overall accuracy followed by the per-class report
accuracy_knn, report_knn



(0.8512749827705031,
 '              precision    recall  f1-score   support\n\n           0       0.80      0.74      0.77      2416\n           1       0.87      0.91      0.89      4839\n\n    accuracy                           0.85      7255\n   macro avg       0.84      0.82      0.83      7255\nweighted avg       0.85      0.85      0.85      7255\n')

# KNN Model Results:

Accuracy: 85.13%


This means that the model correctly predicts whether a booking is canceled or not about 85% of the time.

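For bookings that were not canceled (class 0):

(Note: this reading assumes `booking_status` 0 = not canceled and 1 = canceled. The encoding is not shown in this notebook, and class 0 is the smaller class in the test set (2,416 vs. 4,839 bookings), so confirm the mapping against the original labels before relying on these interpretations.)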
Precision is 80%, meaning when it predicts a booking won't be canceled, it's correct 80% of the time.

Recall is 74%, meaning it correctly identifies 74% of all the bookings that weren't canceled.

For bookings that were canceled (class 1):
Precision is 87%, so when it predicts a cancellation, it's correct 87% of the time.

Recall is 91%, meaning it identifies 91% of all the actual cancellations.




In [10]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on the unscaled training features; the fixed seed keeps the fit reproducible
# (fit() returns the estimator itself, so construction and training are chained)
decision_tree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out (unscaled) test split
y_pred_dt = decision_tree.predict(X_test)

# Per-class report and overall accuracy for the tree's predictions
report_dt = classification_report(y_test, y_pred_dt)
accuracy_dt = accuracy_score(y_test, y_pred_dt)

# Cell output: overall accuracy followed by the per-class report
accuracy_dt, report_dt

(0.8687801516195727,
 '              precision    recall  f1-score   support\n\n           0       0.80      0.81      0.80      2416\n           1       0.90      0.90      0.90      4839\n\n    accuracy                           0.87      7255\n   macro avg       0.85      0.85      0.85      7255\nweighted avg       0.87      0.87      0.87      7255\n')

# Decision Tree Model Results:

Accuracy: 86.88%. This model correctly predicts whether a booking is canceled or not about 87% of the time.

For bookings that were not canceled (class 0):

Precision is 80%, and recall is 81%. It's slightly better at catching non-cancellations than the KNN model.

For bookings that were canceled (class 1):

Precision is 90%, and recall is 90%. Compared to the KNN model, precision for this class is higher (90% vs. 87%), while recall is essentially the same (90% vs. 91%).

In [11]:
#hyperparameter tuning
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

# Standardize the features again: fit on the training split, apply to both splits
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- KNN: 5-fold grid search over odd neighbour counts 1, 3, ..., 29 (scaled features) ---
knn = KNeighborsClassifier()
knn_params = {'n_neighbors': range(1, 31, 2)}
grid_knn = GridSearchCV(estimator=knn, param_grid=knn_params, scoring='accuracy', cv=5)
grid_knn.fit(X_train_scaled, y_train)

# --- Decision tree: 5-fold grid search over max_depth 1..20 (unscaled features) ---
decision_tree = DecisionTreeClassifier(random_state=42)
dt_params = {'max_depth': range(1, 21)}
grid_dt = GridSearchCV(estimator=decision_tree, param_grid=dt_params, scoring='accuracy', cv=5)
grid_dt.fit(X_train, y_train)

# Winning parameter setting and its mean cross-validated accuracy, per model
knn_best_params, knn_best_score = grid_knn.best_params_, grid_knn.best_score_
dt_best_params, dt_best_score = grid_dt.best_params_, grid_dt.best_score_

# Cell output
knn_best_params, knn_best_score, dt_best_params, dt_best_score



({'n_neighbors': 5}, 0.8484148862853205, {'max_depth': 11}, 0.8730186078566506)


# Hyperparameter Tuning
# KNN Model:
Optimal Number of Neighbors: 5

The model selects 5 bookings most similar to the one we're trying to predict. These are the 'neighbors.' The model then sees what happened with those bookings (canceled or not) and uses that information to make a prediction.

Best Cross-Validation Score: 84.84%

# Decision Tree Model:

Optimal Max Depth: 11

A Decision Tree works like a flowchart that asks a question at each step. A "max depth" of 11 means this flowchart is allowed to be at most 11 levels of questions deep. The grid search found that allowing up to 11 questions in a row gives the best cross-validated predictions.

Best Cross-Validation Score: 87.30%

Using the same cross-validation technique as KNN, the Decision Tree scores 87.30%. This means it's slightly more accurate, correctly predicting whether a booking is canceled or not about 87% of the time.

In [14]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, accuracy_score

# Apply SMOTE oversampling to the standardized training split only;
# the test split is left untouched so the evaluation stays on real data.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

# Retrain KNN with the parameters selected by the grid search (grid_knn).
# Reading them from best_params_ instead of re-typing the number keeps this cell
# in sync with the search if the data or the parameter grid ever changes.
knn_optimal = KNeighborsClassifier(**grid_knn.best_params_)
knn_optimal.fit(X_train_smote, y_train_smote)

# Retrain the Decision Tree with the parameters selected by the grid search (grid_dt).
# Note: the tree is trained on the scaled, resampled features here, so it must also
# be evaluated on the scaled test features below.
dt_optimal = DecisionTreeClassifier(random_state=42, **grid_dt.best_params_)
dt_optimal.fit(X_train_smote, y_train_smote)

# Evaluate KNN Model on test set
knn_pred = knn_optimal.predict(X_test_scaled)
knn_accuracy = accuracy_score(y_test, knn_pred)
knn_report = classification_report(y_test, knn_pred)

# Evaluate Decision Tree Model on test set
dt_pred = dt_optimal.predict(X_test_scaled)
dt_accuracy = accuracy_score(y_test, dt_pred)
dt_report = classification_report(y_test, dt_pred)

# Cell output: accuracy and per-class report for each retrained model
knn_accuracy, knn_report, dt_accuracy, dt_report




(0.8293590627153687,
 '              precision    recall  f1-score   support\n\n           0       0.71      0.82      0.76      2416\n           1       0.90      0.83      0.87      4839\n\n    accuracy                           0.83      7255\n   macro avg       0.81      0.83      0.81      7255\nweighted avg       0.84      0.83      0.83      7255\n',
 0.8657477601654032,
 '              precision    recall  f1-score   support\n\n           0       0.79      0.81      0.80      2416\n           1       0.90      0.90      0.90      4839\n\n    accuracy                           0.87      7255\n   macro avg       0.85      0.85      0.85      7255\nweighted avg       0.87      0.87      0.87      7255\n')

# **New Results (After SMOTE):**
**KNN Model:**

Accuracy: 82.94%

Classification Report: Higher recall for class 0, indicating better identification of this class compared to before, but at the cost of lower overall accuracy.

**Decision Tree Model:**

Accuracy: 86.57%

Classification Report: Similar performance to the previous result, with a slight decrease in overall accuracy but maintaining a good balance in precision and recall.

**Analysis:**

**KNN Model:** The use of SMOTE has improved the model's ability to identify the minority class (class 0), as indicated by the increased recall (0.74 → 0.82). However, this improvement comes at the cost of reduced overall accuracy (85.13% → 82.94%), lower precision for class 0 (0.80 → 0.71), and lower recall for the majority class (class 1, 0.91 → 0.83).

**Decision Tree Model:** The performance remains relatively stable even after applying SMOTE, with a slight decrease in accuracy. It indicates that the Decision Tree model was less affected by the class imbalance initially.

**Conclusion:**

- The Decision Tree model continues to outperform the KNN model in terms of overall accuracy.
- SMOTE improved the KNN model's ability to identify the minority class but reduced its overall accuracy (85.13% → 82.94%).
- The Decision Tree model's performance remained robust even after addressing class imbalance, which suggests its suitability for this particular dataset.