
**PROJECT 2:** DEVELOPMENT OF PREDICTIVE MODEL ON HOTEL BOOKING DEMAND DATASET

**TITLE:** COMPARATIVE STUDY ON HOTEL BOOKING DEMAND DATASET CLASSIFICATION MODELS

**Subject/Class :** TTTC3283 DATA MINING

**Google Colab :** https://colab.research.google.com/drive/1082CdlRFq0zmGOS2GyWf9z6zCKnCXPfG?usp=sharing

# **Import Data**

In [None]:
#To import necessary modules and libraries
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from pandas import DataFrame, Series
from IPython.display import Image 
from io import StringIO
import pydotplus
from sklearn import preprocessing
from sklearn import tree
%matplotlib inline

#To import modules and libraries to split dataset
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree #for Decision Tree
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB

#To import modules for Neural Network
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler

#To import module and libraries for evaluation metrics
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import r2_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [None]:
# Load the pre-cleaned hotel bookings dataset
df = pd.read_csv('hotel_bookings_cleaned.csv', engine='python')
# Only the last expression of a cell is displayed, so a bare `df.shape`
# in the middle of the cell is silently discarded; print it instead.
print('df.shape :', df.shape)
df.head(5)

In [None]:
#To check the datatype of each attribute
df.dtypes

# **Decision Tree**

In [None]:
# Reload the dataset so the Decision Tree section starts from the raw file
df = pd.read_csv('hotel_bookings_cleaned.csv', engine='python')
# Only the last expression of a cell is displayed, so a bare `df.shape`
# in the middle of the cell is silently discarded; print it instead.
print('df.shape :', df.shape)
# Preview a few rows rather than rendering the whole frame
df.head(5)

In [None]:
#To drop any null values
df = df.dropna()

In [None]:
#To split dataset into training, validation and testing datasets
# Step 1: hold out 20% of the rows as the test set.
train_val_df, test_df = train_test_split(df, test_size=0.2, random_state=0)
# Step 2: take 25% of the remaining 80% as validation, which gives roughly a
# 60/20/20 train/validation/test split. random_state=0 keeps it repeatable.
train_df, val_df = train_test_split(train_val_df, test_size=0.25, random_state=0)

In [None]:
# Report the (rows, columns) shape of each of the three splits
for shape_label, split_frame in (
    ('train_df.shape :', train_df),
    ('val_df.shape :', val_df),
    ('test_df.shape :', test_df),
):
    print(shape_label, split_frame.shape)

In [None]:
# Feature columns fed to the models (x) and the column to predict (y)
input_cols = [
    'hotel', 'lead_time',
    'arrival_date_week_number', 'arrival_date_day_of_month',
    'stays_in_weekend_nights', 'stays_in_week_nights',
    'adults', 'kids', 'meal',
    'market_segment', 'distribution_channel',
    'is_repeated_guest',
    'previous_cancellations', 'previous_bookings_not_canceled',
    'reserved_room_type', 'deposit_type', 'customer_type',
    'required_car_parking_spaces', 'total_of_special_requests',
    'year', 'month', 'day',
]

# Single target column name (a string, so indexing with it yields a Series)
target_cols = 'is_canceled'

In [None]:
# Slice every split into its feature frame and its target series.
# .copy() detaches the slices so later edits do not touch the split frames.
train_inputs, train_targets = train_df[input_cols].copy(), train_df[target_cols].copy()
val_inputs, val_targets = val_df[input_cols].copy(), val_df[target_cols].copy()
test_inputs, test_targets = test_df[input_cols].copy(), test_df[target_cols].copy()

In [None]:
X_train = train_inputs
X_val = val_inputs
X_test = test_inputs

In [None]:
Y_test = test_targets

In [None]:
# Create the model
# random_state=0 seeds the estimator's internal randomness so that reruns
# build the same tree. No depth limit is set here.
model = DecisionTreeClassifier(random_state=0)

# Fit the model to the training data
# (fit returns the estimator, so the cell also displays its repr)
model.fit(X_train, train_targets)

In [None]:
Y_Pred = model.predict(X_test)

In [None]:
#To get the maximum depth of Decision Tree
model.tree_.max_depth

In [None]:
#To plot the Decision Tree
plt.figure(figsize=(80,20))
# feature_names is passed as a plain list: some scikit-learn releases validate
# this argument as a list and reject a pandas Index.
plot_tree(model, feature_names=list(X_train.columns), max_depth=35, filled=True);

In [None]:
# Same tree, drawn only down to depth 10
plt.figure(figsize=(80,20))
# feature_names is passed as a plain list: some scikit-learn releases validate
# this argument as a list and reject a pandas Index.
plot_tree(model, feature_names=list(X_train.columns), max_depth=10, filled=True);

In [None]:
# Same tree, drawn only down to depth 5
plt.figure(figsize=(80,20))
# feature_names is passed as a plain list: some scikit-learn releases validate
# this argument as a list and reject a pandas Index.
plot_tree(model, feature_names=list(X_train.columns), max_depth=5, filled=True);

In [None]:
# Same tree, drawn only down to depth 3
plt.figure(figsize=(80,20))
# feature_names is passed as a plain list: some scikit-learn releases validate
# this argument as a list and reject a pandas Index.
plot_tree(model, feature_names=list(X_train.columns), max_depth=3, filled=True);

In [None]:
# Same tree, drawn only down to depth 2
plt.figure(figsize=(80,20))
# feature_names is passed as a plain list: some scikit-learn releases validate
# this argument as a list and reject a pandas Index.
plot_tree(model, feature_names=list(X_train.columns), max_depth=2, filled=True);

In [None]:
#To import module and library
from sklearn.metrics import accuracy_score, confusion_matrix

#To plot the confusion matrix based on prediction
def predict_and_plot(inputs, targets, name=''):
    """Predict with the notebook-level ``model``, print accuracy and plot a confusion matrix.

    ``model`` is looked up when the function is called, so it uses whatever
    estimator that name is bound to at call time.

    Args:
        inputs: Feature matrix passed to ``model.predict``.
        targets: True labels aligned with ``inputs``.
        name: Prefix for the plot title, e.g. ``'Train'``.

    Returns:
        The predictions returned by ``model.predict(inputs)``.
    """
    predictions = model.predict(inputs)
    print("Accuracy: {:.2f}%".format(accuracy_score(targets, predictions) * 100))

    # normalize='true' scales each row (true class) so its cells sum to 1
    row_normalised_cm = confusion_matrix(targets, predictions, normalize='true')
    plt.figure(figsize=(8, 6))
    sns.heatmap(row_normalised_cm, annot=True)
    plt.title('{} Confusion Matrix'.format(name))
    plt.xlabel('Prediction')
    plt.ylabel('Target')

    return predictions

In [None]:
#To obtain accuracy results
accuracy = accuracy_score(Y_test,Y_Pred)
print("Accuracy: ", accuracy)

In [None]:
train_preds = predict_and_plot(X_train, train_targets, 'Train')
val_preds = predict_and_plot(X_val, val_targets, 'Validation')
test_preds = predict_and_plot(X_test, test_targets, 'Test')

**Classification Report**

In [None]:
from sklearn.metrics import classification_report
# classification_report expects (y_true, y_pred). With the arguments reversed
# the precision and recall columns are swapped and support counts predictions.
print(classification_report(Y_test, Y_Pred))

**ROC Curve**

In [None]:
#To obtain AUC score and fit data to model to plot the graph
Y_Score1 = model.predict_proba(X_test)[:,1] 
FP_rate1, TP_rate1, threshold1 = roc_curve(Y_test, Y_Score1)
print('roc_auc_score for DecisionTree: ', roc_auc_score(Y_test, Y_Score1))

In [None]:
# Draw the decision tree's ROC curve together with reference lines
roc_fig, roc_ax = plt.subplots(1, figsize=(10,10))
roc_ax.set_title('Receiver Operating Characteristic - DecisionTree')
roc_ax.plot(FP_rate1, TP_rate1)        # the model's ROC curve
roc_ax.plot([0, 1], ls="--")           # chance diagonal from (0, 0) to (1, 1)
roc_ax.plot([0, 0], [1, 0], c=".7")    # left edge of the unit square (FPR = 0)
roc_ax.plot([1, 1], c=".7")            # top edge of the unit square (TPR = 1)
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_xlabel('False Positive Rate')
plt.show()

**Mean Squared Error**

In [None]:
from sklearn import metrics
from sklearn.metrics import mean_squared_error
print('Mean Squared Error:', metrics.mean_squared_error(Y_test, Y_Pred))  

**Mean Absolute Error**

In [None]:
from sklearn.metrics import mean_absolute_error
print('Mean Absolute Error:', metrics.mean_absolute_error(Y_test, Y_Pred))  

**R-Squared**

In [None]:
from sklearn.metrics import r2_score
print('r2_score:', r2_score(Y_test, Y_Pred))

**Logarithmic Loss**

In [None]:
from sklearn.metrics import log_loss
# log_loss scores predicted probabilities. Hard 0/1 labels would be clipped to
# near-0/near-1, turning the metric into a rescaled error rate.
print('log_loss', log_loss(Y_test, model.predict_proba(X_test)))

**To test if there is Overfitting or Underfitting of data.**

In [None]:
# Compare train vs. test accuracy; a large gap indicates overfitting.
# Y_train is not defined until the Naive Bayes section, so on a fresh kernel
# this cell raised NameError. train_targets holds the same training labels.
print('Training set score: {:.4f}'.format(model.score(X_train, train_targets)))
print('Test set score: {:.4f}'.format(model.score(X_test, Y_test)))

# **Naive Bayes**

In [None]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
from IPython.display import Image 
from io import StringIO
import pydotplus
from sklearn import preprocessing
from sklearn import tree
%matplotlib inline

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# Reload the dataset so the Naive Bayes section starts from the raw file
df = pd.read_csv('hotel_bookings_cleaned.csv', engine='python')
# Only the last expression of a cell is displayed, so a bare `df.shape`
# in the middle of the cell is silently discarded; print it instead.
print('df.shape :', df.shape)
df.head(5)

In [None]:
df = df.dropna()

In [None]:
#To split dataset into training, validation and testing datasets
# Step 1: hold out 20% of the rows as the test set.
train_val_df, test_df = train_test_split(df, test_size=0.2, random_state=0)
# Step 2: take 25% of the remaining 80% as validation, which gives roughly a
# 60/20/20 train/validation/test split. random_state=0 keeps it repeatable.
train_df, val_df = train_test_split(train_val_df, test_size=0.25, random_state=0)

In [None]:
# Feature columns (X) and target column (Y) for the Naive Bayes models
input_cols = [
    'hotel', 'lead_time',
    'arrival_date_week_number', 'arrival_date_day_of_month',
    'stays_in_weekend_nights', 'stays_in_week_nights',
    'adults', 'kids', 'meal',
    'market_segment', 'distribution_channel',
    'is_repeated_guest',
    'previous_cancellations', 'previous_bookings_not_canceled',
    'reserved_room_type', 'deposit_type', 'customer_type',
    'required_car_parking_spaces', 'total_of_special_requests',
    'year', 'month', 'day',
]

target_cols = 'is_canceled'

# Slice every split into its feature frame and its target series.
# .copy() detaches the slices from the split frames.
train_inputs, train_targets = train_df[input_cols].copy(), train_df[target_cols].copy()
val_inputs, val_targets = val_df[input_cols].copy(), val_df[target_cols].copy()
test_inputs, test_targets = test_df[input_cols].copy(), test_df[target_cols].copy()

In [None]:
X_train = train_inputs
X_val = val_inputs
X_test = test_inputs

Y_test = test_targets
Y_val = val_targets
Y_train = train_targets

In [None]:
#To fit the data to the model
# Numbering used for the rest of this section:
#   model3 = MultinomialNB, model4 = GaussianNB, model5 = BernoulliNB
# NOTE(review): presumably every input column is already numeric (and
# non-negative for MultinomialNB) in the cleaned CSV — TODO confirm.
model3 = MultinomialNB().fit(X_train, Y_train)
model4 = GaussianNB().fit(X_train, Y_train)
model5 = BernoulliNB().fit(X_train, Y_train)

In [None]:
#To predict
Y_pred3 = model3.predict(X_test)
Y_pred4 = model4.predict(X_test)
Y_pred5 = model5.predict(X_test)

In [None]:
# Report test accuracy for each Naive Bayes variant.
# `accuracy` is rebound on every pass, so it ends up holding the Bernoulli score.
for report_template, nb_predictions in (
    ('Accuracy of Multinomial Naive Bayes: {:.5f}', Y_pred3),
    ('Accuracy of Gaussian Naive Bayes: {:.5f}', Y_pred4),
    ('Accuracy of Bernoulli Naive Bayes: {:.5f}', Y_pred5),
):
    accuracy = accuracy_score(nb_predictions, Y_test)
    print(report_template.format(accuracy))

### **MultinomialNB**

In [None]:
#To plot the confusion matrix based on model's prediction
def predict_and_plot(inputs, targets, name=''):
    """Predict with ``model3`` (MultinomialNB), print accuracy and plot a confusion matrix.

    Args:
        inputs: Feature matrix passed to ``model3.predict``.
        targets: True labels aligned with ``inputs``.
        name: Prefix for the plot title, e.g. ``'Train'``.

    Returns:
        The predictions returned by ``model3.predict(inputs)``.
    """
    predictions = model3.predict(inputs)
    print("Accuracy: {:.2f}%".format(accuracy_score(targets, predictions) * 100))

    # normalize='true' scales each row (true class) so its cells sum to 1
    row_normalised_cm = confusion_matrix(targets, predictions, normalize='true')
    plt.figure(figsize=(8, 6))
    sns.heatmap(row_normalised_cm, annot=True)
    plt.title('{} Confusion Matrix'.format(name))
    plt.xlabel('Prediction')
    plt.ylabel('Target')

    return predictions

In [None]:
train_preds = predict_and_plot(X_train, train_targets, 'Train')
val_preds = predict_and_plot(X_val, val_targets, 'Validation')
test_preds = predict_and_plot(X_test, test_targets, 'Test')

In [None]:
#To get the classification report
from sklearn.metrics import classification_report
# classification_report expects (y_true, y_pred). With the arguments reversed
# the precision and recall columns are swapped and support counts predictions.
print(classification_report(Y_test, Y_pred3))

In [None]:
#To get the ROC score
Y_score3 = model3.predict_proba(X_test)[:,1]
FP_rate3, TP_rate3, threshold3 = roc_curve(Y_test, Y_score3)
print('roc_auc_score for MultinomialNB: ', roc_auc_score(Y_test, Y_score3))

In [None]:
# Draw the MultinomialNB ROC curve together with reference lines
roc_fig, roc_ax = plt.subplots(1, figsize=(10,10))
roc_ax.set_title('Receiver Operating Characteristic - MultinomialNB')
roc_ax.plot(FP_rate3, TP_rate3)        # the model's ROC curve
roc_ax.plot([0, 1], ls="--")           # chance diagonal from (0, 0) to (1, 1)
roc_ax.plot([0, 0], [1, 0], c=".7")    # left edge of the unit square (FPR = 0)
roc_ax.plot([1, 1], c=".7")            # top edge of the unit square (TPR = 1)
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_xlabel('False Positive Rate')
plt.show()

**Mean Squared Error**

In [None]:
from sklearn import metrics
from sklearn.metrics import mean_squared_error
print('Mean Squared Error:', metrics.mean_squared_error(Y_test, Y_pred3))  

**Mean Absolute Error**

In [None]:
from sklearn.metrics import mean_absolute_error
print('Mean Absolute Error:', metrics.mean_absolute_error(Y_test, Y_pred3))  

**R-Squared**

In [None]:
from sklearn.metrics import r2_score
print('r2_score_lng:', r2_score(Y_test, Y_pred3))

**To test if there is Overfitting or Underfitting of data.**

In [None]:
print('Training set score: {:.4f}'.format(model3.score(X_train, Y_train)))
print('Test set score: {:.4f}'.format(model3.score(X_test, Y_test)))

### **GaussianNB**

In [None]:
def predict_and_plot(inputs, targets, name=''):
    """Predict with ``model4``, print accuracy and plot a confusion matrix.

    Args:
        inputs: Feature matrix passed to ``model4.predict``.
        targets: True labels aligned with ``inputs``.
        name: Prefix for the plot title, e.g. ``'Train'``.

    Returns:
        The predictions returned by ``model4.predict(inputs)``.
    """
    predictions = model4.predict(inputs)
    print("Accuracy: {:.2f}%".format(accuracy_score(targets, predictions) * 100))

    # normalize='true' scales each row (true class) so its cells sum to 1
    row_normalised_cm = confusion_matrix(targets, predictions, normalize='true')
    plt.figure(figsize=(8, 6))
    sns.heatmap(row_normalised_cm, annot=True)
    plt.title('{} Confusion Matrix'.format(name))
    plt.xlabel('Prediction')
    plt.ylabel('Target')

    return predictions

In [None]:
train_preds = predict_and_plot(X_train, train_targets, 'Train')
val_preds = predict_and_plot(X_val, val_targets, 'Validation')
test_preds = predict_and_plot(X_test, test_targets, 'Test')

In [None]:
from sklearn.metrics import classification_report
# classification_report expects (y_true, y_pred). With the arguments reversed
# the precision and recall columns are swapped and support counts predictions.
print(classification_report(Y_test, Y_pred4))

In [None]:
# model4 is the GaussianNB estimator fitted earlier in this section, so the
# score is labelled GaussianNB (it was previously printed as BernoulliNB).
Y_score4 = model4.predict_proba(X_test)[:,1]
FP_rate4, TP_rate4, threshold4 = roc_curve(Y_test, Y_score4)
print('roc_auc_score for GaussianNB: ', roc_auc_score(Y_test, Y_score4))

In [None]:
# ROC curve for model4. FP_rate4/TP_rate4 come from the GaussianNB estimator,
# so the title names GaussianNB (it previously said BernoulliNB).
plt.subplots(1, figsize=(10,10))
plt.title('Receiver Operating Characteristic - GaussianNB')
plt.plot(FP_rate4, TP_rate4)
plt.plot([0, 1], ls="--")            # chance diagonal from (0, 0) to (1, 1)
plt.plot([0, 0], [1, 0], c=".7")     # left edge of the unit square (FPR = 0)
plt.plot([1, 1], c=".7")             # top edge of the unit square (TPR = 1)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

**Mean Squared Error**

In [None]:
from sklearn import metrics
from sklearn.metrics import mean_squared_error
print('Mean Squared Error:', metrics.mean_squared_error(Y_test, Y_pred4))  

**Mean Absolute Error**

In [None]:
from sklearn.metrics import mean_absolute_error
print('Mean Absolute Error:', metrics.mean_absolute_error(Y_test, Y_pred4))  

**R-Squared**

In [None]:
from sklearn.metrics import r2_score
print('r2_score:', r2_score(Y_test, Y_pred4))

**To test if there is Overfitting or Underfitting of data.**

In [None]:
print('Training set score: {:.4f}'.format(model4.score(X_train, Y_train)))
print('Test set score: {:.4f}'.format(model4.score(X_test, Y_test)))

### **BernoulliNB**

In [None]:
def predict_and_plot(inputs, targets, name=''):
    """Predict with ``model5``, print accuracy and plot a confusion matrix.

    Args:
        inputs: Feature matrix passed to ``model5.predict``.
        targets: True labels aligned with ``inputs``.
        name: Prefix for the plot title, e.g. ``'Train'``.

    Returns:
        The predictions returned by ``model5.predict(inputs)``.
    """
    predictions = model5.predict(inputs)
    print("Accuracy: {:.2f}%".format(accuracy_score(targets, predictions) * 100))

    # normalize='true' scales each row (true class) so its cells sum to 1
    row_normalised_cm = confusion_matrix(targets, predictions, normalize='true')
    plt.figure(figsize=(8, 6))
    sns.heatmap(row_normalised_cm, annot=True)
    plt.title('{} Confusion Matrix'.format(name))
    plt.xlabel('Prediction')
    plt.ylabel('Target')

    return predictions

In [None]:
train_preds = predict_and_plot(X_train, train_targets, 'Train')
val_preds = predict_and_plot(X_val, val_targets, 'Validation')
test_preds = predict_and_plot(X_test, test_targets, 'Test')

In [None]:
from sklearn.metrics import classification_report
# classification_report expects (y_true, y_pred). With the arguments reversed
# the precision and recall columns are swapped and support counts predictions.
print(classification_report(Y_test, Y_pred5))

In [None]:
# model5 is the BernoulliNB estimator fitted earlier in this section, so the
# score is labelled BernoulliNB (it was previously printed as GaussianNB).
Y_score5 = model5.predict_proba(X_test)[:,1]
FP_rate5, TP_rate5, threshold5 = roc_curve(Y_test, Y_score5)
print('roc_auc_score for BernoulliNB: ', roc_auc_score(Y_test, Y_score5))

In [None]:
# ROC curve for model5. FP_rate5/TP_rate5 come from the BernoulliNB estimator,
# so the title names BernoulliNB (it previously said GaussianNB).
plt.subplots(1, figsize=(10,10))
plt.title('Receiver Operating Characteristic - BernoulliNB')
plt.plot(FP_rate5, TP_rate5)
plt.plot([0, 1], ls="--")            # chance diagonal from (0, 0) to (1, 1)
plt.plot([0, 0], [1, 0], c=".7")     # left edge of the unit square (FPR = 0)
plt.plot([1, 1], c=".7")             # top edge of the unit square (TPR = 1)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

**Mean Squared Error**

In [None]:
from sklearn import metrics
from sklearn.metrics import mean_squared_error
print('Mean Squared Error:', metrics.mean_squared_error(Y_test, Y_pred5))  

**Mean Absolute Error**

In [None]:
from sklearn.metrics import mean_absolute_error
print('Mean Absolute Error:', metrics.mean_absolute_error(Y_test, Y_pred5))  

**R-Squared**

In [None]:
from sklearn.metrics import r2_score
# This section evaluates model5, so score its predictions (Y_pred5).
# Y_pred4 belongs to model4 and was a copy-paste leftover.
print('r2_score:', r2_score(Y_test, Y_pred5))

**To test if there is Overfitting or Underfitting of data.**

In [None]:
print('Training set score: {:.4f}'.format(model5.score(X_train, Y_train)))
print('Test set score: {:.4f}'.format(model5.score(X_test, Y_test)))

### **Logarithmic Loss**

In [None]:
from sklearn.metrics import log_loss
# log_loss scores predicted probabilities, so use predict_proba rather than the
# hard 0/1 predictions. Labels follow the fitted estimators:
# model3 = MultinomialNB, model4 = GaussianNB, model5 = BernoulliNB.
print('log_loss_MNB', log_loss(Y_test, model3.predict_proba(X_test)))
print('log_loss_GNB', log_loss(Y_test, model4.predict_proba(X_test)))
print('log_loss_BNB', log_loss(Y_test, model5.predict_proba(X_test)))

# **Neural Network - Backpropagation : Baseline model (without StandardScaler)**

In [None]:
# Reload the dataset for the baseline neural network section
df = pd.read_csv('hotel_bookings_cleaned.csv', engine='python')
# Only the last expression of a cell is displayed, so a bare `df.shape`
# in the middle of the cell is silently discarded; print it instead.
print('df.shape :', df.shape)
# Drop rows with any missing value before modelling
df = df.dropna()
df.head(5)

In [None]:
#X: input_cols, Y: target_cols
input_cols = ['hotel', 'lead_time', 'arrival_date_week_number',
            'arrival_date_day_of_month', 'stays_in_weekend_nights',
            'stays_in_week_nights', 'adults', 'kids', 'meal',
            'market_segment', 'distribution_channel',
            'is_repeated_guest', 'previous_cancellations',
            'previous_bookings_not_canceled', 'reserved_room_type',
            'deposit_type',
            'customer_type',
            'required_car_parking_spaces', 'total_of_special_requests',
            'year', 'month','day']

target_cols = 'is_canceled'

# Split the dataframe loaded in this section. Without these two lines the
# cell silently reused train_df/val_df/test_df left over from an earlier
# section and the freshly loaded `df` was never used. The sizes and seed
# match the other sections (roughly 60/20/20).
train_val_df, test_df = train_test_split(df, test_size=0.2, random_state=0)
train_df, val_df = train_test_split(train_val_df, test_size=0.25, random_state=0)

train_inputs = train_df[input_cols].copy()
train_targets = train_df[target_cols].copy()
val_inputs = val_df[input_cols].copy()
val_targets = val_df[target_cols].copy()
test_inputs = test_df[input_cols].copy()
test_targets = test_df[target_cols].copy()

In [None]:
X_train = train_inputs
X_val = val_inputs
X_test = test_inputs

Y_test = test_targets
Y_val = val_targets
Y_train = train_targets

In [None]:
# Carve a stratified 25% hold-out from the *training* rows only.
# NOTE(review): this rebinds X_train/X_test, so from here on X_test/y_test are
# a slice of train_df, not the test_df hold-out the other models are scored
# on. Confirm the reported scores are meant to be compared.
X = train_inputs
y = train_targets

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify=y, test_size=0.25)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler

# Baseline run: the features are passed through unscaled, so the *_sc names
# are plain aliases of X_train / X_test here.
X_train_sc = X_train
X_test_sc = X_test

# Creating our model's structure
# NOTE: this rebinds the notebook-level name `model` (previously the decision
# tree) to the Keras network.
# The input width is read from the data instead of hard-coding 22, so the
# first layer stays in sync with input_cols.
n_features = X_train_sc.shape[1]
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(n_features,)))
model.add(Dropout(0.18))
model.add(Dense(12, activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(1, activation='sigmoid'))  # single sigmoid unit for binary output
# Stop once val_loss has gone 5 epochs without improving
es = EarlyStopping(monitor='val_loss', patience=5)

# Compiling the model
model.compile(loss='bce',
              optimizer='adam',
              metrics=['binary_accuracy'])

# Fitting the model (the hold-out split doubles as the validation data)
history = model.fit(X_train_sc,
                    y_train,
                    batch_size = 256,
                    validation_data =(X_test_sc, y_test),
                    epochs = 50,
                    verbose = 0,
                    callbacks=[es])

In [None]:
# To check the train loss and test loss over epochs.
train_loss = history.history['loss']
test_loss = history.history['val_loss']

# Visualizing our training and testing loss by epoch
plt.figure(figsize=(10, 5))
plt.plot(train_loss, label='Training Loss', color='#185fad')
plt.plot(test_loss, label='Testing Loss', color='orange')
plt.title('Training and Testing Loss by Epoch', fontsize = 20)
plt.xlabel('Epoch', fontsize = 11)
plt.ylabel('Binary Crossentropy', fontsize = 11)
plt.legend(fontsize = 11);

In [None]:
# Visualizing our training and testing accuracy by epoch:
plt.figure(figsize=(10, 5))
plt.plot(history.history['binary_accuracy'], label='Training accuracy')
plt.plot(history.history['val_binary_accuracy'], label='Testing accuracy')
plt.title('Training and Testing Accuracy by Epoch', fontsize = 20)
plt.xlabel('Epoch', fontsize = 12)
plt.ylabel('Binary Accuracy', fontsize = 12)
plt.legend(fontsize = 12);

**To test if there is Overfitting or Underfitting of data**

In [None]:
# Scoring
train_score = model.evaluate(X_train_sc,
                       y_train,
                       verbose=1)
test_score = model.evaluate(X_test_sc,
                       y_test,
                       verbose=1)
labels = model.metrics_names

print('')
print(f'Training Accuracy: {train_score[1]}')
print(f'Testing Accuracy: {test_score[1]}')

In [None]:
model.summary()

In [None]:
# To make predictions
Y_pred = model.predict(X_test_sc)

In [None]:
FP_rate3, TP_rate3, threshold3 = roc_curve(y_test, Y_pred)
print('roc_auc_score for Neural Network: ', roc_auc_score(y_test, Y_pred))

In [None]:
# Draw the neural network's ROC curve together with reference lines
roc_fig, roc_ax = plt.subplots(1, figsize=(10,10))
roc_ax.set_title('Receiver Operating Characteristic - Neural Network')
roc_ax.plot(FP_rate3, TP_rate3)        # the model's ROC curve
roc_ax.plot([0, 1], ls="--")           # chance diagonal from (0, 0) to (1, 1)
roc_ax.plot([0, 0], [1, 0], c=".7")    # left edge of the unit square (FPR = 0)
roc_ax.plot([1, 1], c=".7")            # top edge of the unit square (TPR = 1)
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_xlabel('False Positive Rate')
plt.show()

In [None]:
Y_pred = model.predict(X_test_sc)
Y_pred = Y_pred.flatten()
print(Y_pred.round(2))

In [None]:
#to extract the predicted labels
y_pred = np.where(Y_pred > 0.5, 1, 0)
print(y_pred)

In [None]:
cf = confusion_matrix(y_test, y_pred, normalize='true')
plt.figure(figsize=(8, 6))
sns.heatmap(cf, annot=True)
plt.xlabel('Prediction')
plt.ylabel('Target')
plt.title('Confusion Matrix for Neural Network')

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

In [None]:
accuracy = accuracy_score(y_pred, y_test)
print('Accuracy of Neural Network (Backpropagation): {:.5f}'.format(accuracy))

**R-squared score**

In [None]:
from sklearn.metrics import r2_score
print('r2_score:', r2_score(y_test, y_pred))

**Mean Squared Error**

In [None]:
from sklearn import metrics
from sklearn.metrics import mean_squared_error
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  

**Mean Absolute Error**

In [None]:
from sklearn.metrics import mean_absolute_error
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  

**Logarithmic Loss**

In [None]:
from sklearn.metrics import log_loss
# log_loss scores predicted probabilities: Y_pred holds the flattened sigmoid
# outputs, whereas y_pred is the 0/1 labels thresholded at 0.5.
print('log_loss', log_loss(y_test, Y_pred))

# **Neural Network - Backpropagation with StandardScaler()**

In [None]:
# Reload the dataset for the scaled neural network section
df = pd.read_csv('hotel_bookings_cleaned.csv', engine='python')
# Only the last expression of a cell is displayed, so a bare `df.shape`
# in the middle of the cell is silently discarded; print it instead.
print('df.shape :', df.shape)
# Drop rows with any missing value before modelling
df = df.dropna()
df.head(5)

In [None]:
#X: input_cols, Y: target_cols
input_cols = ['hotel', 'lead_time', 'arrival_date_week_number',
            'arrival_date_day_of_month', 'stays_in_weekend_nights',
            'stays_in_week_nights', 'adults', 'kids', 'meal',
            'market_segment', 'distribution_channel',
            'is_repeated_guest', 'previous_cancellations',
            'previous_bookings_not_canceled', 'reserved_room_type',
            'deposit_type',
            'customer_type',
            'required_car_parking_spaces', 'total_of_special_requests',
            'year', 'month','day']

target_cols = 'is_canceled'

# Split the dataframe loaded in this section. Without these two lines the
# cell silently reused train_df/val_df/test_df left over from an earlier
# section and the freshly loaded `df` was never used. The sizes and seed
# match the other sections (roughly 60/20/20).
train_val_df, test_df = train_test_split(df, test_size=0.2, random_state=0)
train_df, val_df = train_test_split(train_val_df, test_size=0.25, random_state=0)

train_inputs = train_df[input_cols].copy()
train_targets = train_df[target_cols].copy()
val_inputs = val_df[input_cols].copy()
val_targets = val_df[target_cols].copy()
test_inputs = test_df[input_cols].copy()
test_targets = test_df[target_cols].copy()

In [None]:
X_train = train_inputs
X_val = val_inputs
X_test = test_inputs

Y_test = test_targets
Y_val = val_targets
Y_train = train_targets

In [None]:
X = train_inputs
y = train_targets

# Stratified 75/25 split of the training rows (nothing is fitted here).
# NOTE(review): this rebinds X_train/X_test, so from here on X_test/y_test are
# a slice of train_df, not the test_df hold-out the other models are scored
# on. Confirm the reported scores are meant to be compared.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify=y, test_size=0.25)

In [None]:
#To import module and library
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler

# Scaling the data
# The scaler is fitted on the training split only and then applied to the
# hold-out, so hold-out statistics never influence the scaling.
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

# Creating our model's structure
# NOTE: this rebinds the notebook-level name `model` to a new Keras network.
# The input width is read from the data instead of hard-coding 22, so the
# first layer stays in sync with input_cols.
n_features = X_train_sc.shape[1]
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(n_features,)))
model.add(Dropout(0.18))
model.add(Dense(12, activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(1, activation='sigmoid'))  # single sigmoid unit for binary output
# Stop once val_loss has gone 5 epochs without improving
es = EarlyStopping(monitor='val_loss', patience=5)

# Compiling the model
model.compile(loss='bce',
              optimizer='adam',
              metrics=['binary_accuracy'])

# Fitting the model (the hold-out split doubles as the validation data)
history = model.fit(X_train_sc,
                    y_train,
                    batch_size = 256,
                    validation_data =(X_test_sc, y_test),
                    epochs = 50,
                    verbose = 0,
                    callbacks=[es])

In [None]:
# To check the train loss and test loss over epochs.
train_loss = history.history['loss']
test_loss = history.history['val_loss']

# Visualizing our training and testing loss by epoch
plt.figure(figsize=(10, 5))
plt.plot(train_loss, label='Training Loss', color='#185fad')
plt.plot(test_loss, label='Testing Loss', color='orange')
plt.title('Training and Testing Loss by Epoch', fontsize = 20)
plt.xlabel('Epoch', fontsize = 11)
plt.ylabel('Binary Crossentropy', fontsize = 11)
plt.legend(fontsize = 11);

In [None]:
# Visualizing our training and testing accuracy by epoch:
plt.figure(figsize=(10, 5))
plt.plot(history.history['binary_accuracy'], label='Training accuracy')
plt.plot(history.history['val_binary_accuracy'], label='Testing accuracy')
plt.title('Training and Testing Accuracy by Epoch', fontsize = 20)
plt.xlabel('Epoch', fontsize = 12)
plt.ylabel('Binary Accuracy', fontsize = 12)
plt.legend(fontsize = 12);

**To test if there is Overfitting or Underfitting of data**

In [None]:
# Scoring
train_score = model.evaluate(X_train_sc,
                       y_train,
                       verbose=1)
test_score = model.evaluate(X_test_sc,
                       y_test,
                       verbose=1)
labels = model.metrics_names

print('')
print(f'Training Accuracy: {train_score[1]}')
print(f'Testing Accuracy: {test_score[1]}')

In [None]:
model.summary()

In [None]:
# To make predictions
Y_pred = model.predict(X_test_sc)

In [None]:
FP_rate3, TP_rate3, threshold3 = roc_curve(y_test, Y_pred)
print('roc_auc_score for Neural Network: ', roc_auc_score(y_test, Y_pred))

In [None]:
# Draw the scaled neural network's ROC curve together with reference lines
roc_fig, roc_ax = plt.subplots(1, figsize=(10,10))
roc_ax.set_title('Receiver Operating Characteristic - Neural Network')
roc_ax.plot(FP_rate3, TP_rate3)        # the model's ROC curve
roc_ax.plot([0, 1], ls="--")           # chance diagonal from (0, 0) to (1, 1)
roc_ax.plot([0, 0], [1, 0], c=".7")    # left edge of the unit square (FPR = 0)
roc_ax.plot([1, 1], c=".7")            # top edge of the unit square (TPR = 1)
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_xlabel('False Positive Rate')
plt.show()

In [None]:
#To predict then flatten the array 
Y_pred = model.predict(X_test_sc)
Y_pred = Y_pred.flatten()
print(Y_pred.round(2))

In [None]:
#to extract the predicted labels
y_pred = np.where(Y_pred > 0.5, 1, 0)
print(y_pred)

In [None]:
cf = confusion_matrix(y_test, y_pred, normalize='true')
plt.figure(figsize=(8, 6))
sns.heatmap(cf, annot=True)
plt.xlabel('Prediction')
plt.ylabel('Target')
plt.title('Confusion Matrix for Neural Network')

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

In [None]:
accuracy = accuracy_score(y_pred, y_test)
print('Accuracy of Neural Network (Backpropagation): {:.5f}'.format(accuracy))

**R-squared score**

In [None]:
from sklearn.metrics import r2_score
print('r2_score:', r2_score(y_test, y_pred))

**Mean Squared Error**

In [None]:
from sklearn import metrics
from sklearn.metrics import mean_squared_error
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  

**Mean Absolute Error**

In [None]:
from sklearn.metrics import mean_absolute_error
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  

**Logarithmic Loss**

In [None]:
from sklearn.metrics import log_loss
# log_loss scores predicted probabilities: Y_pred holds the flattened sigmoid
# outputs, whereas y_pred is the 0/1 labels thresholded at 0.5.
print('log_loss', log_loss(y_test, Y_pred))

# **Logistic Regression : Baseline model (without Scaler)**

In [None]:
#To import libraries
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# Reload the dataset for the baseline logistic regression section
df = pd.read_csv('hotel_bookings_cleaned.csv', engine='python')
# Only the last expression of a cell is displayed, so a bare `df.shape`
# in the middle of the cell is silently discarded; print it instead.
print('df.shape :', df.shape)
df.head(5)

In [None]:
#To drop any null values
df = df.dropna()

In [None]:
#To split dataset into training, validation and testing datasets
# Step 1: hold out 20% of the rows as the test set.
train_val_df, test_df = train_test_split(df, test_size=0.2, random_state=0)
# Step 2: take 25% of the remaining 80% as validation, which gives roughly a
# 60/20/20 train/validation/test split. random_state=0 keeps it repeatable.
train_df, val_df = train_test_split(train_val_df, test_size=0.25, random_state=0)

In [None]:
# Feature columns (x) and target column (y) for the baseline logistic regression
input_cols = [
    'hotel', 'lead_time',
    'arrival_date_week_number', 'arrival_date_day_of_month',
    'stays_in_weekend_nights', 'stays_in_week_nights',
    'adults', 'kids', 'meal',
    'market_segment', 'distribution_channel',
    'is_repeated_guest',
    'previous_cancellations', 'previous_bookings_not_canceled',
    'reserved_room_type', 'deposit_type', 'customer_type',
    'required_car_parking_spaces', 'total_of_special_requests',
    'year', 'month', 'day',
]

target_cols = 'is_canceled'

# Slice every split into its feature frame and its target series.
# .copy() detaches the slices from the split frames.
train_inputs, train_targets = train_df[input_cols].copy(), train_df[target_cols].copy()
val_inputs, val_targets = val_df[input_cols].copy(), val_df[target_cols].copy()
test_inputs, test_targets = test_df[input_cols].copy(), test_df[target_cols].copy()

In [None]:
X_train = train_inputs
X_val = val_inputs
X_test = test_inputs

Y_test = test_targets
Y_val = val_targets
Y_train = train_targets

In [None]:
#To fit the data to the model
# Baseline logistic regression on the unscaled inputs with the liblinear
# solver. fit returns the estimator, so the cell also displays its repr.
model2 = LogisticRegression(solver='liblinear')
model2.fit(X_train, Y_train)

In [None]:
#To predict
Y_pred = model2.predict(X_test)

In [None]:
#To get the accuracy score
accuracy = accuracy_score(Y_pred, Y_test)
print('Accuracy of Logistic Regression: {:.5f}'.format(accuracy))

In [None]:
#To predict and plot the confusion matrix
def predict_and_plot(inputs, targets, name=''):
    """Predict with ``model2`` (logistic regression), print accuracy and plot a confusion matrix.

    Args:
        inputs: Feature matrix passed to ``model2.predict``.
        targets: True labels aligned with ``inputs``.
        name: Prefix for the plot title, e.g. ``'Train'``.

    Returns:
        The predictions returned by ``model2.predict(inputs)``.
    """
    predictions = model2.predict(inputs)
    print("Accuracy: {:.2f}%".format(accuracy_score(targets, predictions) * 100))

    # normalize='true' scales each row (true class) so its cells sum to 1
    row_normalised_cm = confusion_matrix(targets, predictions, normalize='true')
    plt.figure(figsize=(8, 6))
    sns.heatmap(row_normalised_cm, annot=True)
    plt.title('{} Confusion Matrix'.format(name))
    plt.xlabel('Prediction')
    plt.ylabel('Target')

    return predictions

In [None]:
train_preds = predict_and_plot(X_train, train_targets, 'Train')
val_preds = predict_and_plot(X_val, val_targets, 'Validation')
test_preds = predict_and_plot(X_test, test_targets, 'Test')

In [None]:
#To get the classification report
from sklearn.metrics import classification_report
# classification_report expects (y_true, y_pred). With the arguments reversed
# the precision and recall columns are swapped and support counts predictions.
print(classification_report(Y_test, Y_pred))

In [None]:
#To obtain the ROC curve and AUC score
Y_score2 = model2.predict_proba(X_test)[:,1]
FP_rate2, TP_rate2, threshold2 = roc_curve(Y_test, Y_score2)
print('roc_auc_score for Logistic Regression: ', roc_auc_score(Y_test, Y_score2))

In [None]:
# Draw the logistic regression ROC curve together with reference lines
roc_fig, roc_ax = plt.subplots(1, figsize=(10,10))
roc_ax.set_title('Receiver Operating Characteristic - Logistic Regression')
roc_ax.plot(FP_rate2, TP_rate2)        # the model's ROC curve
roc_ax.plot([0, 1], ls="--")           # chance diagonal from (0, 0) to (1, 1)
roc_ax.plot([0, 0], [1, 0], c=".7")    # left edge of the unit square (FPR = 0)
roc_ax.plot([1, 1], c=".7")            # top edge of the unit square (TPR = 1)
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_xlabel('False Positive Rate')
plt.show()

**Mean Squared Error**

In [None]:
#To get the mean squared error
from sklearn import metrics
from sklearn.metrics import mean_squared_error
print('Mean Squared Error:', metrics.mean_squared_error(Y_test, Y_pred))  

**Mean Absolute Error**

In [None]:
#To get the mean absolute error
from sklearn.metrics import mean_absolute_error
print('Mean Absolute Error:', metrics.mean_absolute_error(Y_test, Y_pred))  

**R-Squared**

In [None]:
#To get the R-squared score
from sklearn.metrics import r2_score
print('r2_score_lng:', r2_score(Y_test, Y_pred))

**Logarithmic Loss**

In [None]:
#To get the log-loss
from sklearn.metrics import log_loss
# log_loss scores predicted probabilities. Hard 0/1 labels would be clipped to
# near-0/near-1, turning the metric into a rescaled error rate.
print('log_loss', log_loss(Y_test, model2.predict_proba(X_test)))

**To test if there is Overfitting or Underfitting of data.**

In [None]:
print('Training set score: {:.4f}'.format(model2.score(X_train, Y_train)))
print('Test set score: {:.4f}'.format(model2.score(X_test, Y_test)))

# **Logistic Regression with MinMaxScaler()**

In [None]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
from IPython.display import Image 
from io import StringIO
import pydotplus
from sklearn import preprocessing
from sklearn import tree
%matplotlib inline

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# Reload the dataset for the MinMax-scaled logistic regression section
df = pd.read_csv('hotel_bookings_cleaned.csv', engine='python')
# Only the last expression of a cell is displayed, so a bare `df.shape`
# in the middle of the cell is silently discarded; print it instead.
print('df.shape :', df.shape)
df.head(5)

In [None]:
df = df.dropna()

In [None]:
train_val_df, test_df = train_test_split(df, test_size=0.2, random_state=0)

In [None]:
train_df, val_df = train_test_split(train_val_df, test_size=0.25, random_state=0)

In [None]:
# Feature columns (X) and the target column (Y) used by the model.
input_cols = [
    'hotel',
    'lead_time',
    'arrival_date_week_number',
    'arrival_date_day_of_month',
    'stays_in_weekend_nights',
    'stays_in_week_nights',
    'adults',
    'kids',
    'meal',
    'market_segment',
    'distribution_channel',
    'is_repeated_guest',
    'previous_cancellations',
    'previous_bookings_not_canceled',
    'reserved_room_type',
    'deposit_type',
    'customer_type',
    'required_car_parking_spaces',
    'total_of_special_requests',
    'year',
    'month',
    'day',
]

target_cols = 'is_canceled'

# Copy features and target out of each split so later edits do not touch the
# source frames.
train_inputs, train_targets = train_df[input_cols].copy(), train_df[target_cols].copy()
val_inputs, val_targets = val_df[input_cols].copy(), val_df[target_cols].copy()
test_inputs, test_targets = test_df[input_cols].copy(), test_df[target_cols].copy()

In [None]:
# Partition the feature columns by dtype: numeric ones and object-typed ones.
numeric_cols = list(train_inputs.select_dtypes(include=np.number).columns)
categorical_cols = list(train_inputs.select_dtypes(include='object').columns)

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Fit the scaler on the numeric feature columns of the TRAINING split only:
#  * fitting on the whole `df` would include the target column and any
#    non-numeric columns, which MinMaxScaler cannot handle;
#  * fitting on all rows would leak validation/test min/max into training;
#  * the result of fit() is the scaler itself, so it must not be assigned back
#    to `df` (that would replace the DataFrame with a MinMaxScaler object).
scaler = MinMaxScaler().fit(train_inputs[numeric_cols])

In [None]:
# Rescale the numeric columns of every split in place with the same scaler.
for split_inputs in (train_inputs, val_inputs, test_inputs):
    split_inputs[numeric_cols] = scaler.transform(split_inputs[numeric_cols])

In [None]:
# Keep only the numeric feature columns in each split.
train_inputs, val_inputs, test_inputs = [
    split_inputs[numeric_cols]
    for split_inputs in (train_inputs, val_inputs, test_inputs)
]

In [None]:
# Conventional X/Y aliases for the feature and target objects of each split.
X_train, Y_train = train_inputs, train_targets
X_val, Y_val = val_inputs, val_targets
X_test, Y_test = test_inputs, test_targets

In [None]:
# Logistic-regression classifier using the liblinear solver; all other settings are defaults.
model2 = LogisticRegression(solver='liblinear')

In [None]:
# Train the classifier on the training features and targets.
model2.fit(X_train, Y_train)

In [None]:
# Predicted class labels for the test features.
Y_pred = model2.predict(X_test)

In [None]:
# Test-set accuracy. Arguments follow the sklearn (y_true, y_pred) order, which
# matches how predict_and_plot calls accuracy_score.
accuracy = accuracy_score(Y_test, Y_pred)
print('Accuracy of Logistic Regression: {:.5f}'.format(accuracy))

In [None]:
def predict_and_plot(inputs, targets, name='', model=None):
    """Predict on `inputs`, print the accuracy and plot a confusion matrix.

    Parameters
    ----------
    inputs : feature data accepted by the model's ``predict`` method.
    targets : true labels corresponding to `inputs`.
    name : str, optional
        Label placed in front of "Confusion Matrix" in the plot title.
    model : optional
        Fitted classifier exposing ``predict``. When omitted, the notebook's
        global ``model2`` is used, which keeps existing calls working.

    Returns
    -------
    The predictions returned by ``model.predict(inputs)``.
    """
    if model is None:
        # Resolved at call time, so a re-fitted model2 is picked up.
        model = model2
    preds = model.predict(inputs)
    accuracy = accuracy_score(targets, preds)
    print("Accuracy: {:.2f}%".format(accuracy * 100))

    # normalize='true' scales each row (true class) to sum to 1.
    cf = confusion_matrix(targets, preds, normalize='true')
    plt.figure(figsize=(8, 6))
    sns.heatmap(cf, annot=True)
    plt.xlabel('Prediction')
    plt.ylabel('Target')
    plt.title('{} Confusion Matrix'.format(name))

    return preds

In [None]:
# Accuracy and confusion matrix for each of the three splits.
train_preds = predict_and_plot(inputs=X_train, targets=train_targets, name='Train')
val_preds = predict_and_plot(inputs=X_val, targets=val_targets, name='Validation')
test_preds = predict_and_plot(inputs=X_test, targets=test_targets, name='Test')

In [None]:
from sklearn.metrics import classification_report
# classification_report takes (y_true, y_pred). With the arguments swapped,
# precision and recall are exchanged and "support" counts predictions instead
# of the true class sizes.
print(classification_report(Y_test, Y_pred))

In [None]:
# Scores for the ROC analysis: second column of predict_proba, i.e. the
# probability of the class listed second in model2.classes_.
Y_score2 = model2.predict_proba(X_test)[:, 1]
FP_rate2, TP_rate2, threshold2 = roc_curve(Y_test, Y_score2)
auc_logreg = roc_auc_score(Y_test, Y_score2)
print('roc_auc_score for Logistic Regression: ', auc_logreg)

In [None]:
# Draw the ROC curve for the logistic-regression model on one 10x10 figure.
roc_fig, roc_ax = plt.subplots(1, figsize=(10,10))
roc_ax.set_title('Receiver Operating Characteristic - Logistic Regression')
roc_ax.plot(FP_rate2, TP_rate2)
# y = [0, 1] against the implicit x = [0, 1]: the dashed chance diagonal.
roc_ax.plot([0, 1], ls="--")
# Grey guide lines: the left edge (x = 0) and the top edge (y = 1).
roc_ax.plot([0, 0], [1, 0], c=".7")
roc_ax.plot([1, 1], c=".7")
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_xlabel('False Positive Rate')
plt.show()

**Mean Squared Error**

In [None]:
# Mean squared error between the true test labels and the predicted labels.
from sklearn import metrics
from sklearn.metrics import mean_squared_error
mse_value = mean_squared_error(Y_test, Y_pred)
print('Mean Squared Error:', mse_value)

**Mean Absolute Error**

In [None]:
# Mean absolute error between the true test labels and the predicted labels.
from sklearn.metrics import mean_absolute_error
mae_value = mean_absolute_error(Y_test, Y_pred)
print('Mean Absolute Error:', mae_value)

**R-Squared**

In [None]:
# R-squared of the predicted labels against the true test labels.
from sklearn.metrics import r2_score
r2_value = r2_score(Y_test, Y_pred)
print('r2_score_lng:', r2_value)

**Logarithmic Loss**

In [None]:
from sklearn.metrics import log_loss
# log_loss is defined on predicted probabilities. Passing the hard 0/1 labels
# from predict() makes every wrong prediction count as a (clipped) infinite
# penalty and inflates the value, so score the predict_proba output instead.
# NOTE: the log-loss figures hard-coded in the comparison table further down
# were produced from hard labels and need refreshing after this change.
print('log_loss', log_loss(Y_test, model2.predict_proba(X_test)))

**Check the model for overfitting or underfitting.**

In [None]:
# Compare accuracy on the training and test sets; a large gap between the two
# would point to overfitting.
train_score = model2.score(X_train, Y_train)
test_score = model2.score(X_test, Y_test)
print('Training set score: {:.4f}'.format(train_score))
print('Test set score: {:.4f}'.format(test_score))

# **Display Graphs for Comparison**

In [None]:
# Summary table of the headline metrics per model (values entered by hand
# from the individual model sections).
model_scores = pd.DataFrame({
    'Model': ['Decision Tree', 'Naive Bayes', 'Backpropagation NN', 'Logistic Regression'],
    'Accuracy': [0.95465, 0.76417, 0.99395, 0.81216],
    'AUC': [0.95151, 0.80400, 0.99961, 0.95150],
    'MAE': [0.04534, 0.23583, 0.00784, 0.18784],
    'MSE': [0.04534, 0.23583, 0.00605, 0.18784],
    'R2': [0.80588, -0.00963, 0.97407, 0.19582],
})
model_scores

In [None]:
# Bar chart of the test accuracy of each model, with the rounded value written
# at the top of every bar.
plt.style.use('default')
plt.figure(figsize=(10,5))
p = sns.barplot(data=model_scores, x='Model', y='Accuracy', palette='Accent')
plt.title('Classification Accuracy For Each Model', fontsize = 15)
plt.xlabel('Model', fontsize = 12)
plt.ylabel('Classification Accuracy', fontsize = 12)
# The row index doubles as the bar's x position on the categorical axis.
for bar_position, bar_accuracy in model_scores['Accuracy'].items():
    p.text(
        x=bar_position,
        y=bar_accuracy,
        s=round(bar_accuracy, 2),
        color='black',
        horizontalalignment='center',
    )

In [None]:
# Overlay one line per metric on a shared axis to compare the models.
ax = plt.gca()
for metric_name in ('Accuracy', 'AUC', 'MAE', 'MSE', 'R2'):
    model_scores.plot(x='Model', y=metric_name, ax=ax)

In [None]:
# Log-loss per model (values entered by hand from the individual model sections).
model_logloss = pd.DataFrame({
    'Model': ['Decision Tree', 'Naive Bayes', 'Backpropagation NN', 'Logistic Regression'],
    'Logarithmic Loss': [1.45851, 7.73437, 0.62017, 6.42207],
})
model_logloss

In [None]:
# Bar chart of the log-loss of each model.
plt.style.use('default')
logloss_fig, p = plt.subplots(figsize=(10,5))
sns.barplot(data=model_logloss, x='Model', y='Logarithmic Loss', palette='CMRmap_r', ax=p)
p.set_title('Logarithmic Loss For Each Model', fontsize = 15)
p.set_xlabel('Model', fontsize = 12)
p.set_ylabel('Logarithmic Loss', fontsize = 12)
plt.show()

In [None]:
# Training-set versus testing-set score of every model, one column per model
# (values entered by hand from the individual model sections).
model_scores = pd.DataFrame({
    'Set|Model': ['Training Set', 'Testing Set'],
    'Decision Tree': [1.0000, 0.9547],
    'Naive Bayes': [0.7690, 0.7642],
    'Backpropagation NN': [0.9950, 0.9940],
    'Logistic Regression': [0.8121, 0.8122],
})
model_scores

In [None]:
# Classification-report summary per model: accuracy, precision, recall and F1
# (values entered by hand from the individual model sections).
# The Naive Bayes row previously held 0.88 in the Accuracy slot and the AUC
# value 0.80400 in the Precision slot, a copy/paste slip from the metrics
# table. Its accuracy is 0.76417 in the other tables of this notebook.
# NOTE(review): 0.88 is assumed to be the intended Naive Bayes precision;
# confirm against that model's classification report.
model_scores = pd.DataFrame(data=[('Decision Tree', 0.95465, 0.95, 0.95, 0.95),
                                  ('Naive Bayes', 0.76417, 0.88, 0.76, 0.79),
                                  ('Backpropagation NN', 0.99395, 0.99, 0.99, 0.99),
                                  ('Logistic Regression', 0.81216, 0.86, 0.81, 0.82)],
                            columns=['Model', 'Accuracy', 'Precision','Recall','F1 Score'])
model_scores

In [None]:
# Overlay one line per classification-report metric on a shared axis.
ax = plt.gca()
for metric_name in ('Accuracy', 'Precision', 'Recall', 'F1 Score'):
    model_scores.plot(x='Model', y=metric_name, ax=ax)