# **Hotel booking demand**

In [259]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
import xgboost as xgb

from sklearn import metrics

In [260]:
# Load the raw bookings table and preview its first ten rows
bookings_csv = '../input/hotel-booking-demand/hotel_bookings.csv'
df = pd.read_csv(bookings_csv)
df.head(n=10)

In [261]:
# (number of rows, number of columns) of the raw bookings frame
df.shape

## Data analysis

In [262]:
# Exploring descriptive statistics.
# For each numeric column describe() reports:
#   count          - number of non-NA/null observations
#   mean, std      - mean and standard deviation of the values
#   min, max       - smallest and largest value
#   25%, 50%, 75%  - quartiles

df.describe()

In [263]:
# Cancellations by type of hotel (taken and modified from https://www.kaggle.com/vssseel/eda-various-ml-models-and-nn-with-roc-curves)
sns.set(style="whitegrid")
ax = sns.countplot(
    data=df,
    x="is_canceled",
    hue="hotel",
    palette=["#247ba0", "#cbe7f3"],
)
# Title the axes the plot was drawn on; the trailing ';' hides the Text repr
ax.set_title("Canceled by hotel type", fontdict={'fontsize': 20});

In [264]:
# Count of canceled vs. not canceled bookings over all reservations
sns.set(style="whitegrid")
ax = sns.countplot(
    data=df,
    x="is_canceled",
    hue="is_canceled",
    palette=["#247ba0", "#cbe7f3"],
)
# Title the axes the plot was drawn on; the trailing ';' hides the Text repr
ax.set_title("Canceled or not", fontdict={'fontsize': 20});

In [265]:
# Monthly number of arrivals, split by hotel type (resort vs. city)
plt.figure(figsize=(15, 10))
sns.set(style="whitegrid")

# Calendar order for the x axis (the column holds month names)
Months = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]
ax = sns.countplot(
    data=df,
    x="arrival_date_month",
    hue="hotel",
    order=Months,
    palette=["#247ba0", "#cbe7f3"],
)
ax.set_title("Total Customers by month", fontdict={'fontsize': 20});


As we can see, both hotels receive the most guests during summer, in July and August.

In [266]:
# Cancellation by month of arrival for both resort hotel and city hotel.
# sns.barplot aggregates y with the mean by default, so each bar shows the mean
# of is_canceled for that month and hotel (the cancellation rate when the
# column is coded 0/1).

plt.figure(figsize=(15, 10))
plt.title("Cancellation rate by arrival month", fontdict={'fontsize': 20})

# Calendar order for the x axis (the column holds month names)
Months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]
ax = sns.barplot(x='arrival_date_month', y='is_canceled', hue='hotel', data=df, order=Months, palette=["#247ba0", "#cbe7f3"])

# The trailing ';' keeps the cell from printing the matplotlib object repr
ax.set_ylabel("Cancellation rate");

In [267]:
# Total length of stay per booking: week nights plus weekend nights
total_stays = df['stays_in_week_nights'].add(df['stays_in_weekend_nights'])

In [268]:
# Number of nights guests stay in resort and city hotel.
# total_stays (week nights + weekend nights) comes from the previous cell; it is
# an unnamed Series, so the x axis label is set explicitly.

sns.set(style="whitegrid")
plt.figure(figsize=(40, 10))
plt.title("Total nights stayed by hotel type", fontdict={'fontsize': 20})
ax = sns.countplot(x=total_stays, hue='hotel', data=df, palette=["#247ba0", "#cbe7f3"])

# The trailing ';' keeps the cell from printing the Text repr
ax.set_xlabel("Total nights stayed");

In the graph above we can see that guests of the city hotel tend to stay 1 to 4 nights, whereas guests of the resort hotel tend to stay longer.

In [269]:

# Cancellations split by whether the booking was made by a repeated guest
sns.set(style="whitegrid")
plt.figure(figsize=(10, 10))
# Title describes what is plotted (the previous "market" title did not match this chart)
plt.title("Cancellations by repeated guest", fontdict={'fontsize': 20})
ax = sns.countplot(x='is_repeated_guest', hue='is_canceled', data=df, palette=["#247ba0", "#cbe7f3"])


## Data processing

In [270]:
# Exploring data types: one dtype per column
df.dtypes

In [271]:
# Number of missing values in each column
df.isna().sum()

We can see that there are missing values in the columns children, country, agent and company. The percentage of missing values in each column is checked below.

In [272]:
# Share of missing values per column, expressed in percent
df.isna().mean().mul(100)

In [273]:
# Per-column non-null counts and dtypes, plus overall memory usage
df.info()

We can see that 94.3% of the values in the column 'company' (ID of the company/entity that made the booking or is responsible for paying for the booking; an ID is presented instead of a name for anonymity reasons) are missing. Because of that I will drop the column, as it seems unnecessary to keep it for further work.

There are 4 missing values in the column 'children', which I will fill with the value 0, assuming that those guests did not bring any children.

In the column 'agent' (ID of the travel agency that made the booking), there are 16340 missing values, or 13.69% of total values. This is probably because the guest didn't make the reservation through an agency. I will fill those values with the placeholder ID 999 in the code below.

In the column 'country' 0.41% of the values are missing; I will drop those rows.

In [274]:
# Remove the 'company' column
df = df.drop(columns=['company'])

In [275]:
# Replace missing 'agent' values with the placeholder ID 999
df['agent'] = df['agent'].fillna(999)

In [276]:
# Replace missing 'children' values with 0
df['children'] = df['children'].fillna(0)

In [277]:
# Discard every row whose 'country' value is missing
df = df.dropna(axis=0, how='any', subset=['country'])

In [278]:
# Re-check the number of missing values per column after the clean-up
df.isna().sum()

In [279]:
## Plot the heatmap to see correlation between the numeric columns
# df still contains object (string) columns at this point. DataFrame.corr()
# raises on non-numeric data in pandas >= 2.0, so the numeric (and boolean)
# columns are selected explicitly; this works on older pandas versions too.
numeric_corr = df.select_dtypes(include=["number", "bool"]).corr()

fig, ax = plt.subplots(figsize=(22, 15))

sns.heatmap(numeric_corr, annot=True, ax=ax, cmap=sns.diverging_palette(232, 91, n=200));

In [280]:
# Absolute correlation of every numeric column with is_canceled, strongest first.
# Only numeric/boolean columns are correlated: DataFrame.corr() raises on the
# remaining object (string) columns in pandas >= 2.0.
correlation = (
    df.select_dtypes(include=["number", "bool"])
    .corr()['is_canceled']
    .abs()
    .sort_values(ascending=False)
)
correlation

In [281]:
# df2=df[['is_canceled', 'lead_time', 'total_of_special_requests', 'required_car_parking_spaces', 'booking_changes', 'agent', 'previous_cancellations', 'is_repeated_guest' ]]

In [282]:
# Dropping columns that are not useful (currently disabled)

# useless_columns = ['days_in_waiting_list', 'arrival_date_year', 'arrival_date_week_number', 'assigned_room_type', 'arrival_date_day_of_month']

# df.drop(useless_columns, axis = 1, inplace = True)

In [283]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the arrival month into its own column.
# This used to be written to df['month'], but the next cell assigns df['month']
# from reservation_status_date, which silently discarded the encoding.
# LabelEncoder numbers the labels in sorted (alphabetical) order, so the codes
# are category ids, not calendar month numbers.
lc = LabelEncoder()
df['arrival_month'] = lc.fit_transform(df['arrival_date_month'])

In [284]:
# Parse the status date once, store it back, then split it into calendar parts.
# NOTE(review): reservation_status_date presumably records when the final
# reservation status was set; features derived from it may leak the target.
# Confirm before relying on the reported accuracies.
status_date = pd.to_datetime(df['reservation_status_date'])
df['reservation_status_date'] = status_date

for part in ('year', 'month', 'day'):
    df[part] = getattr(status_date.dt, part)

In [285]:
# One-hot encode the categorical columns; drop_first=True drops the first level
# of every column so the remaining indicators are not redundant.
#
# 'reservation_status' is deliberately NOT encoded. According to the dataset
# description it holds the final outcome of the booking (Canceled / Check-Out /
# No-Show), i.e. it restates is_canceled. Using it as a feature leaks the target
# into the model and makes the test accuracy meaningless.
categorical_columns = [
    'hotel', 'meal', 'country', 'market_segment', 'distribution_channel',
    'reserved_room_type', 'assigned_room_type', 'deposit_type', 'customer_type',
]
dummy = pd.get_dummies(df[categorical_columns], drop_first=True)

# Show every column when previewing wide frames
pd.set_option('display.max_columns', None)
dummy.head()

In [286]:
# Remove the raw categorical and date columns from df, then append the one-hot
# columns from `dummy` side by side to form the modelling table.
raw_columns = [
    'hotel',
    'meal',
    'country',
    'market_segment',
    'distribution_channel',
    'reserved_room_type',
    'assigned_room_type',
    'deposit_type',
    'customer_type',
    'reservation_status',
    'arrival_date_month',
    'reservation_status_date',
]
frames = df.drop(columns=raw_columns)
result = pd.concat([frames, dummy], axis=1)

In [287]:
# Preview the modelling table instead of rendering the whole frame: with
# display.max_columns set to None every column is shown, so keep the rows short.
result.head()

## Splitting the dataset

In [288]:
# Hold out 30% of the rows for testing (70:30 split) with sklearn's train_test_split()
X = result.drop(columns=["is_canceled"])
y = result["is_canceled"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [289]:
# First five rows of the training features
X_train.head(n=5)

In [290]:
# First five rows of the test features
X_test.head(n=5)

In [291]:
# First five target values of the training and test splits, shown as a pair
(y_train.head(n=5), y_test.head(n=5))

## Model Selection

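* RandomForestClassifier - randomly selects observations (bootstrap samples) and a subset of the features from the training data and builds a decision tree for every sample. Each tree makes a prediction, and the final output is the class chosen by the majority vote of the trees
* KNeighborsClassifier - classifies an observation by the majority class among its k nearest neighbours in the training data
* XGBoostClassifier - builds multiple trees on top of each other to correct the prediction errors of the previous tree
* AdaBoostClassifier - fits a sequence of weak learners, giving more weight to the observations that the previous learners misclassified
* GradientBoostingClassifier - builds trees sequentially, each one fitted to the errors of the ensemble built so far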

The model code is adapted from tutorials at https://www.datacamp.com/.

In [292]:
# RandomForestClassifier
# The forest draws random bootstrap samples and feature subsets, so random_state
# is fixed (same seed as the train/test split) to make the reported accuracy
# reproducible across runs.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

In [305]:
# KNeighborsClassifier

# 1-2. Create the KNN classifier with its default settings and train it
#      (fit() returns the fitted estimator itself)
model = KNeighborsClassifier().fit(X_train, y_train)

# 3. Predict the response for the test set
predicted = model.predict(X_test)

# 4. Model accuracy
knn_accuracy = metrics.accuracy_score(y_test, predicted)
print("Accuracy:", knn_accuracy)

In [306]:
# XGBoostClassifier
# 1. Train the XGBoost classifier.
#    is_canceled is used as a binary target here, so the binary log loss
#    ('logloss') is the matching evaluation metric; 'mlogloss' is the
#    multiclass variant.
model = XGBClassifier(eval_metric='logloss')
model.fit(X_train, y_train)

# 2. Predict the response for the test dataset
y_pred = model.predict(X_test)

# 3. Model accuracy
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

In [295]:
# AdaBoostClassifier
# 1. Create the AdaBoost classifier object.
#    random_state is fixed (same seed as the train/test split) so the reported
#    accuracy is reproducible across runs.
abc = AdaBoostClassifier(n_estimators=50,
                         learning_rate=1,
                         random_state=42)
# 2. Train the AdaBoost classifier (fit() returns the fitted estimator)
model = abc.fit(X_train, y_train)

# 3. Predict the response for the test dataset
y_pred = model.predict(X_test)

# 4. Model accuracy
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

In [303]:
# Gradient Boosting Classifier
# 1. Create the Gradient Boosting classifier object.
#    random_state is fixed (same seed as the train/test split) so the reported
#    accuracy is reproducible across runs.
gb = GradientBoostingClassifier(random_state=42)

# 2. Train the Gradient Boosting classifier
gb.fit(X_train, y_train)

# 3. Predict the response for the test dataset
y_pred = gb.predict(X_test)

# 4. Model accuracy
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))