# Hotel Cancellation Forecast - Project
The accommodation industry was a 4.1 trillion dollar industry in 2021.
In today's fast-paced world, consumers are becoming more flexible with their stays. "Free Cancellation" offers help large booking websites like Booking.com and Hotels.com stay competitive by giving consumers that flexibility.
However, these offers bring an old-new problem to the table - booking cancellations.
In this project we aim to forecast booking cancellations accurately, in order to help hotels and booking websites anticipate cancellations and act accordingly to prevent losses and maximize capacity.
##### By Oriel Perets & Dafna Meron


-------

### Project setup
#### Importing dependencies:
1. Core dependencies
    * Numpy
    * Pandas
2. Model dependencies (Scikit Learn)
    * make_classification
    * cross_val_score
    * RandomForestClassifier
    * LogisticRegression
    * GaussianNB
    * Model_selection tools
3. Importing data
    * csv --> dataFrame


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below - consider narrowing the filter.
warnings.filterwarnings('ignore')

# Load the bookings dataset; the path is relative to the working directory.
df = pd.read_csv('hotel_bookings.csv')

-------

### Data exploration
1. Variable Identification
2. Univariate Analysis
3. Bi-Variate Analysis
4. Missing Values Treatment
5. Outlier Treatment
6. Variable Transformation
7. Variable Creation

In [None]:
# Variable identification
# Print a summary of the frame's columns (names, dtypes, non-null counts).
df.info()
# Preview the first 50 rows; shown because it is the cell's last expression.
df.head(50)

#### Variable identification
* Target - 'is_canceled'
* Predictors - all other variables
* Continuous:
    * total_of_special_requests
    * required_car_parking_spaces
    * adr
    * days_in_waiting_list
    * adults
    * stays_in_week_nights
    * stays_in_weekend_nights
    * lead_time
* Categorical:
    * hotel
    * arrival_date_month
    * arrival_date_day_of_month
    * arrival_date_week_number
    * deposit_type
    * agent
    * company
    * customer_type    
* Useless (removed):
    * reservation_status
    * reservation_status_date

--------------

In [None]:
# Univariate analysis: distribution of each continuous variable
cont_vars = [
    'total_of_special_requests',
    'required_car_parking_spaces',
    'adr',
    'days_in_waiting_list',
    'adults',
    'stays_in_week_nights',
    'stays_in_weekend_nights',
    'lead_time',
]
# One histogram per variable, placed left-to-right on a 3x3 grid
fig = plt.figure()
fig.subplots_adjust(hspace=0.5, wspace=0.5)
fig.set_size_inches(14, 14)
for plot_idx, col in enumerate(cont_vars, start=1):
    ax = fig.add_subplot(3, 3, plot_idx)
    sns.histplot(df[col], ax=ax)
plt.show()



-------------------

In [None]:
# Univariate analysis: frequency of each categorical variable
cat_vars = [
    'hotel',
    'arrival_date_month',
    'arrival_date_day_of_month',
    'arrival_date_week_number',
    'deposit_type',
    'agent',
    'company',
    'customer_type',
]
# One count plot per variable, placed left-to-right on a 3x3 grid
fig = plt.figure()
fig.subplots_adjust(hspace=0.5, wspace=0.5)
fig.set_size_inches(14, 14)
for plot_idx, col in enumerate(cat_vars, start=1):
    ax = fig.add_subplot(3, 3, plot_idx)
    # Pass the column by keyword so it is always read as the x variable,
    # whichever seaborn version is installed.
    sns.countplot(x=df[col], ax=ax)
    # Rotate the tick labels only after plotting: the category labels do not
    # exist on the axes until the plot has been drawn.
    ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.show()


----------

In [None]:
# Missing Values
# Report the number of missing entries per column
print(df.isna().sum())
# Agent, Company - too many missing values, so drop both columns.
# `columns=` replaces the positional axis argument, which newer pandas rejects.
df = df.drop(columns=['company', 'agent'])
# Country - fill with 'unknown'
df['country'] = df['country'].fillna('unknown')
# babies - fill with median (previously applied to 'country' by mistake,
# where it had no effect because that column was already filled)
df['babies'] = df['babies'].fillna(df['babies'].median())

----------

In [None]:
# Variable Transformation
# lead_time -> convert to intervals
df['lead_time'].mean()
# histplot with a KDE overlay replaces the deprecated distplot
ax = sns.histplot(df['lead_time'], kde=True, stat='density')


def _lead_time_interval(lt):
    """Return the interval code of a lead time: 0 (0-100), 1 (101-199), 2 (200+)."""
    if lt <= 100:
        return 0
    if lt < 200:
        return 1
    return 2


# Intervals -> 0-100 | 100-200 | 200+
# Each interval gets its own code; previously every branch appended 0,
# which made the feature constant.
# NOTE(review): assumes lead_time has no missing or negative values - confirm.
converted = [_lead_time_interval(lt) for lt in df['lead_time']]
# Push to DF
df['t_lead_time'] = converted

In [None]:
# required_car_parking_spaces -> binary flag: 0 when no space was requested, 1 otherwise
parking_requests = df['required_car_parking_spaces']
parking_requests.hist()
# Store the flag as a new column
df['t_required_car_parking_spaces'] = [
    0 if spaces == 0 else 1 for spaces in parking_requests
]

In [None]:
# arrival_date_month -> month number (1-12)
month_numbers = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4,
    'May': 5, 'June': 6, 'July': 7, 'August': 8,
    'September': 9, 'October': 10, 'November': 11, 'December': 12,
}
# Names missing from the mapping come out as None (dict.get default)
df['t_arrival_month'] = [
    month_numbers.get(name) for name in df['arrival_date_month']
]

In [None]:
# hotel -> binary flag (City Hotel = 0, Resort Hotel = 1)
hotel_names = df['hotel']
hotel_names.hist()
hotel_codes = {'City Hotel': 0, 'Resort Hotel': 1}
# Values missing from the mapping come out as None (dict.get default)
df['t_hotel'] = [hotel_codes.get(name) for name in hotel_names]

In [None]:
# deposit_type -> integer codes
deposit_names = df['deposit_type']
deposit_names.hist()
deposit_codes = {'No Deposit': 0, 'Refundable': 1, 'Non Refundable': 2}
# Values missing from the mapping come out as None (dict.get default)
df['t_deposit_type'] = [deposit_codes.get(name) for name in deposit_names]

In [None]:
# customer_type -> Integers
df['customer_type'].hist()
customer_type = df['customer_type']
# Map customer type to integers
dct = {'Transient' : 0, 'Contract': 1, 'Transient-Party': 2, 'Group': 3}
converted = list(map(dct.get, customer_type))
# Push to DF under its own column. This used to be written to
# 't_deposit_type', which overwrote the deposit encoding the model relies on.
df['t_customer_type'] = converted

In [None]:
# dist_channel -> Integers
df['distribution_channel'].hist()
channel = df['distribution_channel']
# undefined included into TA/TO for size considerations.
# The mapping used to list 'Undefined' twice (1 and 5); the later entry won
# and contradicted the grouping described above.
dct = {'TA/TO': 1, 'Undefined': 1, 'Corporate': 2, 'Direct': 3, 'GDS': 4}
converted = list(map(dct.get, channel))
# Push to DF
df['t_dist_channel'] = converted

In [None]:
# adr -> four quantile bins, labelled 0 to 3 in ascending order of rate
adr_quartiles = pd.qcut(df['adr'], q=4, labels=[0, 1, 2, 3])
df['t_average_daily_rate'] = adr_quartiles

In [None]:
# previous_cancellations -> ordinal level (four levels, not binary)
def _cancellation_level(count):
    """Return 0 for none, 1 for one, 2 for two to four, 3 for anything else."""
    if count == 0:
        return 0
    if count == 1:
        return 1
    if 1 < count < 5:
        return 2
    return 3  # serial canceler


past_cancellations = df['previous_cancellations']
past_cancellations.value_counts()
df['t_prev_cancellations'] = [
    _cancellation_level(count) for count in past_cancellations
]

-------------

In [None]:
# Variable Creation
# arrival_season -> the season in which the customer arrives at the hotel
# Season codes: 1 = Dec-Feb, 2 = Mar-May, 3 = Jun-Aug, 4 = Sep-Nov
season_by_month = {
    'December': 1, 'January': 1, 'February': 1,
    'March': 2, 'April': 2, 'May': 2,
    'June': 3, 'July': 3, 'August': 3,
    'September': 4, 'October': 4, 'November': 4,
}
# Month names missing from the mapping come out as None (dict.get default)
df['t_arrival_season'] = [
    season_by_month.get(name) for name in df['arrival_date_month']
]
df['t_arrival_season'].hist()

In [None]:
# party_size ->  number of adults + children + babies
# Fill missing counts with 0 *before* summing. Filling the sum afterwards
# turned any booking with one missing component into a party of 0, discarding
# the counts that were present.
party = df['adults'].fillna(0) + df['children'].fillna(0) + df['babies'].fillna(0)
df['t_party'] = party

# df['t_party'] = df['t_party'].astype(int)
sns.histplot(df['t_party'])

-----------

## Model 

In [None]:
# Re-inspect the frame after the transformation cells to check the new t_* columns.
df.info()

In [None]:
# Hold out 20% of the rows for testing; the seed keeps the split reproducible
t_cols = [
    't_average_daily_rate',
    't_prev_cancellations',
    't_hotel',
    't_lead_time',
    't_deposit_type',
    'previous_bookings_not_canceled',
    't_party',
    't_required_car_parking_spaces',
]
features = df[t_cols]
target = df['is_canceled']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

In [None]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
# 200 trees; a fixed random_state makes the fitted forest reproducible.
# `m` is the model used by the fit / evaluation cells below.
m = RandomForestClassifier(n_estimators = 200, random_state = 0)

In [None]:
# # Naive Bayes
# from sklearn.naive_bayes import GaussianNB
# m = GaussianNB(priors=None, var_smoothing=1e-09)

In [None]:
# # Logistic Regression
# from sklearn.linear_model import LogisticRegression  
# m = LogisticRegression(penalty='l2')

In [None]:
# Train the selected model on the training split
m.fit(X_train, y_train)
# Predict cancellation labels for the held-out test split
y_pred = m.predict(X_test)

# from sklearn.metrics import confusion_matrix
# cm = confusion_matrix(y_test, y_pred)

In [None]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 of the test-set predictions
print(classification_report(y_test, y_pred))

#### Model report:
* f1 weighted - .63 - low 1's higher 0's
* high recall for 0
* low recall for 1
* medium precision for 1's and 0's

In [None]:
# preparing submission file
# One row per test booking: the booking's row index in `df` and the predicted
# cancellation label. The previous version referenced `test`, `yy` and the
# columns 'PassengerId' / 'Survived', none of which exist in this notebook.
submission = pd.DataFrame({'booking_index': X_test.index, 'is_canceled': y_pred})
# NOTE(review): the file name says naive bayes, but `m` may be any of the
# models defined above - consider renaming.
submission.to_csv('naive_bayes_model.csv', index=False)

In [None]:
# Evaluating the model
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the model on the training split
scores = cross_val_score(m, X_train, y_train, cv=10)
# Report the mean score with a band of two standard deviations
mean_score = scores.mean()
score_band = scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_score, score_band))