In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from keras import models
from keras import layers

In [3]:
df = pd.read_csv("hotel_bookings.csv")
portugal_holiday = pd.read_csv("portugal_holiday.csv", sep=';')

In [5]:
month_dict = {
    'January' : 1,
    'February' : 2,
    'March' : 3,
    'April' : 4,
    'May' : 5,
    'June' : 6,
    'July' : 7,
    'August' : 8,
    'September' : 9,
    'October' : 10,
    'November' : 11,
    'December' : 12
}

df['arrival_date_month'] = df['arrival_date_month'].apply(lambda x: month_dict[x])
df.rename(columns={"arrival_date_year": "year", "arrival_date_month": "month", 'arrival_date_week_number':'week', 'arrival_date_day_of_month':'day'}, inplace=True)
df['full_date'] = pd.to_datetime(df[['year', 'month', 'day']])
df['week_date'] = df['year'].astype(str) +'/' + df['full_date'].dt.week.astype(str)
df['month_date'] = df['year'].astype(str) + '/' + df['full_date'].dt.month.astype(str)

In [6]:
awr = df[['week_date', 'adr']].groupby(by='week_date').mean()
amr = df[['month_date', 'adr']].groupby(by='month_date').mean()
df['awr'] = df['week_date'].apply(lambda date: awr.loc[date,'adr'])
df['amr'] = df['month_date'].apply(lambda date: amr.loc[date,'adr'])
df['adr_min_awr'] = df['adr'] - df['awr']
df['adr_min_amr'] = df['adr'] - df['amr']

In [7]:
portugal_holiday = pd.to_datetime(portugal_holiday)
holiday_list = portugal_holiday.tolist()
portugal_week = portugal_holiday.dt.year.astype(str) + '/' + portugal_holiday.dt.week.astype(str)
holiday_week = portugal_week.tolist()

In [8]:
df['is_holiday_day'] = df['full_date'].apply(lambda date : 1 if date in holiday_list else 0)
df['is_holiday_week'] = df['week_date'].apply(lambda date : 1 if date in holiday_week else 0)
df['reservation_date'] = df['full_date'] - pd.to_timedelta(df['lead_time'], unit='d')
df['is_room_change'] = df['reserved_room_type'] != df['assigned_room_type']
df['is_weekend'] = df['full_date'].dt.weekday >= 5
df['is_booking_changes'] = df['booking_changes'] > 0
df['is_request'] = df['total_of_special_requests'] > 0
df['is_company_null'] = df['company'].isnull()
df['is_agent_null'] = df['agent'].isnull()
df['children'] = df['children'].fillna(df['children'].mode()[0])
df['is_portugal'] = df['country'] == 'PRT'
df['cancel_ratio'] = df['previous_cancellations']/(df['previous_cancellations']+df['previous_bookings_not_canceled'])
df['total_person'] = df['adults'] + df['children'] + df['babies']

In [9]:
df['cancel_ratio'] = df['cancel_ratio'].fillna(0)

In [10]:
binning_list = np.arange(-1, 751, 25)
df['lead_binned'] = pd.cut(df['lead_time'], binning_list)
df['is_not_canceled'] = df['is_canceled'].apply(lambda x : 1 if x == 0 else 0)

In [11]:
df_ready = df.drop(['week', 'country', 'full_date', 'week_date', 'month_date', 'reservation_date', 'is_not_canceled', 'reservation_status_date', 'reservation_status', 'agent', 'company'], axis=1)
df_ready = df_ready.astype({'is_portugal':int, 'is_room_change':int, 'is_weekend':int, 'is_booking_changes':int, 'is_request':int, 'is_company_null':int, 'is_agent_null':int})

In [15]:
le = preprocessing.LabelEncoder()
le.fit(df_ready['lead_binned'])
df_ready['lead_binned'] = le.transform(df_ready['lead_binned'])

le = preprocessing.LabelEncoder()
le.fit(df_ready['reserved_room_type'])
df_ready['reserved_room_type'] = le.transform(df_ready['reserved_room_type'])

le = preprocessing.LabelEncoder()
le.fit(df_ready['assigned_room_type'])
df_ready['assigned_room_type'] = le.transform(df_ready['assigned_room_type'])

In [16]:
ohc_df = pd.get_dummies(df_ready[['hotel', 'meal', 'market_segment', 'distribution_channel', 'deposit_type', 'customer_type']], drop_first=True)
df_ready.drop(['hotel', 'meal', 'market_segment', 'distribution_channel', 'deposit_type', 'customer_type'], axis=1, inplace=True)
df_ready = pd.concat([df_ready, ohc_df], axis=1)

In [50]:
X = df_ready.drop(['is_canceled'], axis=1).to_numpy()
scaler = preprocessing.StandardScaler()
X_standardized = scaler.fit_transform(X)
y = df_ready['is_canceled'].to_numpy()

In [51]:
print("Mean Before Standardized:", round(X[:,0].mean()))
print("Standard deviation Before Standardized:", X[:,0].std())

print("\nMean After Standardized:", round(X_standardized[:,0].mean()))
print("Standard deviation After Standardized:", X_standardized[:,0].std())

Mean Before Standardized: 104.0
Standard deviation Before Standardized: 106.86264950916215

Mean After Standardized: 0.0
Standard deviation After Standardized: 1.0


In [52]:
network = models.Sequential()
network.add(layers.Dense(units=16, activation="relu", input_shape=(X.shape[1],)))
network.add(layers.Dense(units=16, activation="relu"))
network.add(layers.Dense(units=1, activation="sigmoid"))
network.compile(loss="binary_crossentropy",
                optimizer="adam", 
                metrics=["accuracy"]) 

In [58]:
cv = StratifiedKFold(n_splits=5, random_state=None, shuffle=True)
network_score = []
for train_idx, test_idx in cv.split(X_standardized,y):
    network.fit(X[train_idx], y[train_idx], 
                epochs=10, # Number of epochs
                verbose=1, # Print description after each epoch
                batch_size=100)
    y_pred = network.predict(X[test_idx])
    network_score.append(accuracy_score(y[test_idx], y_pred.round()))
    print(network_score[-1])

network_score = np.array(network_score)
print("\nAccuracy mean:", network_score.mean())
print("Accuracy std:",network_score.std())

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.8096235185728046
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.8136778624675434
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.833235614373063
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.835162073875534
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.8313020898772878

Accuracy mean: 0.8246002318332465
Accuracy std: 0.010720422872480624
