In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, StandardScaler
from sklearn.decomposition import PCA

from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.cluster import AgglomerativeClustering, KMeans
from scipy.cluster.hierarchy import dendrogram, linkage


----------------

In [17]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

Can we predict if a customer will cancel their bookings?

In [42]:
bookings = pd.read_csv('hotel_bookings.csv')
bookings.isna().sum()

hotel                                  0
is_canceled                            0
lead_time                              0
arrival_date_year                      0
arrival_date_month                     0
arrival_date_week_number               0
arrival_date_day_of_month              0
stays_in_weekend_nights                0
stays_in_week_nights                   0
adults                                 0
children                               4
babies                                 0
meal                                   0
country                              488
market_segment                         0
distribution_channel                   0
is_repeated_guest                      0
previous_cancellations                 0
previous_bookings_not_canceled         0
reserved_room_type                     0
assigned_room_type                     0
booking_changes                        0
deposit_type                           0
agent                              16340
company         

Removing columns that have a lot of missing values, and reservation_status which seems to be related to the label.  Also removing values that will grow the feature list by a lot when we one hot encode the categorical features, such as country, meal, arrival_date, and reservation_status_date.

In [43]:
bookings.drop(['agent', 'company', 'reservation_status', 'reservation_status_date', 'meal', 'arrival_date_month', 'country'], axis= 1, inplace= True)
bookings.dropna(inplace= True)
bookings = bookings.reset_index(drop= True)
bookings.dtypes



hotel                              object
is_canceled                         int64
lead_time                           int64
arrival_date_year                   int64
arrival_date_week_number            int64
arrival_date_day_of_month           int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
adults                              int64
children                          float64
babies                              int64
market_segment                     object
distribution_channel               object
is_repeated_guest                   int64
previous_cancellations              int64
previous_bookings_not_canceled      int64
reserved_room_type                 object
assigned_room_type                 object
booking_changes                     int64
deposit_type                       object
days_in_waiting_list                int64
customer_type                      object
adr                               float64
required_car_parking_spaces       

In [44]:
X = bookings.drop(['is_canceled'], axis= 1)
y = bookings['is_canceled']
X_df = X.select_dtypes(['number'])

In [45]:

X_df = pd.DataFrame(scaler.fit_transform(X_df), columns= X_df.columns)


In [46]:
X = pd.concat([X_df, pd.get_dummies(X.select_dtypes(['object']))], axis= 1)

In [55]:
X.shape

(119386, 60)

60 Features sounds great, but too much for the model to handle.  Run through PCA

In [49]:
pca = PCA(n_components = 0.99)

X_pca = pca.fit_transform(X)

pca.n_components_

28

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size= 0.30)

In [53]:
log_reg = sm.Logit(y_train, X_train).fit()

print(log_reg.summary())

Optimization terminated successfully.
         Current function value: 0.464062
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:            is_canceled   No. Observations:                83570
Model:                          Logit   Df Residuals:                    83542
Method:                           MLE   Df Model:                           27
Date:                Fri, 12 Mar 2021   Pseudo R-squ.:                  0.2951
Time:                        19:07:03   Log-Likelihood:                -38782.
converged:                       True   LL-Null:                       -55020.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
x1            -1.6080      0.017    -96.294      0.000      -1.641      -1.575
x2            -0.5552      0.

In [54]:
y_pred = list(map(round, log_reg.predict(X_test)))
accuracy_score(y_pred, y_test)


0.7900658923386196

In [56]:
y_pred = list(map(round, log_reg.predict(X_train)))
accuracy_score(y_pred, y_train)

0.7872801244465717

The model is getting about 80% accurate.