# Exercise #

In this exercise, you'll build a model to predict hotel cancellations with a binary classifier.

In [2]:
# Setup plotting
import matplotlib.pyplot as plt
plt.rc('figure', autolayout=True)
plt.rc('axes', labelweight='bold', labelsize='large',
       titleweight='bold', titlesize=18, titlepad=10)
plt.rc('animation', html='html5')

import numpy as np

import keras_tuner
import tensorflow as tf
from tensorflow.keras import layers

First, load the *Hotel Cancellations* dataset. (**fix the path** if necessary)

In [3]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

df = pd.read_csv('../data/hotel_train.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95512 entries, 0 to 95511
Data columns (total 30 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   hotel                           95512 non-null  object 
 1   is_canceled                     95512 non-null  int64  
 2   lead_time                       95512 non-null  int64  
 3   arrival_date_year               95512 non-null  int64  
 4   arrival_date_month              95512 non-null  object 
 5   arrival_date_week_number        95512 non-null  int64  
 6   arrival_date_day_of_month       95512 non-null  int64  
 7   stays_in_weekend_nights         95512 non-null  int64  
 8   stays_in_week_nights            95512 non-null  int64  
 9   adults                          95512 non-null  int64  
 10  children                        95508 non-null  float64
 11  babies                          95512 non-null  int64  
 12  meal                            

In [5]:
numerical = df.loc[:, (df.dtypes == int) | (df.dtypes == float)].columns.tolist()
categorical = df.loc[:, (df.dtypes != int) & (df.dtypes != float)].columns.tolist()

In [6]:
df.is_canceled.value_counts()

is_canceled
0    60133
1    35379
Name: count, dtype: int64

We can note that ADR (average daily rate) has negative values

In [7]:
df = df.loc[df.adr > 0,:]

We also note that reservations exist with 0 adults...

In [8]:
df = df.loc[df.adults > 0,:]

## Prepare data for ML

In [9]:
X = df.copy()
y = X.pop('is_canceled') # this is the target column


X['arrival_date_month'] = \
    X['arrival_date_month'].map(
        {'January':1, 'February': 2, 'March':3,
         'April':4, 'May':5, 'June':6, 'July':7,
         'August':8, 'September':9, 'October':10,
         'November':11, 'December':12}
    )

#X['meal'] = X['meal'].replace(['Undefined'], 'SC')
X["international"] = X.country=="PRT"
X.drop(columns=[ "country"], inplace=True)

X['kids'] = X['babies'] + X['children']
X['total_stay'] = X['stays_in_week_nights'] + X['stays_in_weekend_nights']
X['prev_canc'] = (X['previous_cancellations']>0).astype(int)
X.drop(columns=["babies","children","meal"], inplace=True)


numerical_features = X.loc[:, (X.dtypes == int) | (X.dtypes == float)].columns.tolist()
categorical_features = X.loc[:, (X.dtypes != int) & (X.dtypes != float)].columns.tolist()

transformer_num = make_pipeline(
    SimpleImputer(strategy="constant"), # default fill value=0
    StandardScaler(),
)
transformer_cat = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="NA"),
    OneHotEncoder(handle_unknown='ignore'),
)

preprocessor = make_column_transformer(
    (transformer_num, numerical_features),
    (transformer_cat, categorical_features),
)

# stratify - make sure classes are evenlly represented across splits
X_train, X_valid, y_train, y_valid = train_test_split(X, y, stratify=y, train_size=0.8)

X_train = preprocessor.fit_transform(X_train)
X_valid = preprocessor.transform(X_valid)

input_shape = [X_train.shape[1]]
input_shape

[64]

In [42]:

def build_model(hp):
    model = keras.Sequential()

    dropoutRate = hp.Float("drRate", min_value=0.1, max_value=0.3, step=0.1)
    
    model.add(layers.Dense(
            units=hp.Int("units", min_value=50, max_value=150, step=15),
            activation=hp.Choice("activation", ["relu", "leaky_relu", "tanh"]))
    )
    if hp.Boolean("dropout"):
        model.add(layers.Dropout(rate=dropoutRate))
        
    model.add(layers.Dense(
            units=hp.Int("units2", min_value=20, max_value=60, step=10),
            activation=hp.Choice("activation2", ["relu","leaky_relu",  "tanh"]))
    )
    if hp.Boolean("dropout2"):
        model.add(layers.Dropout(rate=dropoutRate))
        
    if hp.Boolean("extraLayer"):
        model.add(keras.layers.Dense(units=hp.Int("units3", min_value=5, max_value=20, step=5), activation=hp.Choice("activation3", ["relu","leaky_relu",  "tanh"])))
        
    model.add(keras.layers.Dense(1, activation="sigmoid"))
    

        
    model.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
     
    return model


build_model(keras_tuner.HyperParameters())


<keras.engine.sequential.Sequential at 0x7f34b50f67a0>

In [43]:
tuner = keras_tuner.Hyperband(
    hypermodel=build_model,
    objective="val_accuracy",
    overwrite=True,
    directory="/tmp/kerastuner",
    project_name="hotel",
)
tuner.search_space_summary()

tuner.search(X_train, y_train, validation_data=(X_valid, y_valid))

Trial 253 Complete [00h 03m 01s]
val_accuracy: 0.8699061274528503

Best val_accuracy So Far: 0.8760401010513306
Total elapsed time: 01h 20m 31s

Search: Running Trial #254

Value             |Best Value So Far |Hyperparameter
0.1               |0.2               |drRate
125               |140               |units
tanh              |relu              |activation
False             |True              |dropout
20                |60                |units2
tanh              |leaky_relu        |activation2
False             |False             |dropout2
False             |False             |extraLayer
20                |20                |units3
leaky_relu        |leaky_relu        |activation3
100               |100               |tuner/epochs
0                 |34                |tuner/initial_epoch
0                 |3                 |tuner/bracket
0                 |3                 |tuner/round

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/

KeyboardInterrupt: 

In [16]:
def final_model():
    model = tf.keras.Sequential()

    model.add(layers.Dense(units=140, activation="relu"))
    model.add(layers.Dropout(0.2))    
    model.add(layers.Dense(units=60,activation="leaky_relu"))
    model.add(layers.Dense(1, activation="sigmoid"))
    

        
    model.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
     
    return model

m = final_model()
m.fit(X_train, y_train, validation_split=0.15, epochs=100, callbacks=[tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)])

AttributeError: module 'tensorflow' has no attribute 'set_seed'

In [17]:
# Evaluation on test set
df2 = pd.read_csv('../data/hotel_test.csv')

X = df2.copy()
y_test = X.pop('is_canceled') # this is the target column


X['arrival_date_month'] = \
    X['arrival_date_month'].map(
        {'January':1, 'February': 2, 'March':3,
         'April':4, 'May':5, 'June':6, 'July':7,
         'August':8, 'September':9, 'October':10,
         'November':11, 'December':12}
    )

#X['meal'] = X['meal'].replace(['Undefined'], 'SC')
X["international"] = X.country=="PRT"
X.drop(columns=[ "country"], inplace=True)

X['kids'] = X['babies'] + X['children']
X['total_stay'] = X['stays_in_week_nights'] + X['stays_in_weekend_nights']
X['prev_canc'] = (X['previous_cancellations']>0).astype(int)
X.drop(columns=["babies","children","meal"], inplace=True)


X_test = preprocessor.transform(X)

In [18]:
m.evaluate(X_test, y_test)



[0.2911970615386963, 0.8733981251716614]