In [77]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# matplotlib setup: font sizes (in points) and the default colour cycle.
SMALL_SIZE = 8
MEDIUM_SIZE = 11
LARGE_SIZE = 14  # defined for convenience; not referenced by the settings below

plt.rcParams.update({
    'font.size': MEDIUM_SIZE,         # default text size
    'axes.titlesize': MEDIUM_SIZE,    # axes titles
    'axes.labelsize': MEDIUM_SIZE,    # x / y axis labels
    'xtick.labelsize': SMALL_SIZE,    # x tick labels
    'ytick.labelsize': SMALL_SIZE,    # y tick labels
    'legend.fontsize': SMALL_SIZE,    # legend entries
    'figure.titlesize': SMALL_SIZE,   # figure-level title
})

# Colour palettes; the second one becomes the default line/patch colour cycle.
c_palette1 = ["#264653","#2a9d8f","#e9c46a","#f4a261","#e76f51", "#d00000", "#9d0208", "#6a040f"]
c_palette2 = ["#9a031e", "#fb8b24", "#e36414", "#0f4c5c", "#5C474D"]
mpl.rcParams['axes.prop_cycle'] = mpl.cycler(color=list(c_palette2))

In [2]:
# Load the preprocessed bookings table; the path is relative to the notebook's working directory.
preprocessed_data = pd.read_csv('../data/preprocessed_data.csv')

Create the dependent variable of the problem

In [17]:
def label_creation(row):
    """Return the target label for a single booking row.

    Rows whose ``reservation_status`` is ``'Check-Out'`` are split by
    ``booking_changes``: ``'Changed'`` when it is greater than zero and
    ``'Not changed'`` when it equals zero. Every other row keeps its
    ``reservation_status`` value as the label.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide ``'reservation_status'``; ``'booking_changes'`` is only
        read for checked-out rows.

    Returns
    -------
    The label for the row.
    """
    status = row['reservation_status']
    if status != 'Check-Out':
        return status

    changes = row['booking_changes']
    if changes > 0:
        return 'Changed'
    if changes == 0:
        return 'Not changed'
    # Neither > 0 nor == 0 (e.g. a missing value): keep the original status.
    return status


In [20]:
# Label every booking row, print the resulting class counts, then store the
# labels over the original reservation_status column.
dependent_variable = preprocessed_data.apply(label_creation, axis=1)
print(dependent_variable.value_counts())
preprocessed_data['reservation_status'] = dependent_variable

In [23]:
# Show the dtype of every column to spot the non-numeric (object) ones.
preprocessed_data.dtypes

hotel                              object
is_canceled                          bool
lead_time                          object
arrival_date_year                   int64
arrival_month_sin                 float64
arrival_month_cos                 float64
total_nights                        int64
adults                              int64
has_children                         bool
meal                               object
country                            object
market_segment                     object
distribution_channel               object
is_repeated_guest                    bool
previous_cancellations              int64
previous_bookings_not_canceled      int64
assigned_room_type_as_reserved       bool
booking_changes                     int64
deposit_type                       object
agent                              object
company                            object
was_in_waiting_list                  bool
customer_type                      object
required_car_parking_spaces       

Double check that object columns are nominal:

In [22]:
# Print the distinct values of every object-dtype column so they can be inspected.
for col, values in preprocessed_data.items():
    if values.dtype != 'object':
        continue
    print('\n' + col)
    print(np.unique(values))


hotel
['City Hotel' 'Resort Hotel']

lead_time
['long-term' 'moderate' 'short-term' 'very-long-term' 'very-short-term']

meal
['BB' 'FB' 'HB' 'SC' 'Undefined']

country
['ABW' 'AGO' 'AIA' 'ALB' 'AND' 'ARE' 'ARG' 'ARM' 'ASM' 'ATA' 'ATF' 'AUS'
 'AUT' 'AZE' 'BDI' 'BEL' 'BEN' 'BFA' 'BGD' 'BGR' 'BHR' 'BHS' 'BIH' 'BLR'
 'BOL' 'BRA' 'BRB' 'BWA' 'CAF' 'CHE' 'CHL' 'CHN' 'CIV' 'CMR' 'CN' 'COL'
 'COM' 'CPV' 'CRI' 'CUB' 'CYM' 'CYP' 'CZE' 'DEU' 'DJI' 'DMA' 'DNK' 'DOM'
 'DZA' 'ECU' 'EGY' 'ESP' 'EST' 'ETH' 'FIN' 'FJI' 'FRA' 'FRO' 'GAB' 'GBR'
 'GEO' 'GGY' 'GHA' 'GIB' 'GLP' 'GNB' 'GRC' 'GTM' 'GUY' 'HKG' 'HND' 'HRV'
 'HUN' 'IDN' 'IMN' 'IND' 'IRL' 'IRN' 'IRQ' 'ISL' 'ISR' 'ITA' 'JAM' 'JEY'
 'JOR' 'JPN' 'KAZ' 'KEN' 'KHM' 'KIR' 'KNA' 'KOR' 'KWT' 'LAO' 'LBN' 'LBY'
 'LCA' 'LIE' 'LKA' 'LTU' 'LUX' 'LVA' 'MAC' 'MAR' 'MCO' 'MDG' 'MDV' 'MEX'
 'MKD' 'MLI' 'MLT' 'MMR' 'MNE' 'MOZ' 'MRT' 'MUS' 'MWI' 'MYS' 'MYT' 'NAM'
 'NCL' 'NGA' 'NIC' 'NLD' 'NOR' 'NPL' 'NZL' 'OMN' 'PAK' 'PAN' 'PER' 'PHL'
 'PLW' 'POL' 'PRI' 'PRT' 'PR

In [57]:
# Columns used as model inputs.
predictors = [
    'lead_time',
    'has_children',
    'meal',
    'adults',
    'distribution_channel',
    'previous_cancellations',
    'deposit_type',
    'customer_type',
]
# Wider feature list kept for reference (not in use):
# ['lead_time', 'arrival_month_sin', 'total_nights',
#  'has_children', 'meal', 'adults', 'distribution_channel',
#  'previous_cancellations', 'assigned_room_type_as_reserved',
#  'deposit_type', 'customer_type', 'has_special_requests']

# Target labels and feature frame.
y = preprocessed_data['reservation_status']
df = preprocessed_data[predictors]

In [58]:
# Convert the categorical predictor columns into dummy/indicator columns.
# Note that `df` is rebound to the encoded frame.
df = pd.get_dummies(df)

In [60]:
# Hold out 40% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.4, random_state=42)


### RF

In [67]:
# Hyper-parameter grid for the random forest. GridSearchCV evaluates every
# combination with its default cross-validation settings.
param_grid = {
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    # 'criterion': ['gini', 'entropy'],
    # 'n_estimators': [50, 100, 150],
}
rf = RandomForestClassifier(random_state=0)
clf = GridSearchCV(estimator=rf, param_grid=param_grid)
clf.fit(X_train, y_train)

GridSearchCV(estimator=RandomForestClassifier(random_state=0),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [5, 10, 15],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [50, 100, 150]})

In [68]:
# Estimator configuration selected by the grid search.
clf.best_estimator_

RandomForestClassifier(criterion='entropy', max_depth=15, min_samples_split=5,
                       random_state=0)

In [69]:
# Predict labels for the held-out test rows with the fitted grid search.
y_pred = clf.predict(X_test)

In [70]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred, normalize=True)

0.6568938528578077

### SVC

In [74]:
# Standardise the features for the SVC. The scaler is fitted on the training
# rows only, and the same fitted transform is then applied to the test rows,
# so the model is evaluated on features in the scale it was trained on.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Hyper-parameter grid for the SVC. `degree` only affects the polynomial
# kernel, so it is varied for 'poly' alone; crossing it with the other
# kernels would refit the same model three times each.
param_grid = [
    {'kernel': ['linear', 'rbf', 'sigmoid']},
    {'kernel': ['poly'], 'degree': [1, 2, 3]},
]
svc = SVC()
clf = GridSearchCV(svc, param_grid)
clf.fit(X_train, y_train)

In [None]:
# Estimator configuration selected by the grid search.
clf.best_estimator_

In [None]:
# Neither GridSearchCV nor SVC exposes `feature_importances_`. The closest
# SVC equivalent is the per-feature weight matrix `coef_`, which only exists
# when the selected kernel is linear.
best_svc = clf.best_estimator_
if best_svc.kernel == 'linear':
    print(best_svc.coef_)
else:
    print("No per-feature weights: coef_ is only available for kernel='linear' "
          "(selected kernel: {!r})".format(best_svc.kernel))

In [None]:
# Predict labels for the held-out test rows with the fitted grid search.
# NOTE(review): the model was fitted on standardised features, so X_test needs
# to have gone through the same fitted scaler for these predictions to be meaningful.
y_pred = clf.predict(X_test)

In [None]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred, normalize=True)