In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import PredefinedSplit
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

import random

# Seed Python's built-in RNG (used further down to sample validation rows).
# Note this does not seed NumPy; the sklearn calls pass random_state explicitly.
random.seed(42)

# --- matplotlib setup -------------------------------------------------------
# Font sizes shared by all figures in this notebook.
SMALL_SIZE = 8
MEDIUM_SIZE = 11
LARGE_SIZE = 14

plt.rc('font', size=MEDIUM_SIZE)
plt.rc('axes', titlesize=MEDIUM_SIZE, labelsize=MEDIUM_SIZE)
for tick_group in ('xtick', 'ytick'):
    plt.rc(tick_group, labelsize=SMALL_SIZE)
plt.rc('legend', fontsize=SMALL_SIZE)
plt.rc('figure', titlesize=SMALL_SIZE)

# Colour palettes: palette 1 is passed explicitly to the class-coloured plots,
# palette 2 becomes the default colour cycle for every axes.
c_palette1 = [
    "#264653", "#2a9d8f", "#e9c46a", "#f4a261",
    "#e76f51", "#d00000", "#9d0208", "#6a040f",
]
c_palette2 = ["#9a031e", "#fb8b24", "#e36414", "#0f4c5c", "#5C474D"]
mpl.rcParams['axes.prop_cycle'] = mpl.cycler(color=list(c_palette2))

In [None]:
# Load the preprocessed dataset; the path is relative to the notebook's working directory.
preprocessed_data = pd.read_csv('../data/preprocessed_data.csv')

In [None]:
# Remove 'booking_changes' from the frame before modelling.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution is a no-op instead of raising KeyError.
preprocessed_data = preprocessed_data.drop(columns=['booking_changes'], errors='ignore')

In [None]:
# Inspect the dtype of every column to see which are numeric and which are object columns.
preprocessed_data.dtypes

In [None]:
# Parallel-coordinates view of the object-typed columns, one line per row,
# coloured by the 'reservation_status' class.
fig, ax = plt.subplots(figsize=(4, 4))
object_columns = preprocessed_data.select_dtypes(include='object')
pd.plotting.parallel_coordinates(
    object_columns,
    'reservation_status',
    color=c_palette1,
    ax=ax,  # draw on the axes created above (the original relied on it being the current axes)
)
plt.xticks(rotation='vertical');
# Hide the y axis of this plot.
ax.yaxis.set_visible(False)

In [None]:
# Parallel-coordinates view of the float64/int64 columns, coloured by
# 'reservation_status'. 'arrival_date_year' is left out of the plot.
fig, ax = plt.subplots(figsize=(4, 4))
tmp = (
    preprocessed_data
    .select_dtypes(include=['float64', 'int64'])
    .assign(reservation_status=preprocessed_data['reservation_status'])
    .drop(columns='arrival_date_year')
)
pd.plotting.parallel_coordinates(tmp, 'reservation_status', color=c_palette1, ax=ax)
plt.xticks(rotation='vertical');

Double check that object columns are nominal:

In [None]:
# Print the distinct values of every object-typed column so they can be
# checked by eye before one-hot encoding.
is_object = preprocessed_data.dtypes == 'object'
for col in preprocessed_data.columns[is_object]:
    levels = np.unique(preprocessed_data[col])
    print('\n' + col)
    print(levels)

In [None]:
# Target is 'reservation_status'; every other column is a feature.
y = preprocessed_data['reservation_status']
# get_dummies expands the non-numeric feature columns into indicator columns.
df = pd.get_dummies(preprocessed_data.drop(columns='reservation_status'))
# Keep 10% of the rows aside as the test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    df, y, test_size=0.1, random_state=42
)


Define a fixed validation fold (about 10% of the training rows) to use for hyperparameter tuning.

In [None]:
# Build a single, fixed validation fold for GridSearchCV from ~10% of the training rows.
n_validation = len(X_train) // 10

# Positions (not index labels) of the validation rows. random.sample draws without
# replacement, so the fold contains exactly n_validation distinct rows.
ind = random.sample(range(len(X_train)), n_validation)

# PredefinedSplit semantics: test_fold[i] == -1 keeps row i in the training part of
# every split; test_fold[i] == 0 puts row i in test fold 0. The validation rows
# therefore get 0 and all remaining rows get -1 (the original had these reversed,
# which trained on the 10% sample and scored on the other 90%).
validation_split = np.full(len(X_train), -1, dtype='int64')
validation_split[ind] = 0
cv = PredefinedSplit(validation_split)

### RF - tuning max_samples only

In [None]:
# Tune only max_samples (rows drawn per tree) for a forest of fixed size and depth.
# Candidates are sqrt(n/10), sqrt(n/5), sqrt(n/20) and None (max_samples left unset).
n_train = len(X_train)
max_samples_options = [int(np.sqrt(n_train / divisor)) for divisor in (10, 5, 20)]
max_samples_options.append(None)
param_grid = {'max_samples': max_samples_options}

rf = RandomForestClassifier(n_estimators=250, max_depth=10, random_state=42)
# cv is the predefined split built earlier, so each candidate is scored on one fixed fold.
clf = GridSearchCV(rf, param_grid, cv=cv)
clf.fit(X_train, y_train)

In [None]:
# Show the forest configuration selected by the max_samples search.
clf.best_estimator_

In [None]:
# Extract the rows at the sampled positions `ind` as plain arrays for scoring.
# NOTE(review): these rows are taken from X_train, and GridSearchCV refits the best
# estimator on all of X_train by default, so scores on this subset are likely
# optimistic — confirm final numbers on X_test.
X_validation = X_train.values[ind]
y_validation = y_train.values[ind]
# Display the validation labels.
pd.Series(y_validation)

In [None]:
# Predict the validation rows with the selected forest and report the
# fraction of correct predictions (accuracy_score normalises by default).
y_pred = clf.predict(X_validation)
accuracy_score(y_validation, y_pred)

In [None]:
# Report micro- and macro-averaged F1, precision and recall for the validation rows.
# Each line names its averaging mode; previously the micro and macro variants of a
# metric were printed with the same label and could not be told apart.
for metric in (f1_score, precision_score, recall_score):
    for averaging in ('micro', 'macro'):
        score = metric(y_validation, y_pred, average=averaging)
        print(f'{metric.__name__} ({averaging}): ', score)


### RF - tuning all parameters

In [None]:
# Joint search over depth, split threshold, split criterion, forest size and
# per-tree sample size: 5 * 3 * 2 * 3 * 4 = 360 candidate forests, each scored
# on the single predefined validation fold.
n_train = len(X_train)
param_grid = {
    'max_depth': list(range(15, 40, 5)),
    'min_samples_split': [2, 5, 10],
    'criterion': ['gini', 'entropy'],
    'n_estimators': [100, 250, 400],
    # sqrt(n/10), sqrt(n/5), sqrt(n/20), then None (max_samples left unset)
    'max_samples': [int(np.sqrt(n_train / divisor)) for divisor in (10, 5, 20)] + [None],
}

rf = RandomForestClassifier(random_state=42)
clf = GridSearchCV(rf, param_grid, cv=cv)
clf.fit(X_train, y_train)

In [None]:
# Show the forest configuration selected by the full parameter search.
clf.best_estimator_

In [None]:
# Accuracy of the forest chosen by the full search on the validation rows
# (accuracy_score returns the fraction of correct predictions by default).
y_pred = clf.predict(X_validation)
accuracy_score(y_validation, y_pred)

## KNN

In [None]:
# Feature subset used by the distance/margin-based models below.
predictors = [
    'deposit_type',
    'arrival_month_sin',
    'arrival_month_cos',
    'hotel',
    'total_nights',
    'meal',
]
# Indicator-encode the non-numeric predictors; the target is unchanged.
df = pd.get_dummies(preprocessed_data[predictors])
y = preprocessed_data['reservation_status']
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.1, random_state=42)

# Standardise using statistics from the training rows only, then apply the same
# transform to the test rows. After this X_train / X_test are NumPy arrays.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Rebuild the validation arrays from the new feature matrix. The previous
# X_validation was cut from the random-forest features (different columns,
# unscaled), so the models fitted below could not be scored on it.
# `ind` holds row positions; the split above uses the same row count and
# random_state as the earlier split, so the positions address the same rows.
X_validation = X_train[ind]
y_validation = y_train.values[ind]

In [None]:
# Search 3 neighbourhood sizes x 2 weighting schemes. No cv argument is given,
# so GridSearchCV uses its default cross-validation rather than the predefined fold.
param_grid = dict(
    n_neighbors=[3, 5, 11],
    weights=['uniform', 'distance'],
)
# The n_neighbors given here is only a starting value; the grid overrides it per candidate.
knn = KNeighborsClassifier(n_neighbors=3)
clf = GridSearchCV(knn, param_grid)
clf.fit(X_train, y_train)

In [None]:
# Show the KNN configuration selected by the grid search.
clf.best_estimator_

In [None]:
# Accuracy of the selected KNN model on the validation rows.
# NOTE(review): X_validation must have the same columns and scaling as the
# X_train this search was fitted on — verify before trusting the score.
y_pred = clf.predict(X_validation)
accuracy_score(y_validation, y_pred)

## SVC

In [None]:
# Kernel search for the support-vector classifier.
# `degree` only affects the polynomial kernel, so it is varied for 'poly' alone.
# Crossing it with every kernel (as before) fitted each non-polynomial kernel
# three times with identical results: 12 candidates instead of the 6 needed.
param_grid = [
    {'kernel': ['linear', 'rbf', 'sigmoid']},
    {'kernel': ['poly'], 'degree': [1, 2, 3]},
]
svc = SVC()
# No cv argument: GridSearchCV falls back to its default cross-validation.
clf = GridSearchCV(svc, param_grid)
clf.fit(X_train, y_train)

In [None]:
# Show the SVC configuration selected by the grid search.
clf.best_estimator_

In [None]:
# Accuracy of the selected SVC on the validation rows.
# NOTE(review): X_validation must have the same columns and scaling as the
# X_train this search was fitted on — verify before trusting the score.
y_pred = clf.predict(X_validation)
accuracy_score(y_validation, y_pred)

## Logistic regression

In [None]:
# Regularisation search for logistic regression.
# The 'sag' solver only handles the l2 / unpenalised cases, so the l1 and
# elasticnet candidates could never be fitted; 'saga' supports all four.
c_values = [1e-1, 1, 10]
param_grid = [
    {'penalty': ['l1', 'l2'], 'C': c_values},
    # elasticnet additionally requires l1_ratio (mix between l1 and l2).
    {'penalty': ['elasticnet'], 'C': c_values, 'l1_ratio': [0.5]},
    # C has no effect without a penalty, so it is not varied here.
    # NOTE(review): scikit-learn >= 1.2 spells this option None instead of 'none';
    # adjust to the installed version.
    {'penalty': ['none']},
]
# saga is a stochastic solver; fixing random_state makes the search repeatable.
lr = LogisticRegression(solver='saga', random_state=42)
# No cv argument: GridSearchCV falls back to its default cross-validation.
clf = GridSearchCV(lr, param_grid)
clf.fit(X_train, y_train)

In [None]:
# Show the logistic-regression configuration selected by the grid search.
clf.best_estimator_

In [None]:
# Accuracy of the selected logistic-regression model on the validation rows.
# NOTE(review): X_validation must have the same columns and scaling as the
# X_train this search was fitted on — verify before trusting the score.
y_pred = clf.predict(X_validation)
accuracy_score(y_validation, y_pred)