In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('../data/H1.csv')
df.append(pd.read_csv('../data/H2.csv'))
df.columns

Index(['IsCanceled', 'LeadTime', 'ArrivalDateYear', 'ArrivalDateMonth',
       'ArrivalDateWeekNumber', 'ArrivalDateDayOfMonth',
       'StaysInWeekendNights', 'StaysInWeekNights', 'Adults', 'Children',
       'Babies', 'Meal', 'Country', 'MarketSegment', 'DistributionChannel',
       'IsRepeatedGuest', 'PreviousCancellations',
       'PreviousBookingsNotCanceled', 'ReservedRoomType', 'AssignedRoomType',
       'BookingChanges', 'DepositType', 'Agent', 'Company',
       'DaysInWaitingList', 'CustomerType', 'ADR', 'RequiredCarParkingSpaces',
       'TotalOfSpecialRequests', 'ReservationStatus', 'ReservationStatusDate'],
      dtype='object')

In [3]:
X = df.iloc[:, df.columns!='ReservationStatus']
y = df.iloc[:, -2]
print(X.shape)
print(y.shape)
print(type(y.values))

(40060, 30)
(40060,)
<class 'numpy.ndarray'>


In [4]:
y, labels = pd.factorize(y.values)

In [5]:
y = pd.Series(y)

In [6]:
X.columns

Index(['IsCanceled', 'LeadTime', 'ArrivalDateYear', 'ArrivalDateMonth',
       'ArrivalDateWeekNumber', 'ArrivalDateDayOfMonth',
       'StaysInWeekendNights', 'StaysInWeekNights', 'Adults', 'Children',
       'Babies', 'Meal', 'Country', 'MarketSegment', 'DistributionChannel',
       'IsRepeatedGuest', 'PreviousCancellations',
       'PreviousBookingsNotCanceled', 'ReservedRoomType', 'AssignedRoomType',
       'BookingChanges', 'DepositType', 'Agent', 'Company',
       'DaysInWaitingList', 'CustomerType', 'ADR', 'RequiredCarParkingSpaces',
       'TotalOfSpecialRequests', 'ReservationStatusDate'],
      dtype='object')

In [7]:
categorical = ['Agent', 'ArrivalDateMonth', 'Meal', 'Country', 'MarketSegment', 'DistributionChannel', 'ReservedRoomType', 'AssignedRoomType', 'DepositType', 'CustomerType', 'ArrivalDateYear']

In [8]:
X.Company.value_counts()

       NULL    36952
        223      784
        281      138
        154      133
        405      100
         94       87
        135       64
        498       58
        331       58
         47       56
        110       51
         20       50
        342       48
        270       43
        195       38
        174       36
        113       36
        204       34
        307       34
        269       33
         86       32
        308       32
        343       29
        178       27
        169       26
        337       25
        507       23
          9       22
         88       22
        477       22
               ...  
        520        1
        407        1
        373        1
        413        1
        410        1
        496        1
        386        1
         80        1
        126        1
        242        1
        102        1
        109        1
        318        1
        425        1
        501        1
        193        1
        478  

In [9]:
#Dropping the column Company
X.drop(['Company'], axis = 1, inplace = True)

In [10]:
#Dropping the column IsCancelled
X.drop(['IsCanceled'], axis = 1, inplace = True)

In [11]:
#Dropping the column ReservationStatusDate
X.drop(['ReservationStatusDate'], axis = 1, inplace = True)

In [12]:
#Dropping the column Children
X.drop(['Children'], axis = 1, inplace = True)

In [13]:
#for cat in categorical:
#    X[cat], l = pd.factorize(X[cat])

In [14]:
X = pd.get_dummies(X, columns = categorical, drop_first=True)

In [15]:
X.columns

Index(['LeadTime', 'ArrivalDateWeekNumber', 'ArrivalDateDayOfMonth',
       'StaysInWeekendNights', 'StaysInWeekNights', 'Adults', 'Babies',
       'IsRepeatedGuest', 'PreviousCancellations',
       'PreviousBookingsNotCanceled',
       ...
       'AssignedRoomType_I               ',
       'AssignedRoomType_L               ',
       'AssignedRoomType_P               ', 'DepositType_Non Refund     ',
       'DepositType_Refundable     ', 'CustomerType_Group',
       'CustomerType_Transient', 'CustomerType_Transient-Party',
       'ArrivalDateYear_2016', 'ArrivalDateYear_2017'],
      dtype='object', length=373)

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

In [17]:
print(X.head())

   LeadTime  ArrivalDateWeekNumber  ArrivalDateDayOfMonth  \
0       342                     27                      1   
1       737                     27                      1   
2         7                     27                      1   
3        13                     27                      1   
4        14                     27                      1   

   StaysInWeekendNights  StaysInWeekNights  Adults  Babies  IsRepeatedGuest  \
0                     0                  0       2       0                0   
1                     0                  0       2       0                0   
2                     0                  1       1       0                0   
3                     0                  1       1       0                0   
4                     0                  2       2       0                0   

   PreviousCancellations  PreviousBookingsNotCanceled          ...           \
0                      0                            0          ...            


In [18]:
from sklearn.preprocessing import StandardScaler  
feature_scaler = StandardScaler()
X_train = feature_scaler.fit_transform(X_train)
X_test = feature_scaler.transform(X_test)

In [19]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators = 10, random_state = 0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test);

  from numpy.core.umath_tests import inner1d


In [20]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_pred, y_test))

0.890564153769346


In [21]:
from sklearn.model_selection import cross_val_score  
all_accuracies = cross_val_score(estimator = clf, X = X_test, y = y_test, cv = 5)
all_accuracies

array([0.86526946, 0.87019471, 0.86769845, 0.87718422, 0.86763237])