In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve,precision_recall_curve
from sklearn.metrics import precision_recall_curve,f1_score, roc_auc_score

In [3]:
df_train = pd.read_csv('../data/train.csv')
df_train

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_month,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,2,0,2,3,1,0,1,244,10,1,0,0,0,85.85,0,1
1,2,0,0,3,1,0,1,276,9,1,0,0,0,95.00,0,1
2,2,0,1,2,1,0,1,233,10,1,0,0,0,90.00,0,1
3,2,0,2,2,1,0,1,156,7,1,0,0,0,93.08,1,1
4,2,0,0,3,1,0,1,76,4,1,0,0,0,69.33,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29015,1,0,2,5,1,0,1,34,11,1,0,0,0,74.80,2,1
29016,2,0,0,2,2,0,1,346,9,0,0,0,0,115.00,1,1
29017,3,0,0,4,1,0,4,175,8,1,0,0,0,130.05,1,1
29018,1,0,0,3,1,0,1,5,12,0,0,0,0,64.80,0,0


In [4]:
X = df_train.drop('booking_status', axis=1)
y = df_train['booking_status']

In [5]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=42)

X_resampled, y_resampled = rus.fit_resample(X, y)

In [6]:
valor = y_resampled.value_counts()
suma = valor.sum()
porcentaje = valor*100/suma
print(porcentaje)

booking_status
0    50.0
1    50.0
Name: count, dtype: float64


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [9]:
print(X_resampled.shape)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(18968, 15)
(15174, 15)
(15174,)
(3794, 15)
(3794,)


In [10]:
y_resampled.value_counts(normalize=True)

booking_status
0    0.5
1    0.5
Name: proportion, dtype: float64

In [11]:
y_train.value_counts(normalize=True)

booking_status
1    0.501582
0    0.498418
Name: proportion, dtype: float64

In [12]:
y_test.value_counts(normalize=True)

booking_status
0    0.506326
1    0.493674
Name: proportion, dtype: float64

In [13]:
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("selectkbest", SelectKBest()),
    ("classifier", RandomForestClassifier())
])

In [14]:
log_params = {
    'selectkbest__k':np.arange(5,15),
    'classifier': [LogisticRegression()],
    'classifier__C': [0.1,1,10]
}
rf_params = {
    'scaler': [StandardScaler(), None],
    'selectkbest__k':np.arange(5,10),
    'classifier': [RandomForestClassifier()],
    'classifier__max_depth': [3,5,7]
}
gb_params = {
    'scaler': [StandardScaler(), None],
    'selectkbest__k':np.arange(5,15),
    'classifier': [GradientBoostingClassifier()],
    'classifier__max_depth': [3,5,7]
}
knn_params = {
    'selectkbest__k':np.arange(5,15),
    'classifier': [KNeighborsClassifier()],
    'classifier__n_neighbors': np.arange(5,15)
}
svm_params = {
    'selectkbest__k':np.arange(5,15),
    'classifier': [SVC()],
    'classifier__C': [0.1,1,10]
}

In [15]:
search_space = [
    log_params,
    rf_params,
    gb_params,
    knn_params,
    svm_params   
]

In [16]:
clf_gs = GridSearchCV(estimator=pipe, param_grid=search_space, cv=3, scoring="accuracy", verbose=3, n_jobs=-1)

In [17]:
clf_gs.fit(X_train, y_train)

Fitting 3 folds for each of 280 candidates, totalling 840 fits


In [18]:
print(clf_gs.best_estimator_)
print(clf_gs.best_score_)
print(clf_gs.best_params_)

Pipeline(steps=[('scaler', None), ('selectkbest', SelectKBest(k=14)),
                ('classifier', GradientBoostingClassifier(max_depth=7))])
0.857848952155002
{'classifier': GradientBoostingClassifier(), 'classifier__max_depth': 7, 'scaler': None, 'selectkbest__k': 14}


In [19]:
final_model = clf_gs.best_estimator_
final_model.fit(X_train, y_train)
y_pred = final_model.predict(X_test)

In [20]:
print("accuracy_score", accuracy_score(y_test, y_pred))
print("precision_score", precision_score(y_test, y_pred))
print("recall_score", recall_score(y_test, y_pred))
print("roc_auc_score", roc_auc_score(y_test, y_pred))
print("confusion_matrix\n", confusion_matrix(y_test, y_pred))

accuracy_score 0.8508170795993675
precision_score 0.8604522890237176
recall_score 0.8328884143085958
roc_auc_score 0.8505930879455526
confusion_matrix
 [[1668  253]
 [ 313 1560]]
