In [41]:
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Transformers
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.decomposition import PCA

# Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

pd.set_option("display.max_rows", 500)

In [11]:
url = 'https://raw.githubusercontent.com/pedroteche-ih/DAFT_MEX_202209/main/data/tb_hotel_traintest.csv'
tb_hotel = pd.read_csv(url, parse_dates = ['arrival_date', 'reservation_status_date'])


In [13]:
tb_hotel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113409 entries, 0 to 113408
Data columns (total 29 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   hotel                           113409 non-null  object        
 1   is_cancelled                    113409 non-null  int64         
 2   lead_time                       113409 non-null  int64         
 3   stays_in_weekend_nights         113409 non-null  int64         
 4   stays_in_week_nights            113409 non-null  int64         
 5   adults                          113409 non-null  int64         
 6   children                        113406 non-null  float64       
 7   babies                          113409 non-null  int64         
 8   meal                            113409 non-null  object        
 9   country                         112951 non-null  object        
 10  market_segment                  113409 non-null  object 

# Building & Evaluating Models in SKLEARN

In [27]:
def clean_hotel_data(hotel_data):
    hotel_data['children'] = hotel_data['children'].fillna(0)
    hotel_data['country'] = hotel_data['country'].fillna('Unknown')
    hotel_data['is_company'] = np.where(hotel_data['company'].isna(), 0, 1)
    hotel_data['is_agent'] = np.where(hotel_data['agent'].isna(), 0, 1)
    hotel_data = hotel_data.drop(['company', 'agent', 'id_booking', 'reservation_status_date'], axis = 1).dropna()

    return hotel_data

In [29]:
tb_hotel_clean = clean_hotel_data(tb_hotel)
X = tb_hotel_clean.drop('is_cancelled', axis = 1)
y = tb_hotel_clean['is_cancelled']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

In [30]:
X_train_num = X_train.select_dtypes(include = 'number')
X_test_num = X_test[X_train_num.columns]

## Building a simple model

In [42]:
power_trans = PowerTransformer()
scaler = StandardScaler()
pca = PCA()
dt_fit = DecisionTreeClassifier()
log_fit = LogisticRegression()

X_train_num_t = power_trans.fit_transform(X_train_num)
X_train_num_s = scaler.fit_transform(X_train_num_t)
X_train_num_p = scaler.fit_transform(X_train_num_s)

In [43]:
log_fit.fit(X_train_num_p, y_train)

LogisticRegression()

In [44]:
para_grid = {
    'max_depth' : [int(x) for x in np.linspace(1, 150, 5)],
    'min_samples_split' : [int(x) for x in np.linspace(2, 100, 5)],
    'min_samples_leaf' : [int(x) for x in np.linspace(1, 50, 5)],
    }

cv_fit = GridSearchCV(dt_fit, param_grid= para_grid)
cv_fit.fit(X_train_num_p, y_train)

GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [1, 5, 10, 15, 20],
                         'min_samples_leaf': [1, 13, 25, 37, 50],
                         'min_samples_split': [2, 26, 51, 75, 100]})

### Evaluating models

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [45]:
X_test_num_t = power_trans.transform(X_test_num)
X_test_num_s = scaler.transform(X_test_num_t)
X_test_num_p = scaler.transform(X_test_num_s)

In [47]:
y_pred_log = log_fit.predict(X_test_num_p)
y_pred_dt = cv_fit.predict(X_test_num_p)

#### Accuracy

*Out of all predicitions, what % were correct?*

In [50]:
log_acc = np.round(accuracy_score(y_test, y_pred_log), 2)
dt_acc = np.round(accuracy_score(y_test, y_pred_dt), 2)

In [51]:
print(f'Logistic Regression Accuracy: {log_acc}')
print(f'D.T. Regression Accuracy: {dt_acc}')

Logistic Regression Accuracy: 0.75
D.T. Regression Accuracy: 0.79


#### Precision

*Out of all cancellation predicitions, what % were correct?*

**Precision is the ratio between True Positives (correct positive predictions) and True/False Positives (all positive predictions).**

In [54]:
log_prc = np.round(precision_score(y_test, y_pred_log), 2)
dt_prc = np.round(precision_score(y_test, y_pred_dt), 2)

print(f'Logistic Regression Precision: {log_prc}')
print(f'D.T. Regression Precision: {dt_prc}')

Logistic Regression Precision: 0.71
D.T. Regression Precision: 0.76


#### Recall

*Out of all real cancellations, what % were correctly predicted?*

**Recall is the ration between True Positives (correct positive predictions) and Real Positives (observed positive outcomes).**

In [58]:
log_rec = np.round(recall_score(y_test, y_pred_log), 2)
dt_rec = np.round(recall_score(y_test, y_pred_dt), 2)

print(f'Logistic Regression Recall: {log_rec}')
print(f'D.T. Regression Recall: {dt_rec}')

Logistic Regression Recall: 0.57
D.T. Regression Recall: 0.65


#### f1-Score

**The *harmonic mean* between precision and recall:**

$$f_1 = 2 \frac{precision * recall}{precision + recall}$$

A **f1-score = 1** means our model has perfect precision (all its cancellation predictions were cancellations) and perfect recall (all observed cancellations were predicted correctly). The lower the f1-score the farther we are from a good model.

In [56]:
log_f1 = np.round(f1_score(y_test, y_pred_log), 2)
dt_f1 = np.round(f1_score(y_test, y_pred_dt), 2)

print(f'Logistic Regression F1: {log_f1}')
print(f'D.T. Regression F1: {dt_f1}')

Logistic Regression Precision: 0.63
D.T. Regression Precision: 0.7


#### Utilizing precision, recall and F1 for model evaluation

In [82]:
def evaluate_model(y_true, y_pred):
    prc = np.round(precision_score(y_true, y_pred), 2)
    f1 = np.round(f1_score(y_true, y_pred), 2)
    rec = np.round(recall_score(y_true, y_pred), 2)

    print(f'Logistic Regression F1: {f1}')
    print(f'Logistic Regression Precision: {prc}')
    print(f'Logistic Regression Recall: {rec}')

In [60]:
evaluate_model(y_test, y_pred_log)

Logistic Regression F1: 0.63
D.T. Regression F1: 0.7
Logistic Regression Precision: 0.71
D.T. Regression Precision: 0.76
Logistic Regression Recall: 0.57
D.T. Regression Recall: 0.65


## Building Pipelines

In [72]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV

In [61]:
power_trans = PowerTransformer()
scaler = StandardScaler()
pca = PCA()
dt_fit = DecisionTreeClassifier()
blocks = [('TRANS', power_trans), ('SCALE', scaler), ('PCA', pca), ('MODEL', dt_fit)]

pipeline = Pipeline(blocks)

In [88]:
num_vars = X_train_num.shape[1]

para_grid = {
    'PCA__n_components' : range(1, num_vars), 
    'MODEL__max_depth' : [int(x) for x in np.linspace(1, 150, 50)],
    'MODEL__min_samples_split' : [int(x) for x in np.linspace(2, 100, 50)],
    'MODEL__min_samples_leaf' : [int(x) for x in np.linspace(1, 50, 50)],
}

cv_fit = RandomizedSearchCV(pipeline, param_distributions= para_grid, n_iter = 10, scoring = 'f1')
cv_fit.fit(X_train_num, y_train);

RandomizedSearchCV(estimator=Pipeline(steps=[('TRANS', PowerTransformer()),
                                             ('SCALE', StandardScaler()),
                                             ('PCA', PCA()),
                                             ('MODEL',
                                              DecisionTreeClassifier())]),
                   param_distributions={'MODEL__max_depth': [1, 4, 7, 10, 13,
                                                             16, 19, 22, 25, 28,
                                                             31, 34, 37, 40, 43,
                                                             46, 49, 52, 55, 58,
                                                             61, 64, 67, 70, 73,
                                                             77, 80, 83, 86, 89, ...],
                                        'MODEL__min_samples_leaf': [1, 2, 3, 4,
                                                                    5, 6, 7, 8,
          

In [90]:
y_pred_pipeline = cv_fit.predict(X_test_num)
evaluate_model(y_test, y_pred_log)

Logistic Regression F1: 0.63
Logistic Regression Precision: 0.71
Logistic Regression Recall: 0.57


In [89]:
cv_fit.best_estimator_

Pipeline(steps=[('TRANS', PowerTransformer()), ('SCALE', StandardScaler()),
                ('PCA', PCA(n_components=7)),
                ('MODEL',
                 DecisionTreeClassifier(max_depth=73, min_samples_leaf=22,
                                        min_samples_split=14))])

In [93]:
para_grid = {
    'PCA__n_components' : range(5, num_vars), 
    'MODEL__max_depth' : [int(x) for x in np.linspace(63, 83, 5)],
    'MODEL__min_samples_split' : [int(x) for x in np.linspace(12, 32, 5)],
    'MODEL__min_samples_leaf' : [int(x) for x in np.linspace(4, 24, 5)],
}

cv_fit = GridSearchCV(pipeline, param_grid= para_grid, scoring = 'f1')
cv_fit.fit(X_train_num, y_train)

KeyboardInterrupt: 

In [None]:
y_pred_pipeline = cv_fit.predict(X_test_num)
evaluate_model(y_test, y_pred_log)