In [None]:
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Transformers
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, PowerTransformer, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
# Let pandas display up to 500 rows before truncating the output.
pd.options.display.max_rows = 500
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides library warnings that may point at real problems.
warnings.filterwarnings(action="ignore")


In [None]:
# Remote CSV holding the hotel bookings used throughout the notebook.
url = "https://raw.githubusercontent.com/pedroteche-ih/DAFT_MEX_202209/main/data/tb_hotel_traintest.csv"
# The two date columns are parsed into datetimes while the file is read.
tb_hotel = pd.read_csv(
    url,
    parse_dates=["arrival_date", "reservation_status_date"],
)


In [None]:
# Overview of the raw bookings table (columns, dtypes, non-null counts) before cleaning.
tb_hotel.info()


# Building & Evaluating Models in SKLEARN

In [None]:
def clean_hotel_data(hotel_data):
    """Return a cleaned copy of the hotel bookings table.

    The input frame is left untouched, so the cell that calls this function
    can be re-run safely and the raw table stays available.

    Steps applied to the copy:
      * missing ``children`` values become 0;
      * missing ``country`` values become ``"Unknown"``;
      * ``is_company`` / ``is_agent`` flag (1/0) whether ``company`` /
        ``agent`` is present;
      * ``company``, ``agent``, ``id_booking`` and ``reservation_status_date``
        are dropped;
      * any row that still contains a missing value is dropped.

    Parameters
    ----------
    hotel_data : pandas.DataFrame
        Raw bookings; must contain the columns named above.

    Returns
    -------
    pandas.DataFrame
        The cleaned table (a new object).
    """
    # Work on a copy: assigning columns directly would modify the caller's frame.
    cleaned = hotel_data.copy()

    cleaned["children"] = cleaned["children"].fillna(0)
    cleaned["country"] = cleaned["country"].fillna("Unknown")

    # Keep only the presence of a company/agent id as a binary flag.
    cleaned["is_company"] = np.where(cleaned["company"].isna(), 0, 1)
    cleaned["is_agent"] = np.where(cleaned["agent"].isna(), 0, 1)

    cleaned = cleaned.drop(
        ["company", "agent", "id_booking", "reservation_status_date"], axis=1
    ).dropna()

    return cleaned


In [None]:
tb_hotel_clean = clean_hotel_data(tb_hotel)

# Object-dtype (text) columns are treated as the categorical features.
cat_vars = tb_hotel_clean.select_dtypes("object").columns.tolist()
# Every numeric column except the target is a numeric feature.
num_vars = (
    tb_hotel_clean.select_dtypes(include=np.number)
    .drop(columns="is_cancelled")
    .columns.tolist()
)


In [None]:
# Target is the cancellation flag; every other cleaned column is a feature.
y = tb_hotel_clean["is_cancelled"]
X = tb_hotel_clean.drop(columns="is_cancelled")

# Hold out a quarter of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.25,
)

In [None]:
# Numeric-only view of the training features ...
X_train_num = X_train.select_dtypes(include="number")
# ... and exactly the same columns, in the same order, for the test features.
X_test_num = X_test.loc[:, X_train_num.columns]


## Building a simple model

In [None]:
power_trans = PowerTransformer()
scaler = StandardScaler()
pca = PCA()
dt_fit = DecisionTreeClassifier()
log_fit = LogisticRegression()

X_train_num_t = power_trans.fit_transform(X_train_num)
X_train_num_s = scaler.fit_transform(X_train_num_t)
X_train_num_p = scaler.fit_transform(X_train_num_s)


In [None]:
log_fit.fit(X_train_num_p, y_train)


In [None]:
para_grid = {
    "max_depth": [int(x) for x in np.linspace(1, 150, 5)],
    "min_samples_split": [int(x) for x in np.linspace(2, 100, 5)],
    "min_samples_leaf": [int(x) for x in np.linspace(1, 50, 5)],
}

cv_fit = GridSearchCV(dt_fit, param_grid=para_grid)
cv_fit.fit(X_train_num_p, y_train)


### Evaluating models

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score


In [None]:
X_test_num_t = power_trans.transform(X_test_num)
X_test_num_s = scaler.transform(X_test_num_t)
X_test_num_p = scaler.transform(X_test_num_s)


In [None]:
# Test-set class predictions from the logistic regression and the grid-searched tree.
y_pred_log, y_pred_dt = (
    fitted.predict(X_test_num_p) for fitted in (log_fit, cv_fit)
)


#### Accuracy

*Out of all predictions, what % were correct?*

In [None]:
# Share of correct test-set predictions per model, rounded to two decimals.
log_acc, dt_acc = (
    np.round(accuracy_score(y_test, predicted), 2)
    for predicted in (y_pred_log, y_pred_dt)
)


In [None]:
# The second model is a decision tree classifier, so it is labelled as such
# (not as a regression).
print(f"Logistic Regression Accuracy: {log_acc}")
print(f"Decision Tree Accuracy: {dt_acc}")


#### Precision

*Out of all cancellation predictions, what % were correct?*

**Precision is the ratio between True Positives (correct positive predictions) and the sum of True and False Positives (all positive predictions).**

In [None]:
# Test-set precision of each model, rounded to two decimals.
log_prc = np.round(precision_score(y_test, y_pred_log), 2)
dt_prc = np.round(precision_score(y_test, y_pred_dt), 2)

# The second model is a decision tree classifier, so it is labelled as such.
print(f"Logistic Regression Precision: {log_prc}")
print(f"Decision Tree Precision: {dt_prc}")


#### Recall

*Out of all real cancellations, what % were correctly predicted?*

**Recall is the ratio between True Positives (correct positive predictions) and Real Positives (observed positive outcomes).**

In [None]:
# Test-set recall of each model, rounded to two decimals.
log_rec = np.round(recall_score(y_test, y_pred_log), 2)
dt_rec = np.round(recall_score(y_test, y_pred_dt), 2)

# The second model is a decision tree classifier, so it is labelled as such.
print(f"Logistic Regression Recall: {log_rec}")
print(f"Decision Tree Recall: {dt_rec}")


#### f1-Score

**The *harmonic mean* of precision and recall:**

$$f_1 = 2 \frac{precision * recall}{precision + recall}$$

An **f1-score = 1** means our model has perfect precision (all its cancellation predictions were actual cancellations) and perfect recall (all observed cancellations were predicted correctly). The lower the f1-score, the farther we are from a good model.

In [None]:
# Test-set F1 score of each model, rounded to two decimals.
log_f1 = np.round(f1_score(y_test, y_pred_log), 2)
dt_f1 = np.round(f1_score(y_test, y_pred_dt), 2)

# The second model is a decision tree classifier, so it is labelled as such.
print(f"Logistic Regression F1: {log_f1}")
print(f"Decision Tree F1: {dt_f1}")


#### Utilizing precision, recall and F1 for model evaluation

In [None]:
def evaluate_model(y_true, y_pred, model_name="Model"):
    """Print the F1 score, precision and recall of a set of predictions.

    Each metric is computed with the scikit-learn function's default settings
    and rounded to two decimals before printing. Nothing is returned.

    Parameters
    ----------
    y_true : array-like
        Observed labels.
    y_pred : array-like
        Predicted labels, in the same order as ``y_true``.
    model_name : str, default "Model"
        Prefix of every printed line. The function is used for several
        different models, so the label is no longer fixed to one of them.
    """
    prc = np.round(precision_score(y_true, y_pred), 2)
    f1 = np.round(f1_score(y_true, y_pred), 2)
    rec = np.round(recall_score(y_true, y_pred), 2)

    print(f"{model_name} F1: {f1}")
    print(f"{model_name} Precision: {prc}")
    print(f"{model_name} Recall: {rec}")


In [None]:
# Test-set metrics for the logistic regression predictions.
evaluate_model(y_test, y_pred_log)


In [None]:
# Test-set metrics for the grid-searched decision tree predictions.
evaluate_model(y_test, y_pred_dt)

## Building Pipelines

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV


In [None]:
# Fresh, unfitted steps: the pipeline fits them itself, in order.
power_trans = PowerTransformer()
scaler = StandardScaler()
pca = PCA()
# Seeded so the same hyper-parameters always produce the same tree.
dt_fit = DecisionTreeClassifier(random_state=42)
blocks = [("TRANS", power_trans), ("SCALE", scaler), ("PCA", pca), ("MODEL", dt_fit)]

pipeline = Pipeline(blocks)


In [None]:
# Search space, keyed as "<step name>__<parameter>":
# 1 .. len(num_vars) - 1 principal components plus three tree-size controls.
para_grid = {
    "PCA__n_components": range(1, len(num_vars)),
    "MODEL__max_depth": [int(x) for x in np.linspace(1, 150, 50)],
    "MODEL__min_samples_split": [int(x) for x in np.linspace(2, 100, 50)],
    "MODEL__min_samples_leaf": [int(x) for x in np.linspace(1, 50, 50)],
}

# random_state fixes which 10 parameter combinations are sampled, so the
# search (and the scores reported below) can be reproduced.
cv_fit = RandomizedSearchCV(
    pipeline,
    param_distributions=para_grid,
    n_iter=10,
    scoring="f1",
    random_state=42,
)
cv_fit.fit(X_train_num, y_train);


In [None]:
# The fitted search predicts from the raw numeric test features; the pipeline
# steps inside it handle the preprocessing.
y_pred_pipeline = cv_fit.predict(X_test_num)
evaluate_model(y_test, y_pred_pipeline)


In [None]:
# For comparison: metrics of the decision tree predictions computed before the pipeline was built.
evaluate_model(y_test, y_pred_dt)

In [None]:
# Show the pipeline configuration selected by the randomized search.
cv_fit.best_estimator_


### Creating Complex Pipelines

In [None]:
from sklearn.compose import ColumnTransformer


In [None]:
# Numeric branch: power transform, then standard scaling, then PCA.
power_trans, scaler, pca = PowerTransformer(), StandardScaler(), PCA()
num_blocks = [
    ("TRANS", power_trans),
    ("SCALE", scaler),
    ("PCA", pca),
]

num_pipeline = Pipeline(steps=num_blocks)


In [None]:
# Categorical branch: one-hot encode, then keep the best-scoring encoded
# columns according to mutual information with the target.
ohe = OneHotEncoder(handle_unknown="ignore", drop="first")
kbest = SelectKBest(score_func=mutual_info_classif)
cat_blocks = [
    ("OHE", ohe),
    ("KB", kbest),
]

cat_pipeline = Pipeline(steps=cat_blocks)


In [None]:
# Route the numeric columns through the numeric branch and the categorical
# columns through the categorical branch.
# NOTE(review): columns listed in neither num_vars nor cat_vars are handled by
# ColumnTransformer's default `remainder` setting — confirm that is intended.
data_prep_pipeline = ColumnTransformer(
    transformers=[
        ("NUMPREP", num_pipeline, num_vars),
        ("CATPREP", cat_pipeline, cat_vars),
    ],
)

In [None]:
# Complete model: column-wise preprocessing followed by a logistic regression.
log_fit = LogisticRegression()
pipeline = Pipeline(
    steps=[
        ("PRE", data_prep_pipeline),
        ("MODEL", log_fit),
    ]
)

In [None]:
# List the pipeline's parameters; the nested "<step>__<parameter>" keys are the
# names used in the search grid below.
pipeline.get_params()

In [None]:
# Candidate values: 1 .. len(num_vars) - 1 principal components and
# 1 .. len(cat_vars) - 1 selected features.
# NOTE(review): KB runs after the one-hot encoder, so k counts encoded columns
# rather than original categorical variables — confirm this upper bound is intended.
param_grid = {
    'PRE__NUMPREP__PCA__n_components' : range(1, len(num_vars)),
    'PRE__CATPREP__KB__k' : range(1, len(cat_vars))
}

# random_state fixes which 10 parameter combinations are sampled, so the
# search can be reproduced from a fresh kernel.
grid_fit = RandomizedSearchCV(
    pipeline, param_grid, n_iter = 10, n_jobs = 7, random_state = 42
)
grid_fit.fit(X_train, y_train);

In [None]:
# Predict from the full test feature table; preprocessing happens inside the fitted pipeline.
y_pred_pipeline = grid_fit.predict(X_test)
evaluate_model(y_test, y_pred_pipeline)


# Multi-classification Tasks

In [None]:
from sklearn import datasets

In [None]:
# Load the wine dataset once and reuse it for both the features and the target.
wine = datasets.load_wine(as_frame=True)

# Copy the feature table so adding a column does not touch the loaded object.
tb_wine = wine['data'].copy()
tb_wine['classif_wine'] = wine['target']

In [None]:
X = tb_wine.drop('classif_wine', axis = 1)
y = tb_wine['classif_wine']
# Fixed seed so the same rows land in train and test on every run; otherwise
# every score below changes each time the notebook is executed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [None]:
# Fresh, unfitted steps for the wine data: the pipeline fits them itself, in order.
power_trans = PowerTransformer()
scaler = StandardScaler()
pca = PCA()
# Seeded so the same hyper-parameters always produce the same tree.
dt_fit = DecisionTreeClassifier(random_state=42)
blocks = [("TRANS", power_trans), ("SCALE", scaler), ("PCA", pca), ("MODEL", dt_fit)]

pipeline = Pipeline(blocks)


In [None]:
# Search space, keyed as "<step name>__<parameter>":
# 1 .. (number of features - 1) principal components plus three tree-size controls.
para_grid = {
    "PCA__n_components": range(1, X.shape[1]),
    "MODEL__max_depth": [int(x) for x in np.linspace(1, 150, 50)],
    "MODEL__min_samples_split": [int(x) for x in np.linspace(2, 100, 50)],
    "MODEL__min_samples_leaf": [int(x) for x in np.linspace(1, 50, 50)],
}

# The target has more than two classes, so the plain "f1" scorer (binary F1)
# cannot score it; "f1_macro" averages the per-class F1 scores instead.
# random_state fixes which parameter combinations are sampled.
cv_fit = RandomizedSearchCV(
    pipeline,
    param_distributions=para_grid,
    n_iter=1000,
    scoring="f1_macro",
    random_state=42,
)
cv_fit.fit(X_train, y_train);


In [None]:
# Show the pipeline configuration selected by the randomized search on the wine data.
cv_fit.best_estimator_

## Measuring error

In [None]:
# Predict the wine class for the test rows, then report one F1 score per class
# (average=None disables averaging across classes).
y_pred = cv_fit.predict(X_test)
f1_score(y_test, y_pred, average = None)

In [None]:
# Number of training rows per wine class, to check how balanced the classes are.
y_train.value_counts()

## Building a model w/ Class Balancing

In [None]:
power_trans = PowerTransformer()
scaler = StandardScaler()
pca = PCA()
# Same tree as before, but with class weighting enabled; seeded so the same
# hyper-parameters always produce the same tree.
dt_fit = DecisionTreeClassifier(class_weight="balanced", random_state=42)
blocks = [("TRANS", power_trans), ("SCALE", scaler), ("PCA", pca), ("MODEL", dt_fit)]

pipeline = Pipeline(blocks)

# Search space, keyed as "<step name>__<parameter>".
para_grid = {
    "PCA__n_components": range(1, X.shape[1]),
    "MODEL__max_depth": [int(x) for x in np.linspace(1, 150, 50)],
    "MODEL__min_samples_split": [int(x) for x in np.linspace(2, 100, 50)],
    "MODEL__min_samples_leaf": [int(x) for x in np.linspace(1, 50, 50)],
}

# The target has more than two classes, so the plain "f1" scorer (binary F1)
# cannot score it; "f1_macro" averages the per-class F1 scores instead.
# random_state fixes which parameter combinations are sampled.
cv_fit = RandomizedSearchCV(
    pipeline,
    param_distributions=para_grid,
    n_iter=1000,
    scoring="f1_macro",
    random_state=42,
)
cv_fit.fit(X_train, y_train);


In [None]:
# Per-class F1 scores of the class-weighted model on the test rows.
y_pred = cv_fit.predict(X_test)
f1_score(y_test, y_pred, average = None)

In [None]:
# Per-class precision for the same predictions.
precision_score(y_test, y_pred, average = None)

### Calculating the aggregate error

#### Micro averaging

Totals the True Positives, False Positives and False Negatives across all classes and calculates a single F1 score from these totals.

In [None]:
# Micro-averaged F1 of the test-set predictions.
f1_score(y_test, y_pred, average = 'micro')

#### Macro averaging

**Unweighted average** of the per-class F1-scores - **gives equal importance to every class**.

In [None]:
# Macro-averaged F1 of the test-set predictions.
f1_score(y_test, y_pred, average = 'macro')

#### Weighted averaging

**Weighted average** of the per-class F1-scores - **each class is weighted by its number of observations, so larger classes count more**.

In [None]:
# Weighted-average F1 of the test-set predictions.
f1_score(y_test, y_pred, average = 'weighted')