# Lab about missing value, sampling and threshold ajustement

Build a pipeline with:
- Impute or delete of missing values (transformer or sampler)
- Adjust classes (RUS, ROS, SMOTE strategy)
- One hot encode categorical features and normalize numerical features (transformer)
- Ajust threshold to maximize accuracy (predictor with logistic regression)
    - fit method select best threshold for accuracy score
    - predict method use the previously chosen threshold
- Put on slack your best score 

If you have time, you can do a feature selection step.

## Dataset

This data set contains booking information for a city hotel and a resort hotel, and includes information such as when the booking was made, length of stay, the number of adults, children, and/or babies, and the number of available parking spaces, among other things.

You'll build a model to predict hotel cancellations with a binary classifier.

In [4]:
import pandas as pd
import numpy as np

In [5]:
hotel = pd.read_csv('data/hotel_bookings.csv')

In [6]:
X = hotel.copy()
y = X.pop('is_canceled')

X['arrival_date_month'] = \
    X['arrival_date_month'].map(
        {'January':1, 'February': 2, 'March':3,
         'April':4, 'May':5, 'June':6, 'July':7,
         'August':8, 'September':9, 'October':10,
         'November':11, 'December':12}
    )

features_num = [
    "lead_time", "arrival_date_week_number",
    "arrival_date_day_of_month", "stays_in_weekend_nights",
    "stays_in_week_nights", "adults", "children", "babies",
    "is_repeated_guest", "previous_cancellations",
    "previous_bookings_not_canceled", "required_car_parking_spaces",
    "total_of_special_requests", "adr",
]
features_cat = [
    "hotel", "arrival_date_month", "meal",
    "market_segment", "distribution_channel",
    "reserved_room_type", "deposit_type", "customer_type",
]

## It's your turn

In [85]:
from sklearn.metrics import accuracy_score

from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif

from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler

from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

transformer_num = Pipeline([
    ("imp", SimpleImputer(strategy="mean")), # there are a few missing values
    ("enc", StandardScaler()),
])
transformer_cat = Pipeline([
    ("imp", SimpleImputer(strategy="constant", fill_value="NA")),
    ("enc", OneHotEncoder(handle_unknown='ignore', sparse=False)),
])

pre = ColumnTransformer(transformers=[
    ("num", transformer_num, features_num),
    ("cat", transformer_cat, ["hotel"])
])

class myLogisticRegression(BaseEstimator):
    def __init__(self, max_iter=5000):
        self.max_iter = max_iter
        self.thresholds = np.linspace(0, 1.00, 101)
        self.clf = LogisticRegression(max_iter=self.max_iter)
    
    def fit(self, X, y):
        self.clf.fit(X, y)
        y_ = self.clf.predict_proba(X)[:, 1]  
        ACC = [accuracy_score(y, y_>=t) for t in self.thresholds]
        self.threshold = self.thresholds[np.argmax(ACC)]
        return self.clf

    def predict(self, X, y=None):
        return self.clf.predict_proba(X)[:, 1]>=self.threshold
    
    def score(self, X, y):
        return accuracy_score(y, self.predict(X))
    
pipeline = Pipeline([
    ("pre", pre),
    ("nb_features", PCA()),
    ("sampling", RandomOverSampler(sampling_strategy="auto")),
    ("clf", myLogisticRegression(False))
])

pipeline

In [86]:
# stratify - make sure classes are evenlly represented across splits
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, train_size=0.75)

In [87]:
from sklearn.model_selection import GridSearchCV

params = {
   'nb_features': [None, PCA(), PCA(10), SelectKBest(f_classif, k=10), SelectKBest(mutual_info_classif, k=10)],
   'sampling':[None, RandomOverSampler(sampling_strategy=1.0), RandomUnderSampler(sampling_strategy=1.0), SMOTE(sampling_strategy=1.0)],
   'clf':[myLogisticRegression(max_iter=20000), LogisticRegression(max_iter=20000)], 
}
grid = GridSearchCV(pipeline, params, cv=5, scoring="accuracy", refit=True, verbose=2)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
print("Best: {:.3f} using {}".format(
    grid.best_score_, 
    grid.best_params_
))

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV] END clf=myLogisticRegression(max_iter=20000), nb_features=None, sampling=None; total time=   0.9s
[CV] END clf=myLogisticRegression(max_iter=20000), nb_features=None, sampling=None; total time=   0.8s
[CV] END clf=myLogisticRegression(max_iter=20000), nb_features=None, sampling=None; total time=   0.8s
[CV] END clf=myLogisticRegression(max_iter=20000), nb_features=None, sampling=None; total time=   1.1s
[CV] END clf=myLogisticRegression(max_iter=20000), nb_features=None, sampling=None; total time=   1.1s
[CV] END clf=myLogisticRegression(max_iter=20000), nb_features=None, sampling=RandomOverSampler(sampling_strategy=1.0); total time=   1.1s
[CV] END clf=myLogisticRegression(max_iter=20000), nb_features=None, sampling=RandomOverSampler(sampling_strategy=1.0); total time=   1.1s
[CV] END clf=myLogisticRegression(max_iter=20000), nb_features=None, sampling=RandomOverSampler(sampling_strategy=1.0); total time=   1.2s
[CV] E

[CV] END clf=myLogisticRegression(max_iter=20000), nb_features=SelectKBest(), sampling=None; total time=   0.8s
[CV] END clf=myLogisticRegression(max_iter=20000), nb_features=SelectKBest(), sampling=None; total time=   0.9s
[CV] END clf=myLogisticRegression(max_iter=20000), nb_features=SelectKBest(), sampling=None; total time=   0.9s
[CV] END clf=myLogisticRegression(max_iter=20000), nb_features=SelectKBest(), sampling=RandomOverSampler(sampling_strategy=1.0); total time=   1.0s
[CV] END clf=myLogisticRegression(max_iter=20000), nb_features=SelectKBest(), sampling=RandomOverSampler(sampling_strategy=1.0); total time=   1.0s
[CV] END clf=myLogisticRegression(max_iter=20000), nb_features=SelectKBest(), sampling=RandomOverSampler(sampling_strategy=1.0); total time=   1.0s
[CV] END clf=myLogisticRegression(max_iter=20000), nb_features=SelectKBest(), sampling=RandomOverSampler(sampling_strategy=1.0); total time=   1.0s
[CV] END clf=myLogisticRegression(max_iter=20000), nb_features=SelectKBe

[CV] END clf=LogisticRegression(max_iter=20000), nb_features=None, sampling=SMOTE(sampling_strategy=1.0); total time=   1.9s
[CV] END clf=LogisticRegression(max_iter=20000), nb_features=None, sampling=SMOTE(sampling_strategy=1.0); total time=   2.0s
[CV] END clf=LogisticRegression(max_iter=20000), nb_features=None, sampling=SMOTE(sampling_strategy=1.0); total time=   2.0s
[CV] END clf=LogisticRegression(max_iter=20000), nb_features=None, sampling=SMOTE(sampling_strategy=1.0); total time=   2.0s
[CV] END clf=LogisticRegression(max_iter=20000), nb_features=None, sampling=SMOTE(sampling_strategy=1.0); total time=   1.9s
[CV] END clf=LogisticRegression(max_iter=20000), nb_features=PCA(), sampling=None; total time=   0.4s
[CV] END clf=LogisticRegression(max_iter=20000), nb_features=PCA(), sampling=None; total time=   0.4s
[CV] END clf=LogisticRegression(max_iter=20000), nb_features=PCA(), sampling=None; total time=   0.3s
[CV] END clf=LogisticRegression(max_iter=20000), nb_features=PCA(), s

[CV] END clf=LogisticRegression(max_iter=20000), nb_features=SelectKBest(), sampling=SMOTE(sampling_strategy=1.0); total time=   1.8s
[CV] END clf=LogisticRegression(max_iter=20000), nb_features=SelectKBest(), sampling=SMOTE(sampling_strategy=1.0); total time=   2.5s
[CV] END clf=LogisticRegression(max_iter=20000), nb_features=SelectKBest(), sampling=SMOTE(sampling_strategy=1.0); total time=   1.9s
[CV] END clf=LogisticRegression(max_iter=20000), nb_features=SelectKBest(score_func=<function mutual_info_classif at 0x7fc7a40ba0d0>), sampling=None; total time=   5.6s
[CV] END clf=LogisticRegression(max_iter=20000), nb_features=SelectKBest(score_func=<function mutual_info_classif at 0x7fc7a40ba0d0>), sampling=None; total time=   5.6s
[CV] END clf=LogisticRegression(max_iter=20000), nb_features=SelectKBest(score_func=<function mutual_info_classif at 0x7fc7a40ba0d0>), sampling=None; total time=   5.3s
[CV] END clf=LogisticRegression(max_iter=20000), nb_features=SelectKBest(score_func=<functi

In [92]:
cv_results = grid.cv_results_
scores_df = pd.DataFrame(cv_results)
scores_df[['param_nb_features', 'param_sampling', 'param_clf', 'mean_test_score', 'std_test_score']].sort_values('mean_test_score', ascending=False).head(40)

Unnamed: 0,param_nb_features,param_sampling,param_clf,mean_test_score,std_test_score
4,PCA(),,myLogisticRegression(max_iter=20000),0.728585,0.004435
0,,,myLogisticRegression(max_iter=20000),0.728518,0.004318
24,PCA(),,LogisticRegression(max_iter=20000),0.727614,0.003923
20,,,LogisticRegression(max_iter=20000),0.727614,0.003923
16,SelectKBest(score_func=<function mutual_info_c...,,myLogisticRegression(max_iter=20000),0.726631,0.005682
36,SelectKBest(score_func=<function mutual_info_c...,,LogisticRegression(max_iter=20000),0.72624,0.003738
12,SelectKBest(),,myLogisticRegression(max_iter=20000),0.725291,0.003517
32,SelectKBest(),,LogisticRegression(max_iter=20000),0.725079,0.003886
3,,SMOTE(sampling_strategy=1.0),myLogisticRegression(max_iter=20000),0.716066,0.004715
2,,RandomUnderSampler(sampling_strategy=1.0),myLogisticRegression(max_iter=20000),0.715519,0.006872
