# Simulation studies using Hastie's data

The goal of this simulation study is to see how the performance of the Bayesian-encoded model varies for different values of the hyperparameters. 


In [1]:
%load_ext autoreload
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
import os
import pathlib
import sys

# Put the `categorical-encoding` directory, located two levels above the
# current working directory, on the module search path.
# NOTE(review): presumably this is where the `category_encoders` package
# imported further down lives -- confirm against the repository layout.
current = pathlib.Path.cwd()
base = current.parent.parent
catenc = base / 'categorical-encoding'
sys.path.append(os.fspath(catenc))

# Binary classification problem

For the binary classification problem we use the data from Example 10.2 of T. Hastie, R. Tibshirani and J. Friedman, "The Elements of Statistical Learning", 2nd ed., Springer, 2009.

In [3]:
from sklearn.datasets import make_hastie_10_2

# Generate the Hastie 10.2 data with a fixed seed so the run is repeatable.
X_h, y_h = make_hastie_10_2(random_state=2834)
# Store the features in half precision.
X_h = X_h.astype(np.float16)
# Recode the label -1 as 0; every other label value is left untouched.
y_h = np.where(y_h == -1, 0, y_h)

In [4]:
# Derive two categorical columns from the last two numeric columns of X_h.
from sklearn.preprocessing import KBinsDiscretizer

# Last column: 20 uniform-width ordinal bins. Multiplying the bin index by
# 193 and reducing modulo 20 scrambles the bin order, so the resulting code
# is no longer monotone in the original value.
disczr1 = KBinsDiscretizer(n_bins=20, encode='ordinal', strategy='uniform')
cat_column1 = (disczr1.fit_transform(X_h[:, [-1]]) * 193) % 20

# Second-to-last column: 15 uniform-width ordinal bins, scrambled the same
# way with multiplier 173 (still reduced modulo 20).
disczr2 = KBinsDiscretizer(n_bins=15, encode='ordinal', strategy='uniform')
cat_column2 = (disczr2.fit_transform(X_h[:, [-2]]) * 173) % 20

In [5]:
# Assemble the modelling frame: every column of X_h except the last two
# (named col_0 .. col_7), followed by the two derived categorical columns.
feature_names = [f'col_{i}' for i in range(8)]
predictors = pd.DataFrame(X_h[:, :-2], columns=feature_names)
predictors['cat1'] = cat_column1
predictors['cat2'] = cat_column2
predictors.head(3)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,cat1,cat2
0,-1.373047,-2.091797,1.708984,-0.275146,-0.398926,1.024414,-0.765137,-0.189331,4.0,11.0
1,0.469238,1.482422,0.57373,1.517578,-0.036804,-0.18811,-0.654785,1.072266,17.0,18.0
2,-0.405518,0.231201,-1.037109,-0.901855,-2.525391,0.429199,-1.176758,-0.426025,17.0,11.0


In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    predictors.to_numpy(),
    y_h,
    test_size=0.2,
    random_state=2834,
)

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

# Baseline: a random forest fitted directly on the numerically coded
# predictors, without any categorical encoder.
model = RandomForestClassifier(
    n_estimators=400,
    max_depth=40,
    random_state=2834,
    n_jobs=-1,
)
model.fit(X_train, y_train)

# Column 1 of predict_proba, i.e. the probability assigned to the second class.
preds = model.predict_proba(X_test)[:, 1]

# Test accuracy is obtained by rounding the probabilities to hard 0/1 labels.
train_accuracy = accuracy_score(y_train, model.predict(X_train))
test_accuracy = accuracy_score(y_test, preds.round())
test_auc = roc_auc_score(y_test, preds).round(4)

print('Train accuracy: ', train_accuracy)
print('Test accuracy: ', test_accuracy)
print('AUC: ', test_auc)

Train accuracy:  1.0
Test accuracy:  0.84875
AUC:  0.9255


Hyperparameter tuning: optimizing for AUC
estimators: 400
max depth:
* 15 | 0.9334 
* 17 | 0.9384
* 19 | 0.9398
* 21 | 0.9415
* 25 | 0.9449
* 30 | 0.947
* 10 | 0.9476

## Cross-validation

We should use cross-validation rather than a single train/test split, to avoid overfitting the hyperparameters.

### Cross-validation of the target encoding model

First we train a model using leave-one-out target encoding.

In [8]:
%%time
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV
from category_encoders.leave_one_out import LeaveOneOutEncoder

# Two-step pipeline: leave-one-out encoding of the categorical columns,
# then a random forest. The hyperparameters listed in `param_grid` are
# overridden by the grid search below.
loo = LeaveOneOutEncoder(cols=['cat1', 'cat2'], sigma=0.05, random_state=2834)
rf = RandomForestClassifier(
    n_estimators=400,
    max_depth=30,
    max_features=1,
    min_samples_leaf=1,
    random_state=2834,
    n_jobs=-1,
)
pipe = Pipeline([('loo', loo), ('rf', rf)])

# Keys follow the `<step name>__<parameter>` convention of sklearn pipelines.
param_grid = {
    'loo__sigma': [0.01, 0.05],
    'rf__max_depth': [15, 20],
    'rf__max_features': [1, 2],
    'rf__min_samples_leaf': [2, 3],
}

# Same 80/20 split as before, re-wrapped as DataFrames so the encoder can
# address the categorical columns by name.
X_train, X_test, y_train, y_test = train_test_split(
    predictors.values, y_h, test_size=0.2, random_state=2834
)
X_train, X_test = (
    pd.DataFrame(part, columns=predictors.columns) for part in (X_train, X_test)
)

# 5-fold grid search over the pipeline, then evaluate the best estimator
# on the held-out test set.
search = GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1)
search.fit(X_train, y_train)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

test_predict = search.best_estimator_.predict(X_test)
print('Test accuracy: ', accuracy_score(y_test, test_predict))

Best parameter (CV score=0.898):
{'loo__sigma': 0.01, 'rf__max_depth': 20, 'rf__max_features': 1, 'rf__min_samples_leaf': 2}
Test accuracy:  0.89875
Wall time: 52.6 s


### Cross-validation of the probabilistic encoder

First we import a wrapper class (`EncoderWrapper`) that makes it easier for us to run sklearn cross-validation with the encoder.

In [9]:
from category_encoders.pte_utils import EncoderWrapper
from category_encoders.posterior_imputation_bc import PosteriorImputationEncoderBC

In [10]:
%%time
from sklearn.model_selection import cross_val_score

# Wrap the posterior imputation encoder and a random forest in a single
# estimator so that GridSearchCV can tune both together.
pte = PosteriorImputationEncoderBC(cols=['cat1', 'cat2'], random_state=2834)
model = RandomForestClassifier(n_estimators=400, random_state=2834, n_jobs=-1)
wrapper_model = EncoderWrapper(pte, model)

# `encoder__*` keys address the encoder, `classifier__*` keys the forest.
# NOTE(review): the meaning of n_draws / prior_samples_ratio is defined by
# the encoder implementation, which is not shown in this notebook.
param_grid = {
    'encoder__n_draws': [3, 4],
    'encoder__prior_samples_ratio': [1E-4, 1E-3],
    'classifier__max_depth': [30, 40],
    'classifier__max_features': [1, 2],
    'classifier__min_samples_leaf': [3, 4],
}

# Same 80/20 split as before, re-wrapped as DataFrames so the encoder can
# address the categorical columns by name.
X_train, X_test, y_train, y_test = train_test_split(
    predictors.values, y_h, test_size=0.2, random_state=2834
)
X_train, X_test = (
    pd.DataFrame(part, columns=predictors.columns) for part in (X_train, X_test)
)

# 5-fold grid search over the wrapped model, then evaluate the best
# estimator on the held-out test set.
search = GridSearchCV(wrapper_model, param_grid, cv=5, n_jobs=-1)
search.fit(X_train, y_train)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

test_predict = search.best_estimator_.predict(X_test)
print('Test accuracy: ', accuracy_score(y_test, test_predict))


Best parameter (CV score=0.880):
{'classifier__max_depth': 40, 'classifier__max_features': 1, 'classifier__min_samples_leaf': 4, 'encoder__n_draws': 4, 'encoder__prior_samples_ratio': 0.0001}
Test accuracy:  0.895
Wall time: 5min 53s


In [11]:
# Show the raw cross-validation record of the grid search fitted in the
# previous cell: a dict of per-candidate arrays (e.g. mean_fit_time,
# std_fit_time, mean_score_time).
search.cv_results_

{'mean_fit_time': array([11.76919317, 15.67619405, 15.55459809, 19.69440112, 16.06360021,
        11.35519876, 18.79779987, 14.17840004, 23.76259956, 21.74739819,
        28.75391932, 32.50232091, 19.90979691, 24.02899761, 27.01699944,
        28.68699951, 19.42459788, 11.59499745, 17.6623991 , 15.9279995 ,
        13.61619925, 14.35919867, 15.8619998 , 19.30059986, 19.9371984 ,
        23.92839899, 28.17279968, 30.18139944, 25.54079967, 21.95979862,
        29.01060014, 22.01720052]),
 'std_fit_time': array([3.74980673, 1.03061134, 2.07141987, 0.42447696, 4.27303944,
        3.90891133, 0.37695909, 3.32639901, 0.48382333, 3.24767522,
        3.88100039, 0.39722286, 2.64603977, 0.73065212, 0.93557649,
        0.28336398, 4.75345755, 0.89602601, 0.2455634 , 2.37307076,
        1.12008744, 1.72549498, 1.75905643, 0.28357636, 1.91257604,
        0.54081095, 0.58686841, 1.1041225 , 3.69907088, 1.77019443,
        0.84929072, 5.01405428]),
 'mean_score_time': array([5.77540083, 2.40260139, 