In [1]:
%load_ext autoreload
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
import sys,os, pathlib
# Make the `categorical-encoding` checkout that sits two directory levels above
# the working directory importable from this notebook.
current = pathlib.Path.cwd()
base = current.parent.parent
catenc = base / 'categorical-encoding'
sys.path.append(os.fspath(catenc))

# Binary classification problem

For the binary classifier we will work with Example 10.2 from T. Hastie, R. Tibshirani and J. Friedman, "The Elements of Statistical Learning", 2nd ed., Springer, 2009.

In [3]:
from sklearn.datasets import make_hastie_10_2
X_h, y_h = make_hastie_10_2(random_state=2834)
# make_hastie_10_2 labels the two classes -1/+1, but the cells below score rounded
# class-1 probabilities (values 0/1) against these labels. Recode -1 -> 0 so that
# those comparisons are meaningful on a fresh kernel.
y_h[y_h == -1] = 0

In [22]:
# Turn the last two numeric columns into categorical codes
from sklearn.preprocessing import KBinsDiscretizer

# Last column: 20 equal-width bins. Multiplying the bin index by 193 and reducing
# modulo 20 scrambles the codes so they no longer follow the order of the raw values.
disczr1 = KBinsDiscretizer(n_bins=20, encode='ordinal', strategy='uniform')
last_column = X_h[:, -1].reshape(-1, 1)
cat_column1 = (disczr1.fit_transform(last_column) * 193) % 20

# Second-to-last column: 15 equal-width bins, scrambled the same way with 173.
disczr2 = KBinsDiscretizer(n_bins=15, encode='ordinal', strategy='uniform')
second_last_column = X_h[:, -2].reshape(-1, 1)
cat_column2 = (disczr2.fit_transform(second_last_column) * 173) % 20

In [23]:
# Keep every column except the last two as numeric features and
# append the two scrambled categorical codes in their place.
numeric_part = X_h[:, :-2]
numeric_names = [f'col_{i}' for i in range(8)]
predictors = pd.DataFrame(numeric_part, columns=numeric_names)
predictors['cat1'] = cat_column1
predictors['cat2'] = cat_column2
predictors.head(3)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,cat1,cat2
0,-1.372591,-2.090973,1.708557,-0.2752,-0.398902,1.02447,-0.765034,-0.189323,4.0,11.0
1,0.46918,1.482655,0.573892,1.517456,-0.036815,-0.18815,-0.654887,1.071954,17.0,18.0
2,-0.405521,0.231156,-1.036971,-0.901881,-2.526267,0.429153,-1.177047,-0.425995,17.0,11.0


In [25]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    predictors.values,
    y_h,
    test_size=0.2,
    random_state=2834,
)

In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

# Baseline: the categorical codes are fed to the forest as plain numeric columns.
model = RandomForestClassifier(n_estimators=400, max_depth=40, random_state=2834, n_jobs=-1)
model.fit(X_train, y_train)
# Scores for the class stored in the second column of predict_proba; used for the AUC.
preds = model.predict_proba(X_test)[:,1]

# Both accuracies use model.predict so that predictions are actual class labels.
# Rounding probabilities only yields valid labels when the classes are exactly 0 and 1.
print('Train accuracy: ', accuracy_score(y_train, model.predict(X_train)))
print('Test accuracy: ', accuracy_score(y_test, model.predict(X_test)))
print('AUC: ', roc_auc_score(y_test, preds).round(4))

Train accuracy:  1.0
Test accuracy:  0.8508333333333333
AUC:  0.9281


Hyperparameter tuning: optimizing for AUC
estimators: 400
max depth:
* 15 | 0.9334 
* 17 | 0.9384
* 19 | 0.9398
* 21 | 0.9415
* 25 | 0.9449
* 30 | 0.947
* 10 | 0.9476

In [27]:
#Now we will try to do target encoding
from category_encoders.leave_one_out import LeaveOneOutEncoder
from category_encoders.target_encoder import TargetEncoder

# Wrap the splits in named-column frames under NEW names instead of overwriting
# X_train / X_test. Re-running this cell then never feeds already-encoded values
# back into the encoder.
train_frame = pd.DataFrame(X_train, columns=predictors.columns)
test_frame = pd.DataFrame(X_test, columns=predictors.columns)

loo = LeaveOneOutEncoder(cols=['cat1', 'cat2'], random_state=2834)
loo.fit(train_frame, y_train)
# NOTE(review): transform() is called without y for the training rows as well.
# According to the category_encoders docs this encodes them like unseen data
# (no exclusion of the row's own target) - confirm this is intended.
X_train_loo = loo.transform(train_frame)
X_test_loo = loo.transform(test_frame)

model = RandomForestClassifier(n_estimators=400, max_depth=40, random_state=2834, n_jobs=-1)
model.fit(X_train_loo, y_train)
# Scores for the class stored in the second column of predict_proba; used for the AUC.
preds = model.predict_proba(X_test_loo)[:,1]

# Accuracy is computed from model.predict so predictions are real class labels
# regardless of how the two classes are coded.
print('Train accuracy: ', accuracy_score(y_train, model.predict(X_train_loo)))
print('Test accuracy: ', accuracy_score(y_test, model.predict(X_test_loo)))
print('AUC: ', roc_auc_score(y_test, preds).round(4))

Train accuracy:  1.0
Test accuracy:  0.8929166666666667
AUC:  0.962


Hyperparameter tuning: optimizing for AUC
estimators: 400
max depth:
* 17 | 0.9545
* 20 | 0.956
* 30 | 0.96
* 40 | 0.9601


OK, now we will try the probabilistic target encoder.


In [None]:
# Start again from the raw, un-encoded predictors (same split parameters as before).
X_train, X_test, y_train, y_test = train_test_split(predictors.values, y_h, test_size=0.2, random_state=2834)
from category_encoders.posterior_imputation_bc import PosteriorImputationEncoderBC 

pte = PosteriorImputationEncoderBC(cols=['cat1', 'cat2'], n_draws=25, random_state=2834, prior_samples_ratio=0.5)
# The encoder is configured with column names, so wrap the arrays in named DataFrames.
X_train = pd.DataFrame(X_train, columns=predictors.columns)
X_test = pd.DataFrame(X_test, columns=predictors.columns)
pte.fit(X_train, y_train)
X_train = pte.transform(X_train)
X_test = pte.transform(X_test)
# NOTE(review): presumably transform() emits several rows per input row (n_draws)
# and expand_y() repeats the labels to match - confirm against the encoder.
y_train = pte.expand_y(y_train)


model = RandomForestClassifier(n_estimators=400, max_depth=40, random_state=2834, n_jobs=-1) 
model.fit(X_train, y_train)
preds = model.predict_proba(X_test)[:,1]
# average_y() must bring the scores back to one value per original test row,
# because they are compared against y_test below.
preds = pte.average_y(preds)

# Train accuracy is measured on the transformed training rows and the expanded labels.
print('Train accuracy: ', accuracy_score(y_train, model.predict(X_train)))
# Rounding the averaged score gives 0/1, so this assumes the labels are coded 0/1.
print('Test accuracy: ', accuracy_score(y_test, preds.round()))
print('AUC: ', roc_auc_score(y_test, preds))

## Cross-validation

We really should use cross-validation, so that the hyperparameters are not overfit to a single train/test split.

### Cross-validation of the target encoding model

First we will train a model using target encoding

In [73]:
# cross_val_score is imported here because this is the first cell that uses it.
# Previously it was only imported in a later cell, which raised a NameError on a
# fresh top-to-bottom run.
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# Encoder and forest are chained in a pipeline so that the encoder is re-fitted
# on the training part of every cross-validation fold.
loo = LeaveOneOutEncoder(cols=['cat1', 'cat2'], sigma=0.05, random_state=2834)
rf = RandomForestClassifier(n_estimators=400, max_depth=30, max_features=1, min_samples_leaf=1,
                            random_state=2834, n_jobs=-1) 
clf = make_pipeline(loo, rf)

# Fresh split of the raw predictors, wrapped in DataFrames because the encoder
# is configured with column names.
X_train, X_test, y_train, y_test = train_test_split(predictors.values, y_h, test_size=0.2, random_state=2834)
X_train = pd.DataFrame(X_train, columns=predictors.columns)
X_test = pd.DataFrame(X_test, columns=predictors.columns)
scores = cross_val_score(clf, X_train, y_train, cv=5, n_jobs=-1)
# Report the mean fold accuracy with a +/- two-standard-deviation band.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.91 (+/- 0.01)


Cross-validation results:
* max_depth=40: Accuracy: 0.67 (+/- 0.03)
* max_depth=30: Accuracy: 0.68 (+/- 0.03)
* max_depth=20: Accuracy: 0.67 (+/- 0.03)
* max_depth=25: Accuracy: 0.67 (+/- 0.03)
* max_depth=35: Accuracy: 0.67 (+/- 0.03)
* max_depth=30, max_features=3: Accuracy: 0.67 (+/- 0.03)
* max_depth=30, max_features=5: Accuracy: 0.58 (+/- 0.02)
* max_depth=30, max_features=2: Accuracy: 0.77 (+/- 0.02)
* max_depth=30, max_features=1: Accuracy: 0.87 (+/- 0.02)
* max_depth=30, max_features=1, min_samples_leaf=3: Accuracy: 0.88 (+/- 0.02)
* max_depth=30, max_features=1, min_samples_leaf=4: Accuracy: 0.87 (+/- 0.03)

* max_depth=30, sigma=0.05: Accuracy: 0.89 (+/- 0.02)
* max_depth=30, sigma=0.1: Accuracy: 0.89 (+/- 0.02)
* max_depth=30, sigma=0.2: Accuracy: 0.89 (+/- 0.02)
* max_depth=30, sigma=0.3: Accuracy: 0.87 (+/- 0.03)
* max_depth=30, sigma=0.2, max_features=1: Accuracy: 0.89 (+/- 0.02)
* max_depth=30, sigma=0.2, max_features=2: Accuracy: 0.87 (+/- 0.03)
* max_depth=30, sigma=0.2, max_features=1, min_samples_leaf=3: Accuracy: 0.88 (+/- 0.02)
* max_depth=30, sigma=0.2, max_features=1, min_samples_leaf=2: Accuracy: 0.88 (+/- 0.02)
* max_depth=30, sigma=0.05, max_features=1, min_samples_leaf=1: Accuracy: 0.91 (+/- 0.01)



**Optimal parameters**: sigma=0.05, max_depth=30, max_features=1, min_samples_leaf=1

**Accuracy**: 0.91 (+/- 0.01)


In [75]:
# Refit the pipeline on the whole training split, then score it once on the held-out rows.
clf.fit(X_train, y_train)
test_predict = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predict)
print('Test accuracy: ', test_accuracy)

Test accuracy:  0.9029166666666667


### Cross-validation of the probabilistic encoder

First we create a class that makes it easier for us to run sklearn cross validation

In [39]:
from sklearn.base import BaseEstimator, ClassifierMixin
class EncoderWrapper(BaseEstimator, ClassifierMixin):
    """Bundle an encoder and a classifier behind the scikit-learn estimator API.

    The encoder must provide ``fit``, ``transform``, ``expand_y`` and
    ``average_y``; the classifier must provide ``fit`` and ``predict_proba``.
    Bundling them lets tools such as ``cross_val_score`` re-fit the encoder
    inside every fold.

    Parameters
    ----------
    encoder : object
        Encoder applied to X before the classifier sees it. Its ``expand_y`` is
        applied to the labels at fit time and its ``average_y`` to the
        classifier scores at prediction time.
    classifier : object
        Binary classifier trained on the encoded data.
    """

    def __init__(self, encoder, classifier):
        self.encoder = encoder
        self.classifier = classifier

    def fit(self, X, y, **kwargs):
        """Fit the encoder on (X, y), then the classifier on the encoded data.

        Extra keyword arguments are accepted for API compatibility but ignored.

        Returns
        -------
        self : EncoderWrapper
            Returned so calls can be chained, as the scikit-learn estimator
            contract requires.
        """
        self.encoder.fit(X, y)
        X_transformed = self.encoder.transform(X)
        y_transformed = self.encoder.expand_y(y)
        self.classifier.fit(X_transformed, y_transformed)
        return self

    def predict_proba(self, X):
        """Return the encoder-averaged score of the classifier's second class.

        Unlike scikit-learn's usual ``predict_proba``, this returns only the
        values taken from column 1 of the classifier output, after they have
        been passed through ``encoder.average_y``.
        """
        X_transformed = self.encoder.transform(X)
        preds = self.classifier.predict_proba(X_transformed)[:,1]
        return self.encoder.average_y(preds)

    def predict(self, X):
        """Predict class labels by thresholding the averaged score at 0.5.

        A score of exactly 0.5 maps to the first class, which matches what
        rounding the score produced before. When the fitted classifier exposes
        two ``classes_``, the prediction is returned in those labels; otherwise
        it falls back to 0.0 / 1.0.
        """
        is_second_class = np.asarray(self.predict_proba(X) > 0.5)
        classes = getattr(self.classifier, 'classes_', None)
        if classes is None or len(classes) != 2:
            return is_second_class.astype(float)
        # Column 1 of predict_proba corresponds to classes_[1].
        return np.asarray(classes)[is_second_class.astype(int)]



In [102]:
%%time
from sklearn.model_selection import cross_val_score

# Fresh split of the raw predictors, wrapped in DataFrames because the encoder
# is configured with column names.
X_train, X_test, y_train, y_test = train_test_split(
    predictors.values, y_h, test_size=0.2, random_state=2834
)
X_train = pd.DataFrame(X_train, columns=predictors.columns)
X_test = pd.DataFrame(X_test, columns=predictors.columns)

# Encoder and forest are bundled so that both are re-fitted inside every fold.
pte = PosteriorImputationEncoderBC(
    cols=['cat1', 'cat2'],
    n_draws=5,
    random_state=2834,
    prior_samples_ratio=0,
)
model = RandomForestClassifier(
    n_estimators=400,
    max_depth=30,
    max_features=1,
    random_state=2834,
    n_jobs=-1,
)
wrapper_model = EncoderWrapper(pte, model)

scores = cross_val_score(wrapper_model, X_train, y_train, cv=5)
# Mean fold accuracy with a +/- two-standard-deviation band.
mean_accuracy = scores.mean()
spread = scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_accuracy, spread))

Accuracy: 0.90 (+/- 0.01)
Wall time: 14.5 s


Cross-validation results:
* Below are n_draws = 25
* max_depth=40: Accuracy: 0.86 (+/- 0.03)
* max_depth=30, prior_samples_ratio=0.5: Accuracy: 0.86 (+/- 0.03)
* max_depth=30, prior_samples_ratio=0.1: Accuracy: 0.86 (+/- 0.02)
* max_depth=30, prior_samples_ratio=0.01: Accuracy: 0.87 (+/- 0.01)
* max_depth=30, prior_samples_ratio=0: Accuracy: 0.88 (+/- 0.01)
* n_draws = 5, prior_samples_ratio=0, max_depth=30: Accuracy: 0.89 (+/- 0.01)
* n_draws = 5, prior_samples_ratio=0, max_depth=30, max_features=1: Accuracy: 0.90 (+/- 0.01)

**The best parameters**: n_draws = 5, prior_samples_ratio=0, max_depth=30, max_features=1
**Accuracy**: 0.90 (+/- 0.01)




In [104]:
# Refit the wrapper on the whole training split, then score it once on the held-out rows.
wrapper_model.fit(X_train, y_train)
test_predict = wrapper_model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predict)
print('Test accuracy: ', test_accuracy)

Test accuracy:  0.8954166666666666


Accuracy is again worse than for the target encoder.