In [1]:
%load_ext autoreload
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
import os
import pathlib
import sys

# Make the local `categorical-encoding` checkout importable. It is looked up
# inside the directory two levels above the current working directory, and its
# path is appended to the end of sys.path.
current = pathlib.Path.cwd()
base = current.parent.parent
catenc = base / 'categorical-encoding'
sys.path.append(str(catenc))

# Binary classification problem

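For the binary classifier we will work with Example 10.2 of T. Hastie, R. Tibshirani and J. Friedman, "The Elements of Statistical Learning", 2nd ed., Springer, 2009. Note that the data-generation cell below actually draws the data with scikit-learn's `make_classification` rather than `make_hastie_10_2`.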

In [3]:
# Fixed random seeds, kept together in one place.
# rs_split seeds the train/test split and rs_rf the random forest;
# rs_enc is presumably meant for the encoder (verify where it is used).
rs_split, rs_enc, rs_rf = 8379, 1179, 5991

In [4]:
from sklearn.datasets import make_classification, make_hastie_10_2

# Synthetic classification data: 1011 samples with 10 features, of which 5 are
# informative and none redundant. The very small class_sep keeps the classes
# close together, i.e. the problem is deliberately hard. Seeded for repeatability.
X_h, y_h = make_classification(
    n_samples=1011,
    n_features=10,
    n_informative=5,
    n_redundant=0,
    class_sep=0.01,
    random_state=2834,
)

In [5]:
from sklearn.preprocessing import KBinsDiscretizer

# Turn the LAST TWO continuous features into categorical codes: each one is cut
# into equal-width bins, then the bin index is scrambled with a
# multiply-and-modulo step so the code is no longer monotone in the raw value.

# Last feature: 20 bins, scrambled with (index * 193) mod 20.
disczr1 = KBinsDiscretizer(n_bins=20, encode='ordinal', strategy='uniform')
cat_column1 = (disczr1.fit_transform(X_h[:, -1:]) * 193) % 20

# Second-to-last feature: 15 bins, scrambled with (index * 173) mod 20.
disczr2 = KBinsDiscretizer(n_bins=15, encode='ordinal', strategy='uniform')
cat_column2 = (disczr2.fit_transform(X_h[:, -2:-1]) * 173) % 20

In [6]:
# Assemble the feature table: the first eight raw columns are kept as numeric
# features, and the two scrambled categorical codes replace the last two.
predictors = pd.DataFrame(
    X_h[:, :-2],
    columns=['col_%d' % idx for idx in range(8)],
)
predictors['cat1'] = cat_column1
predictors['cat2'] = cat_column2
predictors.head(3)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,cat1,cat2
0,0.207933,1.525917,0.527136,-1.295324,-0.38138,-0.577176,-1.053963,-0.002429,17.0,18.0
1,0.592509,0.06699,1.748452,0.277453,3.791661,-1.36997,-0.416096,-0.149276,16.0,5.0
2,-0.653129,0.268849,-0.12144,0.020929,-1.266728,-0.462035,-2.071419,1.237412,5.0,4.0


In [7]:
# Overwrite every negative target with 0, in place, so that no label is negative.
# NOTE(review): this looks like a leftover from make_hastie_10_2 (-1/+1 labels);
# make_classification presumably already yields non-negative labels — confirm.
y_h[np.less(y_h, 0)] = 0

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The features are passed as a plain
# array and the split is seeded with rs_split so it can be repeated.
X_train, X_test, y_train, y_test = train_test_split(
    predictors.values,
    y_h,
    test_size=0.2,
    random_state=rs_split,
)

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

# Baseline: a shallow random forest trained on the features as they are, with
# the scrambled category codes treated as ordinary numeric columns.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=2,
    max_features=3,
    min_samples_leaf=1,
    random_state=rs_rf,
    n_jobs=-1,
).fit(X_train, y_train)

# Column 1 of predict_proba: predicted probability of the second class.
preds = model.predict_proba(X_test)[:, 1]

# Test accuracy thresholds the probabilities by rounding; AUC uses them as is.
print('Train accuracy: ', accuracy_score(y_train, model.predict(X_train)))
print('Test accuracy: ', accuracy_score(y_test, preds.round()))
print('AUC: ', roc_auc_score(y_test, preds))

Train accuracy:  0.6534653465346535
Test accuracy:  0.5665024630541872
AUC:  0.5957240038872692


In [10]:
# Importance the fitted baseline forest assigns to each input column, in the
# column order of `predictors` (so the last two entries belong to cat1 and cat2).
model.feature_importances_

array([0.09049425, 0.09231747, 0.16752997, 0.18313366, 0.08708329,
       0.14120372, 0.07879025, 0.08120745, 0.04244824, 0.0357917 ])

OK, now we will try to use the probabilistic target encoder.


## Cross-validation

We really should use cross-validation to avoid overfitting

### Cross-validation of the target encoding model

First we will train and tune a model that uses leave-one-out target encoding.

In [11]:
%%time
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV
from category_encoders.leave_one_out import LeaveOneOutEncoder

loo = LeaveOneOutEncoder(cols=['cat1', 'cat2'], sigma=0.05, random_state=2834)
rf = RandomForestClassifier(n_estimators=400, random_state=2834, n_jobs=-1) 
pipe = Pipeline(steps=[('loo',loo), ('rf',rf)])

param_grid = {
    'loo__sigma': [0.01, 0.05, 0.1, 0.2],
    'rf__max_depth': [20,30,40],
    'rf__max_features' : [1,2,3],
    'rf__min_samples_leaf': [1,2,3]
}

X_train, X_test, y_train, y_test = train_test_split(predictors.values, y_h, test_size=0.2, random_state=2834)
X_train = pd.DataFrame(X_train, columns=predictors.columns)
X_test = pd.DataFrame(X_test, columns=predictors.columns)

search = GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1,)
search.fit(X_train, y_train)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)


Best parameter (CV score=0.630):
{'loo__sigma': 0.1, 'rf__max_depth': 20, 'rf__max_features': 3, 'rf__min_samples_leaf': 1}
Wall time: 5min 45s


In [12]:
# Predict on the held-out rows with the best estimator found by the grid
# search above and report its accuracy.
test_predict = search.best_estimator_.predict(X_test)
print('Test accuracy: ', accuracy_score(y_test, test_predict))

Test accuracy:  0.7044334975369458


### Cross-validation of the probabilistic encoder

First we import a wrapper class (`EncoderWrapper`) that makes it easier for us to run scikit-learn cross-validation with the probabilistic encoder.

In [13]:
from category_encoders.posterior_imputation_bc import PosteriorImputationEncoderBC  
from category_encoders.pte_utils import EncoderWrapper

In [41]:
%%time
from sklearn.model_selection import cross_val_score

pte = PosteriorImputationEncoderBC(cols=['cat1', 'cat2'], n_draws=5, random_state=2834, prior_samples_ratio=0)
model = RandomForestClassifier(n_estimators=400, max_depth=30, max_features=1, 
                               random_state=2834, n_jobs=-1) 
wrapper_model = EncoderWrapper(pte, model)

param_grid = {
    'encoder__leave_one_out': [False, True],
    'encoder__n_draws': [4,5],
    'encoder__prior_samples_ratio': [0, 1E-3],
    'classifier__max_depth': [30,40],
    'classifier__max_features' : [3,4],
    'classifier__min_samples_leaf': [1,2]
}


X_train, X_test, y_train, y_test = train_test_split(predictors.values, y_h, test_size=0.2, random_state=2834)
X_train = pd.DataFrame(X_train, columns=predictors.columns)
X_test = pd.DataFrame(X_test, columns=predictors.columns)

search = GridSearchCV(wrapper_model, param_grid, cv=5, n_jobs=-1)
search.fit(X_train, y_train)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score=0.670):
{'classifier__max_depth': 30, 'classifier__max_features': 4, 'classifier__min_samples_leaf': 1, 'encoder__leave_one_out': False, 'encoder__n_draws': 4, 'encoder__prior_samples_ratio': 0.001}
Wall time: 7min 45s


In [42]:
# Predict on the held-out rows with the best wrapped encoder + classifier found
# by the grid search above and report its accuracy.
test_predict = search.best_estimator_.predict(X_test)
print('Test accuracy: ', accuracy_score(y_test, test_predict))

Test accuracy:  0.7192118226600985
