In [1]:
%load_ext autoreload
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
import os
import pathlib
import sys

# Make the 'categorical-encoding' directory, located two levels above the
# notebook's working directory, importable.
# NOTE(review): presumably this is what lets the `category_encoders` imports in
# later cells resolve to the local copy -- confirm. The entry is appended (not
# prepended), so an installed package of the same name would still win.
current = pathlib.Path(os.getcwd())
base = current.parent.parent
catenc = base.joinpath('categorical-encoding')

# Guard the append so that re-running this cell does not keep stacking
# duplicate entries onto sys.path.
if str(catenc) not in sys.path:
    sys.path.append(str(catenc))

# Binary classification problem

For the binary classifier we will work with Example 10.2 of T. Hastie, R. Tibshirani and J. Friedman, "The Elements of Statistical Learning", 2nd ed., Springer, 2009.

In [3]:
from sklearn.datasets import make_hastie_10_2

# Generate the synthetic Hastie et al. (Example 10.2) data set with a fixed
# seed so that every run of the notebook sees the same samples.
DATA_SEED = 2834
X_h, y_h = make_hastie_10_2(random_state=DATA_SEED)

In [4]:
from sklearn.preprocessing import KBinsDiscretizer


def _scrambled_ordinal_bins(values, n_bins, multiplier, modulus=20):
    """Discretise a 1-D feature into ordinal bins and scramble the bin codes.

    The column is binned with a freshly fitted ``KBinsDiscretizer`` (ordinal
    encoding). Each bin code is then mapped through
    ``code * multiplier % modulus`` to break the monotonic relationship
    between the category code and the original feature value.

    Returns a ``(fitted_discretizer, scrambled_codes)`` pair; the codes keep
    the single-column 2-D shape produced by ``fit_transform``.
    """
    discretizer = KBinsDiscretizer(n_bins=n_bins, encode='ordinal')
    ordinal_codes = discretizer.fit_transform(values.reshape(-1, 1))
    return discretizer, ordinal_codes * multiplier % modulus


# Turn the last two numeric columns into categorical-looking columns.
disczr1, cat_column1 = _scrambled_ordinal_bins(X_h[:, -1], n_bins=20, multiplier=193)
disczr2, cat_column2 = _scrambled_ordinal_bins(X_h[:, -2], n_bins=15, multiplier=173)

In [5]:
# Assemble the model inputs: every column of X_h except the last two stays
# numeric (named col_0 .. col_7), and the two scrambled bin columns are
# attached as 'cat1' and 'cat2'.
numeric_names = [f'col_{i}' for i in range(8)]
predictors = pd.DataFrame(X_h[:, :-2], columns=numeric_names)
for cat_name, cat_values in (('cat1', cat_column1), ('cat2', cat_column2)):
    predictors[cat_name] = cat_values
predictors.head(3)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,cat1,cat2
0,-1.372591,-2.090973,1.708557,-0.2752,-0.398902,1.02447,-0.765034,-0.189323,5.0,10.0
1,0.46918,1.482655,0.573892,1.517456,-0.036815,-0.18815,-0.654887,1.071954,11.0,18.0
2,-0.405521,0.231156,-1.036971,-0.901881,-2.526267,0.429153,-1.177047,-0.425995,11.0,4.0


In [6]:
# Overwrite every negative label with 0, in place, so that the target is coded
# 0/1 (see the displayed array), then show the result.
np.putmask(y_h, y_h < 0, 0)
y_h

array([1., 0., 1., ..., 1., 0., 1.])

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
feature_matrix = predictors.values
X_train, X_test, y_train, y_test = train_test_split(
    feature_matrix,
    y_h,
    test_size=0.2,
    random_state=2834,
)

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Baseline: a random forest fitted on the predictors as they are, with the
# scrambled bin codes of 'cat1' / 'cat2' used directly as numeric inputs.
model = RandomForestClassifier(
    n_estimators=400,
    max_depth=20,
    random_state=2834,
    n_jobs=-1,
)
model.fit(X_train, y_train)
preds = model.predict(X_test)

train_accuracy = accuracy_score(y_train, model.predict(X_train))
test_accuracy = accuracy_score(y_test, preds)
print('Train accuracy: ', train_accuracy)
print('Test accuracy: ', test_accuracy)

Train accuracy:  0.9766666666666667
Test accuracy:  0.8654166666666666


In [9]:
# Target-encode the two categorical columns, then refit the forest on the
# encoded features.
from category_encoders.leave_one_out import LeaveOneOutEncoder
from category_encoders.target_encoder import TargetEncoder  # not used in this cell

# The encoder selects columns by name, so wrap the split arrays in DataFrames.
train_frame = pd.DataFrame(X_train, columns=predictors.columns)
test_frame = pd.DataFrame(X_test, columns=predictors.columns)

loo = LeaveOneOutEncoder(cols=['cat1', 'cat2'], random_state=2834)
loo.fit(train_frame, y_train)
# NOTE(review): the training rows are transformed without passing y, exactly
# like the test rows. Per the category_encoders docs this presumably yields
# the plain per-category target statistic rather than leave-one-out values
# for the training set -- confirm this is the intended comparison.
X_train = loo.transform(train_frame)
X_test = loo.transform(test_frame)

model = RandomForestClassifier(
    n_estimators=400,
    max_depth=17,
    random_state=2834,
    n_jobs=-1,
)
model.fit(X_train, y_train)
preds = model.predict(X_test)

train_accuracy = accuracy_score(y_train, model.predict(X_train))
test_accuracy = accuracy_score(y_test, preds)
print('Train accuracy: ', train_accuracy)
print('Test accuracy: ', test_accuracy)

Train accuracy:  0.9864583333333333
Test accuracy:  0.88625


OK, now we will try the probabilistic target encoder.


In [24]:
from category_encoders.multiple_imputation_bc import ProbabilisticTargetEncoderForBinaryClassification

# Start again from the raw predictors; the previous cell overwrote the split
# variables with encoded frames.
X_train, X_test, y_train, y_test = train_test_split(
    predictors.values,
    y_h,
    test_size=0.2,
    random_state=2834,
)
feature_names = predictors.columns
X_train = pd.DataFrame(X_train, columns=feature_names)
X_test = pd.DataFrame(X_test, columns=feature_names)

pte = ProbabilisticTargetEncoderForBinaryClassification(
    cols=['cat1', 'cat2'],
    n_draws=25,
    random_state=2834,
)
pte.fit(X_train, y_train)
X_train = pte.transform(X_train, is_train=True)
# NOTE(review): the test frame is also transformed with is_train=True here,
# unlike the later cell that passes is_train=False. Presumably this is the
# deliberate "sample at prediction time" variant -- confirm.
X_test = pte.transform(X_test, is_train=True)
# y_train is replaced by the encoder's expanded version so that it can be
# fitted against the transformed X_train.
y_train = pte.expand_y(y_train)

model = RandomForestClassifier(
    n_estimators=400,
    max_depth=30,
    random_state=2834,
    n_jobs=-1,
)
model.fit(X_train, y_train)

# Take the second predict_proba column for every transformed test row, then
# let the encoder average those values before they are compared with y_test.
preds = model.predict_proba(X_test)[:, 1]
preds = pte.average_y(preds)

# Train accuracy is measured against the expanded y_train.
train_accuracy = accuracy_score(y_train, model.predict(X_train))
test_accuracy = accuracy_score(y_test, preds.round())
print('Train accuracy: ', train_accuracy)
print('Test accuracy: ', test_accuracy)

Train accuracy:  1.0
Test accuracy:  0.8291666666666667


The accuracy is poor. However, we do not have to sample during prediction: we can instead predict for both 0 and 1 and then average the two predictions according to their probabilities.

In [23]:
%autoreload 2

from category_encoders.multiple_imputation_bc import ProbabilisticTargetEncoderForBinaryClassification

# Fresh split of the raw predictors, same seed as in the earlier cells.
X_train, X_test, y_train, y_test = train_test_split(
    predictors.values,
    y_h,
    test_size=0.2,
    random_state=2834,
)
feature_names = predictors.columns
X_train = pd.DataFrame(X_train, columns=feature_names)
X_test = pd.DataFrame(X_test, columns=feature_names)

pte = ProbabilisticTargetEncoderForBinaryClassification(
    cols=['cat1', 'cat2'],
    n_draws=50,
    random_state=2834,
)
pte.fit(X_train, y_train)
# Training rows are transformed with is_train=True, test rows with
# is_train=False; the final prediction is delegated to pte.predict below.
X_train = pte.transform(X_train, is_train=True)
X_test = pte.transform(X_test, is_train=False)
y_train = pte.expand_y(y_train)

model = RandomForestClassifier(
    n_estimators=400,
    max_depth=30,
    random_state=2834,
    n_jobs=-1,
)
model.fit(X_train, y_train)


def _positive_class_proba(XX):
    """Return the second predict_proba column of the fitted forest for XX."""
    return model.predict_proba(XX)[:, 1]


# The encoder drives the prediction and calls back into the forest through
# the scoring function defined above.
preds = pte.predict(X_test, _positive_class_proba)

# Train accuracy is measured against the expanded y_train.
train_accuracy = accuracy_score(y_train, model.predict(X_train))
test_accuracy = accuracy_score(y_test, preds.round())
print('Train accuracy: ', train_accuracy)
print('Test accuracy: ', test_accuracy)

Train accuracy:  0.99455
Test accuracy:  0.8304166666666667
