In [1]:
%load_ext autoreload
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
import sys,os, pathlib
current = pathlib.Path(os.getcwd())
base = current.parent.parent
catenc = base.joinpath('categorical-encoding')
sys.path.append(str(catenc))

# Binary classification problem

For Binary classifier we will work with the example 10.2 of T. Hastie, R. Tibshirani and J. Friedman, "Elements of Statistical Learning Ed. 2", Springer, 2009.

In [3]:
rs_split = 8379
rs_enc = 1179
rs_rf = 5991

In [4]:
from sklearn.datasets import make_hastie_10_2, make_classification
X_h, y_h = make_classification(n_samples = 200, n_features=10, n_informative=5, n_redundant=0, 
                               class_sep = 0.01, random_state=2834)

In [5]:
#Now convert the last column to the categorical
from sklearn.preprocessing import KBinsDiscretizer
disczr1 = KBinsDiscretizer(n_bins=20, encode='ordinal', )
cat_column1 = disczr1.fit_transform(X_h[:,-1].reshape(-1, 1)) * 193 % 20 #We want to break the monotonicity
disczr2 = KBinsDiscretizer(n_bins=15, encode='ordinal', )
cat_column2 = disczr2.fit_transform(X_h[:,-2].reshape(-1, 1)) * 173 % 20 #We want to break the monotonicity

In [6]:
predictors = pd.DataFrame(X_h[:, 0:-2], columns=[f'col_{i}' for i in range(8)])
predictors['cat1'] = cat_column1
predictors['cat2'] = cat_column2
#predictors['cat1_orig'] = cat_column1
#predictors['cat2_orig'] = cat_column2
predictors.head(3)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,cat1,cat2
0,0.549995,0.299362,0.253408,0.78352,-1.389664,0.13274,-0.75253,0.760205,2.0,3.0
1,2.973507,-0.514877,-1.170886,0.260715,-1.258356,0.787682,-0.34465,-0.162265,8.0,19.0
2,0.008363,-0.607098,0.526846,-0.211193,0.662184,-0.083178,-2.162158,-0.216661,19.0,18.0


In [7]:
y_h[y_h<0] = 0

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(predictors.values, y_h, test_size=0.2, random_state=rs_split)

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
model = RandomForestClassifier(n_estimators=100, max_depth=2, max_features=3, min_samples_leaf=1,
                               random_state=rs_rf, n_jobs=-1) 
model.fit(X_train, y_train)
preds = model.predict_proba(X_test)[:,1]

print('Train accuracy: ', accuracy_score(y_train, model.predict(X_train)))
print('Test accuracy: ', accuracy_score(y_test, preds.round()))
print('AUC: ', roc_auc_score(y_test, preds))

Train accuracy:  0.8375
Test accuracy:  0.625
AUC:  0.6115288220551378


In [10]:
model.feature_importances_

array([0.06143205, 0.21507391, 0.08475958, 0.0328026 , 0.10763418,
       0.10509297, 0.11649922, 0.11878288, 0.03801434, 0.11990826])

In [11]:
#Now we will try to do target encoding
from category_encoders.leave_one_out import LeaveOneOutEncoder
from category_encoders.target_encoder import TargetEncoder

loo = LeaveOneOutEncoder(cols=['cat1', 'cat2'], random_state=rs_enc)
loo.fit(pd.DataFrame(X_train, columns=predictors.columns), y_train)
X_train = loo.transform(pd.DataFrame(X_train, columns=predictors.columns))
X_test = loo.transform(pd.DataFrame(X_test, columns=predictors.columns))

model = RandomForestClassifier(n_estimators=100, max_depth=3, max_features=3, min_samples_leaf=2,
                               random_state=rs_rf, n_jobs=-1) 
model.fit(X_train, y_train)
preds = model.predict_proba(X_test)[:,1]

print('Train accuracy: ', accuracy_score(y_train, model.predict(X_train)))
print('Test accuracy: ', accuracy_score(y_test, preds.round()))
print('AUC: ', roc_auc_score(y_test, preds))

Train accuracy:  0.85625
Test accuracy:  0.625
AUC:  0.7368421052631579


In [12]:
model.feature_importances_

array([0.04798213, 0.11854547, 0.04579457, 0.04093998, 0.08018396,
       0.05470348, 0.10540463, 0.06571874, 0.19557273, 0.24515431])

OK, Now we will try to use the probabilistic target encoder


In [13]:
X_train, X_test, y_train, y_test = train_test_split(predictors.values, y_h, test_size=0.2, random_state=rs_split)
from category_encoders.multiple_imputation_bc import ProbabilisticTargetEncoderForBinaryClassification 
#The class name is really long LOL
pte = ProbabilisticTargetEncoderForBinaryClassification(cols=['cat1', 'cat2'], n_draws=25, random_state=rs_enc)
X_train = pd.DataFrame(X_train, columns=predictors.columns)
X_test = pd.DataFrame(X_test, columns=predictors.columns)
pte.fit(X_train, y_train)
X_train = pte.transform(X_train, is_train=True)
X_test = pte.transform(X_test, is_train=True)
y_train = pte.expand_y(y_train)


model = RandomForestClassifier(n_estimators=100, max_depth=3, max_features=4, min_samples_leaf=3,
                               random_state=rs_rf, n_jobs=-1) 
model.fit(X_train, y_train)
preds = model.predict_proba(X_test)[:,1]
preds = pte.average_y(preds)

print('Train accuracy: ', accuracy_score(y_train, model.predict(X_train)))
print('Test accuracy: ', accuracy_score(y_test, preds.round()))
print('AUC: ', roc_auc_score(y_test, preds))

Train accuracy:  0.84925
Test accuracy:  0.475
AUC:  0.4385964912280702


Much different story!

In [16]:
#%autoreload 2

X_train, X_test, y_train, y_test = train_test_split(predictors.values, y_h, test_size=0.2, random_state=rs_split)
from category_encoders.multiple_imputation_bc import ProbabilisticTargetEncoderForBinaryClassification 
#The class name is really long LOL

pte = ProbabilisticTargetEncoderForBinaryClassification(cols=['cat1', 'cat2'], n_draws=40, random_state=rs_enc)
X_train = pd.DataFrame(X_train, columns=predictors.columns)
X_test = pd.DataFrame(X_test, columns=predictors.columns)
pte.fit(X_train, y_train)
X_train = pte.transform(X_train, is_train=True)
X_test = pte.transform(X_test, is_train=False)
y_train = pte.expand_y(y_train)

model = RandomForestClassifier(n_estimators=100, max_depth=3, max_features=4, min_samples_leaf=3,
                               random_state=rs_rf, n_jobs=-1) 
model.fit(X_train, y_train)
preds = pte.predict(X_test, lambda XX: model.predict_proba(XX)[:,1])

print('Train accuracy: ', accuracy_score(y_train, model.predict(X_train)))
print('Test accuracy: ', accuracy_score(y_test, preds.round()))
print('AUC: ', roc_auc_score(y_test, preds))

Train accuracy:  0.8471875
Test accuracy:  0.425
AUC:  0.4736842105263158


Same results. Can choose any of the algorithms

In [15]:
model.feature_importances_

array([0.0457551 , 0.36923462, 0.04130399, 0.02401486, 0.14073016,
       0.07438043, 0.10897105, 0.13910944, 0.        , 0.05650036])