In [1]:
%load_ext autoreload
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
import sys,os, pathlib
current = pathlib.Path(os.getcwd())
base = current.parent.parent
catenc = base.joinpath('categorical-encoding')
sys.path.append(str(catenc))

# Binary classification problem

For Binary classifier we will work with the example 10.2 of T. Hastie, R. Tibshirani and J. Friedman, "Elements of Statistical Learning Ed. 2", Springer, 2009.

In [3]:
from sklearn.datasets import make_hastie_10_2, make_classification
#X_h, y_h = make_hastie_10_2(random_state=2834)
X_h, y_h = make_classification(n_samples = 200, n_features=10, n_informative=5, n_redundant=0, 
                               class_sep = 0.01, random_state=2834)
#np.random.seed(7931)
#epsilon = 1.5
#random_addition = epsilon * np.random.normal(size=X_h.shape[0]*X_h.shape[1]).reshape(X_h.shape)
#X_h = X_h + random_addition

In [4]:
#Now convert the last column to the categorical
from sklearn.preprocessing import KBinsDiscretizer
disczr1 = KBinsDiscretizer(n_bins=20, encode='ordinal', )
cat_column1 = disczr1.fit_transform(X_h[:,-1].reshape(-1, 1)) * 193 % 20 #We want to break the monotonicity
disczr2 = KBinsDiscretizer(n_bins=15, encode='ordinal', )
cat_column2 = disczr2.fit_transform(X_h[:,-2].reshape(-1, 1)) * 173 % 20 #We want to break the monotonicity

In [5]:
predictors = pd.DataFrame(X_h[:, 0:-2], columns=[f'col_{i}' for i in range(8)])
predictors['cat1'] = cat_column1
predictors['cat2'] = cat_column2
predictors['cat1_orig'] = cat_column1
predictors['cat2_orig'] = cat_column2
predictors.head(3)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,cat1,cat2,cat1_orig,cat2_orig
0,0.549995,0.299362,0.253408,0.78352,-1.389664,0.13274,-0.75253,0.760205,2.0,3.0,2.0,3.0
1,2.973507,-0.514877,-1.170886,0.260715,-1.258356,0.787682,-0.34465,-0.162265,8.0,19.0,8.0,19.0
2,0.008363,-0.607098,0.526846,-0.211193,0.662184,-0.083178,-2.162158,-0.216661,19.0,18.0,19.0,18.0


In [6]:
y_h[y_h<0] = 0
y_h

array([0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 0])

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(predictors.values, y_h, test_size=0.2, random_state=2834)

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
model = RandomForestClassifier(n_estimators=400, max_depth=4, random_state=2834, n_jobs=-1) 
model.fit(X_train, y_train)
preds = model.predict_proba(X_test)[:,1]

print('Train accuracy: ', accuracy_score(y_train, model.predict(X_train)))
print('Test accuracy: ', accuracy_score(y_test, preds.round()))
print('AUC: ', roc_auc_score(y_test, preds))

Train accuracy:  0.95
Test accuracy:  0.55
AUC:  0.5588972431077694


In [10]:
model.feature_importances_

array([0.06936474, 0.10531054, 0.09402388, 0.07432639, 0.07050119,
       0.09071075, 0.12354668, 0.09424788, 0.0596089 , 0.07576224,
       0.07052982, 0.07206699])

In [11]:
#Now we will try to do target encoding
from category_encoders.leave_one_out import LeaveOneOutEncoder
from category_encoders.target_encoder import TargetEncoder

loo = LeaveOneOutEncoder(cols=['cat1', 'cat2'], random_state=2834)
loo.fit(pd.DataFrame(X_train, columns=predictors.columns), y_train)
X_train = loo.transform(pd.DataFrame(X_train, columns=predictors.columns))
X_test = loo.transform(pd.DataFrame(X_test, columns=predictors.columns))

model = RandomForestClassifier(n_estimators=400, max_depth=4, random_state=2834, n_jobs=-1) 
model.fit(X_train, y_train)
preds = model.predict_proba(X_test)[:,1]

print('Train accuracy: ', accuracy_score(y_train, model.predict(X_train)))
print('Test accuracy: ', accuracy_score(y_test, preds.round()))
print('AUC: ', roc_auc_score(y_test, preds))

Train accuracy:  0.925
Test accuracy:  0.5
AUC:  0.5213032581453634


In [12]:
model.feature_importances_

array([0.04898221, 0.08221393, 0.06163247, 0.0509101 , 0.05466403,
       0.06787002, 0.09716291, 0.06830749, 0.22436523, 0.16057156,
       0.04057006, 0.04275   ])

OK, Now we will try to use the probabilistic target encoder


In [13]:
X_train, X_test, y_train, y_test = train_test_split(predictors.values, y_h, test_size=0.2, random_state=2834)
from category_encoders.multiple_imputation_bc import ProbabilisticTargetEncoderForBinaryClassification 
#The class name is really long LOL
pte = ProbabilisticTargetEncoderForBinaryClassification(cols=['cat1', 'cat2'], n_draws=25, random_state=2834)
X_train = pd.DataFrame(X_train, columns=predictors.columns)
X_test = pd.DataFrame(X_test, columns=predictors.columns)
pte.fit(X_train, y_train)
X_train = pte.transform(X_train, is_train=True)
X_test = pte.transform(X_test, is_train=True)
y_train = pte.expand_y(y_train)


model = RandomForestClassifier(n_estimators=400, max_depth=10, random_state=2834, n_jobs=-1) 
model.fit(X_train, y_train)
preds = model.predict_proba(X_test)[:,1]
preds = pte.average_y(preds)

print('Train accuracy: ', accuracy_score(y_train, model.predict(X_train)))
print('Test accuracy: ', accuracy_score(y_test, preds.round()))
print('AUC: ', roc_auc_score(y_test, preds))

Train accuracy:  1.0
Test accuracy:  0.6
AUC:  0.5839598997493735


Much different story!

In [69]:
#%autoreload 2

X_train, X_test, y_train, y_test = train_test_split(predictors.values, y_h, test_size=0.2, random_state=2834)
from category_encoders.multiple_imputation_bc import ProbabilisticTargetEncoderForBinaryClassification 
#The class name is really long LOL

pte = ProbabilisticTargetEncoderForBinaryClassification(cols=['cat1', 'cat2'], n_draws=25, random_state=2834)
X_train = pd.DataFrame(X_train, columns=predictors.columns)
X_test = pd.DataFrame(X_test, columns=predictors.columns)
pte.fit(X_train, y_train)
X_train = pte.transform(X_train, is_train=True)
X_test = pte.transform(X_test, is_train=False)
y_train = pte.expand_y(y_train)

model = RandomForestClassifier(n_estimators=400, max_depth=10, random_state=2834, n_jobs=-1) 
model.fit(X_train, y_train)
preds = pte.predict(X_test, lambda XX: model.predict_proba(XX)[:,1])

print('Train accuracy: ', accuracy_score(y_train, model.predict(X_train)))
print('Test accuracy: ', accuracy_score(y_test, preds.round()))
print('AUC: ', roc_auc_score(y_test, preds))

Train accuracy:  1.0
Test accuracy:  0.6
AUC:  0.5839598997493735


Same results. Can choose any of the algorithms

In [14]:
model.feature_importances_

array([0.07679319, 0.12915753, 0.0995705 , 0.08480441, 0.07124032,
       0.10826992, 0.13326113, 0.10796657, 0.0292758 , 0.00363572,
       0.07036925, 0.08565568])