In [1]:
%load_ext autoreload
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
import sys,os, pathlib
# Put the `categorical-encoding` directory that sits two levels above the working
# directory on the import path (presumably where the local `category_encoders`
# package used further down lives -- verify against the repository layout).
current = pathlib.Path.cwd()
base = current.parent.parent
catenc = base.joinpath('categorical-encoding')
# Guard the append so that re-running this cell does not pile up duplicate entries.
if str(catenc) not in sys.path:
    sys.path.append(str(catenc))

# Binary classification problem



In [19]:
# Random seeds and data size used by the experiment.
# rs_split seeds the first train/test split and rs_rf the baseline random forest;
# rs_enc is presumably meant for the encoders but is not referenced in the cells
# shown here -- TODO confirm.
rs_split, rs_enc, rs_rf = 8379, 1179, 5991
# Number of synthetic rows to generate.
n_samples = 10_000

In [20]:
from sklearn.datasets import make_classification
# Synthetic classification data: 10 features of which 5 are informative and none
# redundant, generated with a very small class separation (0.01).
X_h, y_h = make_classification(
    n_samples=n_samples,
    n_features=10,
    n_informative=5,
    n_redundant=0,
    class_sep=0.01,
    random_state=2834,
)

In [21]:
# Turn the last two numeric features into categorical codes.
from sklearn.preprocessing import KBinsDiscretizer

# Each feature is cut into equal-width bins with ordinal codes, and the codes are
# then scrambled with (code * k) % 20 so that the category value is no longer
# monotone in the underlying feature. Both multipliers are congruent to 13 mod 20,
# which is coprime with 20, so distinct integer codes stay distinct.

# Last feature: 20 bins.
disczr1 = KBinsDiscretizer(n_bins=20, encode='ordinal', strategy='uniform')
last_feature = X_h[:, -1].reshape(-1, 1)
cat_column1 = (disczr1.fit_transform(last_feature) * 193) % 20

# Second-to-last feature: 15 bins.
disczr2 = KBinsDiscretizer(n_bins=15, encode='ordinal', strategy='uniform')
second_last_feature = X_h[:, -2].reshape(-1, 1)
cat_column2 = (disczr2.fit_transform(second_last_feature) * 173) % 20

In [22]:
# Assemble the design matrix: the first eight numeric features keep their values,
# the two discretised features are appended as 'cat1' and 'cat2'.
numeric_names = [f'col_{i}' for i in range(8)]
predictors = pd.DataFrame(X_h[:, :-2], columns=numeric_names)
for cat_name, cat_values in (('cat1', cat_column1), ('cat2', cat_column2)):
    predictors[cat_name] = cat_values
predictors.head(3)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,cat1,cat2
0,-0.487295,-0.147595,0.702684,-0.980724,-2.212259,-0.7054,1.487825,0.482706,16.0,18.0
1,0.639721,-1.63564,0.643382,0.953232,-0.178569,0.332062,2.076369,-0.756664,16.0,11.0
2,-0.894759,1.810931,1.427439,-0.638438,2.661236,-0.263417,1.35936,-0.759298,16.0,10.0


In [23]:
# Clamp any negative label to 0, in place.
# NOTE(review): the labels come straight from make_classification, which presumably
# already yields non-negative classes, so this is likely a defensive no-op -- confirm.
negative_labels = y_h < 0
y_h[negative_labels] = 0

In [24]:
from sklearn.model_selection import train_test_split
# 80/20 split on the raw NumPy values (column names are dropped), seeded with rs_split.
X_train, X_test, y_train, y_test = train_test_split(
    predictors.values,
    y_h,
    test_size=0.2,
    random_state=rs_split,
)

In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
# Baseline: a shallow random forest trained on the raw ordinal category codes,
# i.e. without any target encoding.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=2,
    max_features=3,
    min_samples_leaf=1,
    random_state=rs_rf,
    n_jobs=-1,
)
model.fit(X_train, y_train)
# Column 1 of predict_proba is the probability of the second class in model.classes_.
preds = model.predict_proba(X_test)[:, 1]

train_accuracy = accuracy_score(y_train, model.predict(X_train))
# Test accuracy is computed from the rounded probabilities rather than model.predict.
test_accuracy = accuracy_score(y_test, preds.round())
test_auc = roc_auc_score(y_test, preds)

print('Train accuracy: ', train_accuracy)
print('Test accuracy: ', test_accuracy)
print('AUC: ', test_auc)

Train accuracy:  0.68975
Test accuracy:  0.6735
AUC:  0.7250086016859303


In [26]:
# Per-feature importances of the baseline forest fitted above; positions follow the
# column order of `predictors` (col_0 ... col_7, cat1, cat2).
model.feature_importances_

array([0.00766255, 0.12502227, 0.20512611, 0.18482931, 0.01124271,
       0.4187756 , 0.00490739, 0.0112219 , 0.        , 0.03121215])

OK, now we will try to use the probabilistic target encoder.


## Cross-validation

We really should use cross-validation to avoid overfitting

### Cross-validation of the target encoding model

First we will train a baseline model using leave-one-out target encoding (`LeaveOneOutEncoder`).

In [27]:
%%time
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV
from category_encoders.leave_one_out import LeaveOneOutEncoder

# Pipeline under test: leave-one-out target encoding of the two categorical
# columns, followed by a random forest.
loo = LeaveOneOutEncoder(cols=['cat1', 'cat2'], sigma=0.05, random_state=2834)
rf = RandomForestClassifier(n_estimators=400, random_state=2834, n_jobs=-1)
pipe = Pipeline(steps=[('loo', loo), ('rf', rf)])

# Search space over the encoder's `sigma` and the forest's shape parameters; keys
# follow the `<step name>__<parameter>` convention of the pipeline above.
param_grid = dict(
    loo__sigma=[0.01, 0.05, 0.1, 0.2],
    rf__max_depth=[20, 30, 40],
    rf__max_features=[1, 2, 3],
    rf__min_samples_leaf=[1, 2, 3],
)

# New 80/20 split (seed 2834, not rs_split). The resulting arrays are wrapped back
# into DataFrames so that the encoder can address 'cat1'/'cat2' by name.
X_train, X_test, y_train, y_test = train_test_split(
    predictors.values, y_h, test_size=0.2, random_state=2834
)
X_train, X_test = (
    pd.DataFrame(part, columns=predictors.columns) for part in (X_train, X_test)
)

# 5-fold grid search over the whole pipeline.
search = GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1)
search.fit(X_train, y_train)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)


Best parameter (CV score=0.811):
{'loo__sigma': 0.01, 'rf__max_depth': 20, 'rf__max_features': 3, 'rf__min_samples_leaf': 1}
Wall time: 5min 42s


In [28]:
# Score the best pipeline found by the grid search on the held-out 20 %.
best_model = search.best_estimator_
test_predict = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predict)
print('Test accuracy: ', test_accuracy)

Test accuracy:  0.8005


### Cross-validation of the probabilistic encoder

First we import a wrapper class (`EncoderWrapper`) that makes it easier for us to run sklearn cross-validation with the encoder.

In [29]:
from category_encoders.posterior_imputation_bc import PosteriorImputationEncoderBC  
from category_encoders.pte_utils import EncoderWrapper

In [30]:
%%time
from sklearn.model_selection import cross_val_score

# Probabilistic encoder plus random forest, combined through EncoderWrapper so that
# GridSearchCV can tune both via the `encoder__*` / `classifier__*` keys below.
# The n_draws, prior_samples_ratio, max_depth and max_features values given here
# are only starting values: each of them is also listed in the grid.
pte = PosteriorImputationEncoderBC(
    cols=['cat1', 'cat2'],
    n_draws=5,
    random_state=2834,
    prior_samples_ratio=0,
)
model = RandomForestClassifier(
    n_estimators=400,
    max_depth=30,
    max_features=1,
    random_state=2834,
    n_jobs=-1,
)
wrapper_model = EncoderWrapper(pte, model)

param_grid = dict(
    encoder__leave_one_out=[False, True],
    encoder__n_draws=[4, 5],
    encoder__prior_samples_ratio=[0, 1E-3],
    classifier__max_depth=[30, 40],
    classifier__max_features=[3, 4],
    classifier__min_samples_leaf=[1, 2],
)

# Same 80/20 split (seed 2834), converted back to DataFrames so the encoder can
# select 'cat1'/'cat2' by name.
X_train, X_test, y_train, y_test = train_test_split(
    predictors.values, y_h, test_size=0.2, random_state=2834
)
X_train, X_test = (
    pd.DataFrame(part, columns=predictors.columns) for part in (X_train, X_test)
)

# GridSearchCV's own n_jobs is left at its default here; only the forest itself
# is created with n_jobs=-1.
search = GridSearchCV(wrapper_model, param_grid, cv=5)
search.fit(X_train, y_train)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score=0.800):
{'classifier__max_depth': 30, 'classifier__max_features': 3, 'classifier__min_samples_leaf': 2, 'encoder__leave_one_out': False, 'encoder__n_draws': 4, 'encoder__prior_samples_ratio': 0}
Wall time: 21min 36s


In [31]:
# Held-out accuracy of the best wrapper model found by the grid search.
best_model = search.best_estimator_
test_predict = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predict)
print('Test accuracy: ', test_accuracy)

Test accuracy:  0.791


### Study how hyperparameters influence the model performance

#### Leave one out or not?

In [17]:
# Re-create the encoder/forest pair with the best hyper-parameters and vary only
# `leave_one_out`.
# `search` must be the grid search over the EncoderWrapper model: the
# 'encoder__*' / 'classifier__*' keys read below exist only in that search's grid.
# NOTE(review): this cell's execution counter (In [17]) is lower than that of the
# cells before it (In [27]-[31]), so its saved output may stem from an earlier
# kernel state -- re-run after "Restart & Run All" before relying on it.
best = search.best_params_

pte = PosteriorImputationEncoderBC(
    cols=['cat1', 'cat2'],
    n_draws=best['encoder__n_draws'],
    random_state=2834,
    prior_samples_ratio=best['encoder__prior_samples_ratio'],
    leave_one_out=best['encoder__leave_one_out'],
)
model = RandomForestClassifier(
    n_estimators=400,
    max_depth=best['classifier__max_depth'],
    max_features=best['classifier__max_features'],
    min_samples_leaf=best['classifier__min_samples_leaf'],
    random_state=2834,
    n_jobs=-1,
)
wrapper_model = EncoderWrapper(pte, model)

# Only the leave-one-out switch is searched; everything else stays fixed.
param_grid = dict(encoder__leave_one_out=[False, True])

# Same 80/20 split (seed 2834), converted back to DataFrames with named columns.
X_train, X_test, y_train, y_test = train_test_split(
    predictors.values, y_h, test_size=0.2, random_state=2834
)
X_train, X_test = (
    pd.DataFrame(part, columns=predictors.columns) for part in (X_train, X_test)
)

search1 = GridSearchCV(wrapper_model, param_grid, cv=5, n_jobs=-1)
search1.fit(X_train, y_train)
print("Best parameter (CV score=%0.3f):" % search1.best_score_)
print(search1.best_params_)

Best parameter (CV score=0.644):
{'encoder__leave_one_out': True}


In [18]:
# Raw cross-validation results of the leave_one_out comparison: timings and
# per-fold test scores, one array entry per value tried (False, True).
search1.cv_results_

{'mean_fit_time': array([1.60519848, 2.56519513]),
 'std_fit_time': array([0.49861409, 0.34268267]),
 'mean_score_time': array([0.99540052, 0.41940403]),
 'std_score_time': array([0.3781206 , 0.27647686]),
 'param_encoder__leave_one_out': masked_array(data=[False, True],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'params': [{'encoder__leave_one_out': False},
  {'encoder__leave_one_out': True}],
 'split0_test_score': array([0.67901235, 0.67901235]),
 'split1_test_score': array([0.62962963, 0.63580247]),
 'split2_test_score': array([0.60493827, 0.64197531]),
 'split3_test_score': array([0.67701863, 0.67701863]),
 'split4_test_score': array([0.57142857, 0.58385093]),
 'mean_test_score': array([0.63240549, 0.64353194]),
 'std_test_score': array([0.04157612, 0.03465316]),
 'rank_test_score': array([2, 1])}