# Classification example demonstrating stacking and mean-encoding

In [None]:
!pip install catboost

In [None]:
!pip install ipywidgets

In [None]:
# Load the Amazon employee-access data through catboost's dataset helper.
from catboost.datasets import amazon
# amazon() returns two objects, unpacked here as the train and test parts;
# only employee_train (which holds the 'ACTION' target column) is used in this part of the notebook.
employee_train, employee_test = amazon()

The data is taken from a Kaggle competition on which CatBoost performs particularly well:<br>
[https://www.kaggle.com/c/amazon-employee-access-challenge](https://www.kaggle.com/c/amazon-employee-access-challenge)

In [None]:
import warnings
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Separate the target column ('ACTION') from the feature columns.
y = employee_train['ACTION']
X = employee_train.drop('ACTION', axis=1)

# Split into train & validation set (80% / 20%).
# A fixed random_state makes the split -- and therefore every score reported
# further down -- reproducible across kernel restarts; stratify=y keeps the
# class proportions of ACTION identical in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.8, random_state=42, stratify=y)

### catboost

[2018: CatBoost, the new kid on the block from Russia](https://arxiv.org/pdf/1810.11363.pdf)

In [None]:
from catboost import CatBoostClassifier
import ipywidgets  # NOTE(review): presumably required for the interactive plot (plot=True) -- confirm

# Treat every feature column as categorical, in line with the mean-encoding
# section of this notebook (categorical = X_train.columns). Deriving the indices
# from the frame avoids silently leaving trailing columns numeric, which a
# hard-coded range(8) does as soon as the frame has more than eight columns.
cat_features = list(range(X_train.shape[1]))

# AUC is the evaluation metric; TotalF1 is additionally tracked as a custom metric.
model = CatBoostClassifier(custom_metric=['TotalF1'], early_stopping_rounds=100, eval_metric='AUC')

# The validation split serves as eval_set; use_best_model=True is requested so
# that the best iteration on that set is kept.
model.fit(X_train, y_train, cat_features=cat_features,
          eval_set=(X_val, y_val), plot=True, verbose=False, use_best_model=True)


### lightgbm in comparison

Note that we do no hyperparameter tuning at all.

In [None]:
import lightgbm

# Untuned LightGBM baseline: at most 5000 boosting rounds with a small
# learning rate, evaluated with AUC.
model = lightgbm.LGBMClassifier(
    n_estimators=5000,
    learning_rate=0.02,
    metric='auc',
    random_state=42,
)
# Both splits are passed as evaluation sets; verbose=100 and
# early_stopping_rounds=100 control logging and early stopping.
model.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    early_stopping_rounds=100,
    verbose=100,
)

In [None]:
# Show the (rows, columns) shape of the training and validation features.
display(*(split.shape for split in (X_train, X_val)))

## Now, we add the mean-encoding manually as a preprocessing step

In [None]:
!pip install category_encoders

In [None]:
import sys
import os
import numpy as np
sys.path.append(os.path.abspath('../scripts'))
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from validatedstackedmeanencoder import ValidatedStackedMeanEncoder
# Column roles: there are no numerical columns, every column is categorical.
numerical = []
categorical = X_train.columns

# Numerical columns (none at the moment): median imputation, then standardisation.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Preprocessor that applies ValidatedStackedMeanEncoder to the categorical columns.
preprocessor2 = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical),
        # Alternative encoding, currently disabled:
        # ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False), categorical),
        ('mean_enc', ValidatedStackedMeanEncoder(), categorical),
    ]
)

### without mean-encoding
The setup is the same as for the mean-encoding pipeline, except that the categorical columns are passed through unchanged via `FunctionTransformer(None)` (the identity transform); everything else is identical.

In [None]:
import lightgbm
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import FunctionTransformer

# LightGBM classifier with its settings written out explicitly.
model = lightgbm.LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    num_leaves=31,
    max_depth=-1,
    learning_rate=0.02,
    n_estimators=5000,
    subsample_for_bin=20000,
    subsample=1.0,
    subsample_freq=0,
    colsample_bytree=1.0,
    is_unbalance=False,
    scale_pos_weight=1.0,
    importance_type='split',
    random_state=42,
    n_jobs=-1,
    silent=True,
    verbose=1,
)

# Baseline pipeline: the categorical columns go through FunctionTransformer(None)
# instead of the mean encoder.
custom_pipeline1 = make_pipeline(
    ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical),
            ('empty', FunctionTransformer(None), categorical),
        ]
    ),
    model,
)

# The lgbmclassifier__ prefix routes these fit parameters to the final pipeline step.
custom_pipeline1.fit(
    X_train,
    y_train,
    lgbmclassifier__early_stopping_rounds=100,
    lgbmclassifier__eval_set=[(X_train, y_train), (X_val, y_val)],
)
print("model score: %.3f" % custom_pipeline1.score(X_val, y_val))

In [None]:
from sklearn.metrics import classification_report, roc_auc_score

# Per-class precision / recall / F1 of the baseline pipeline on the validation split.
print(
    classification_report(
        y_true=y_val,
        y_pred=custom_pipeline1.predict(X_val),
    )
)

In [None]:
# Validation AUC of the baseline pipeline, computed from column 1 of predict_proba.
print(
    "AUC score: ",
    roc_auc_score(
        y_true=y_val,
        y_score=custom_pipeline1.predict_proba(X_val)[:, 1],
    ),
)

### with mean-encoding

We first transform the training and validation sets so that they can be used as the evaluation sets for early stopping:

In [None]:
# Silence all warnings from this point on (the filter is global and stays active).
warnings.filterwarnings(action='ignore')
# Fit the preprocessor on the training split and keep the transformed
# training features for use as an evaluation set.
new_X = preprocessor2.fit_transform(X=X_train, y=y_train)

In [None]:
# Apply the already-fitted preprocessor to the validation split (no refit).
new_X_val = preprocessor2.transform(X=X_val)

In [None]:
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold

# Mean-encoding pipeline. The classifier is an unfitted clone of `model`
# (identical hyperparameters) rather than the very same object that already
# sits inside custom_pipeline1: sharing one estimator instance would mean that
# fitting this pipeline silently refits the baseline pipeline's classifier and
# invalidates its predictions whenever the baseline cells are re-run.
custom_pipeline = make_pipeline(
    preprocessor2,
    clone(model),
)

# Fit parameters are handed to the classifier as-is, so the evaluation sets
# must already be mean-encoded (new_X / new_X_val).
custom_pipeline.fit(X_train, y_train, lgbmclassifier__eval_set=[(new_X, y_train), (new_X_val, y_val)],
                    lgbmclassifier__early_stopping_rounds=100)
print("model score: %.3f" % custom_pipeline.score(X_val, y_val))

In [None]:
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    log_loss,
    precision_recall_curve,
    roc_auc_score,
    roc_curve,
)

# Per-class precision / recall / F1 of the mean-encoding pipeline on the validation split.
print(
    classification_report(
        y_true=y_val,
        y_pred=custom_pipeline.predict(X_val),
    )
)

In [None]:
# Validation AUC of the mean-encoding pipeline, computed from column 1 of predict_proba.
print(
    "AUC score: ",
    roc_auc_score(
        y_true=y_val,
        y_score=custom_pipeline.predict_proba(X_val)[:, 1],
    ),
)

### Can we tune the parameters? For example, the learning rate?

In [None]:
# Validation AUC for each learning rate, in the order the rates are tried.
results = []
# Six learning rates, log-spaced between 10**-2.5 and 10**-0.2.
for lr in 10**np.linspace(-2.5, -0.2, 6):
    custom_pipeline.set_params(lgbmclassifier__learning_rate=lr)
    # Refit the whole pipeline with the pre-transformed evaluation sets.
    custom_pipeline.fit(
        X_train,
        y_train,
        lgbmclassifier__early_stopping_rounds=100,
        lgbmclassifier__eval_set=[(new_X, y_train), (new_X_val, y_val)],
    )
    print("model score: %.3f" % custom_pipeline.score(X_val, y_val))
    results.append(
        roc_auc_score(y_val, custom_pipeline.predict_proba(X_val)[:, 1])
    )

In [1]:
# Pair each validation AUC with the learning rate that produced it.
# The grid has to be the one the tuning loop iterated over
# (10**np.linspace(-2.5, -0.2, 6)); a different start exponent would label
# the scores with learning rates that were never trained.
list(zip(results, 10**np.linspace(-2.5, -0.2, 6)))

NameError: name 'results' is not defined