# Classification example demonstrating stacking and mean-encoding

In [None]:
import sys
sys.version

In [None]:
import catboost
print(catboost.__version__)

In [None]:
import ipywidgets
print(ipywidgets.__version__)

In [None]:
import graphviz
print(graphviz.__version__)

In [None]:
import numpy
print(numpy.__version__)

In [None]:
import six, matplotlib, plotly, scipy

In [None]:
import pandas
print(pandas.__version__)

In [None]:
# !pip install catboost

In [None]:
# !pip install ipywidgets

## This is the dataset that catboost became famous for:

In [None]:
# Load the Amazon employee-access data through catboost's `datasets` helper module.
from catboost.datasets import amazon
# amazon() is unpacked into two tables; the train table holds the 'ACTION' target used below.
employee_train, employee_test = amazon()

The data is taken from a Kaggle competition in which catboost performed very well:<br>
[https://www.kaggle.com/c/amazon-employee-access-challenge](https://www.kaggle.com/c/amazon-employee-access-challenge)

In [None]:
import warnings
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Separate the target column ('ACTION') from the feature columns.
y = employee_train['ACTION']
X = employee_train.drop('ACTION', axis=1)

# Split into train & validation set.
# random_state pins the split so that every score reported further down is
# reproducible across re-runs (the models themselves are seeded with 42 as well);
# stratify=y keeps the class ratio of 'ACTION' the same in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.8, random_state=42, stratify=y)

### catboost

[2018: catboost, the new kid on the block from Russia](https://arxiv.org/pdf/1810.11363.pdf)

In [None]:
from catboost import CatBoostClassifier

# Every feature column of this dataset is treated as categorical in this notebook,
# so declare all of them to catboost by position. A fixed range(8) covers only the
# first eight columns and leaves any further column to be handled as a plain number.
cat_features = list(range(X_train.shape[1]))

# AUC is the evaluation metric on the eval set; TotalF1 is tracked in addition.
# Training stops once the eval metric has not improved for 100 rounds.
model = CatBoostClassifier(custom_metric=['TotalF1'], early_stopping_rounds=100, eval_metric='AUC')

# plot=True requests the interactive learning-curve widget
# (NOTE(review): presumably needs ipywidgets installed in the notebook - confirm).
# use_best_model=True keeps the iteration that scored best on the eval set.
model.fit(X_train, y_train, cat_features=cat_features,
          eval_set=(X_val, y_val), plot=True, verbose=False, use_best_model=True)


### lightgbm in comparison

Note that for catboost we did no hyperparameter tuning at all.<br>

Now, let's try lightgbm:

__BUT__: since catboost has an early-stopping mechanism, we give the lightgbm algorithm an eval_set and specify 'early_stopping_rounds' as well. When the score on the validation set does not improve on the current best score within the next 'early_stopping_rounds' iterations, the algorithm stops and returns the best result so far. We also evaluate on the training set to get a feeling for the degree of overfitting.

In [None]:
import lightgbm

# Plain LightGBM baseline on the raw columns: AUC as metric, a small learning
# rate with many trees, and early stopping after 100 rounds without improvement.
# NOTE(review): as a constructor argument `verbose` is LightGBM's verbosity level,
# not a "log every N iterations" period - confirm that 100 is really intended.
model = lightgbm.LGBMClassifier(
    metric='auc',
    n_estimators=5000,
    learning_rate=0.02,
    random_state=42,
    verbose=100,
    early_stopping_rounds=100,
)

# Evaluate on the training split as well as the validation split so that the
# gap between the two curves is visible.
model.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
)

In [None]:
# Show the (rows, columns) shape of the train and validation splits as a sanity check.
display(X_train.shape, X_val.shape)

## Now, we add the mean-encoding manually as a preprocessing step

In [None]:
!pip install category_encoders

### This is how lightgbm could mimic catboost:

Since all variables are categorical, we add a OneHotEncoder. Since catboost became famous for its category-encoding, we add mean-encoding as a second step. However, category-encoding always bears the risk of overfitting. This is why we do it via __stacking__.<br>
If we improve on the solution above, it could also be because of the OneHotEncoder and not because of the ValidatedStackedMeanEncoder. Hence, we first have to test the pipeline with just the OneHotEncoder.

In [None]:
import sys
import os
import numpy as np
sys.path.append(os.path.abspath('../scripts'))
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from validatedstackedmeanencoder import ValidatedStackedMeanEncoder
# Column groups: there are no numeric features, every column goes to the
# categorical branch.
numerical = []
categorical = X_train.columns

# Numeric branch (median imputation, then standardisation). It is wired to the
# empty `numerical` list, so it is not relevant for this dataset.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Preprocessing variant with mean-encoding: the categorical columns are fed to
# a dense one-hot encoder (unknown categories are ignored at transform time)
# and, in parallel, to the ValidatedStackedMeanEncoder.
preprocessor2 = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical),
        (
            'ohe',
            OneHotEncoder(handle_unknown='ignore', sparse_output=False),
            categorical,
        ),
        ('mean_enc', ValidatedStackedMeanEncoder(), categorical),
    ]
)

### without mean-encoding
The setup is the same, except that instead of the ValidatedStackedMeanEncoder, we pass FunctionTransformer(None) to the categorical variables - everything else is the same.<br>

__Attention__: because we want to keep early stopping (in order to mimic catboost), we first have to one-hot-encode the validation data set as well. Otherwise it would have a different number of columns.

In [None]:
import lightgbm
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import StratifiedKFold

# LightGBM set up to mimic the catboost run: AUC as metric and early stopping
# after 100 rounds without improvement.
# The obsolete `silent=True` flag is intentionally not passed: it contradicted the
# explicit verbose=1 and is not a recognised LGBMClassifier argument in current
# LightGBM releases; `verbose` alone controls the log output.
model = lightgbm.LGBMClassifier(boosting_type='gbdt', num_leaves=31, max_depth=-1, learning_rate=0.02,
                                n_estimators=5000, subsample_for_bin=20000, objective='binary',
                                subsample=1.0, subsample_freq=0, colsample_bytree=1.0,
                                n_jobs=-1, importance_type='split',
                                is_unbalance=False, scale_pos_weight=1.0, random_state=42, metric='auc',
                                verbose=1, early_stopping_rounds=100
                                )

# Baseline preprocessing without mean-encoding: dense one-hot columns plus the
# original columns passed through unchanged (FunctionTransformer(None) is the identity).
preprocessor1 = ColumnTransformer(transformers=[('num', numeric_transformer, numerical),
                                                ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical),
                                                ('empty', FunctionTransformer(None), categorical)])
custom_pipeline1 = make_pipeline(
    preprocessor1,
    model
)

# The eval sets are handed straight to LightGBM and are not run through the
# pipeline's preprocessing step, so they are transformed up front to get the same
# columns as the training matrix.
X_train_preprocessed = preprocessor1.fit_transform(X_train)
X_val_preprocessed = preprocessor1.transform(X_val)

# 'lgbmclassifier__' routes eval_set to the fit() of the final pipeline step.
custom_pipeline1.fit(X_train, y_train, lgbmclassifier__eval_set=[(X_train_preprocessed, y_train), (X_val_preprocessed, y_val)])
# score() of a classifier pipeline reports the mean accuracy on the given data.
print("model score: %.3f" % custom_pipeline1.score(X_val, y_val))

In [None]:
from sklearn.metrics import classification_report, roc_auc_score

# Precision / recall / F1 per class for the one-hot-only pipeline on the validation split.
ohe_val_predictions = custom_pipeline1.predict(X_val)
print(classification_report(y_val, ohe_val_predictions))

In [None]:
# ROC-AUC of the one-hot-only pipeline; the second predict_proba column is used as the score.
ohe_val_scores = custom_pipeline1.predict_proba(X_val)[:, 1]
print("AUC score: ", roc_auc_score(y_val, ohe_val_scores))

### Conclusion: the OneHotEncoder did not improve the result

### Now with proper mean-encoding as done in catboost

We first transform the train and validation sets for the early stopping:

In [None]:
# Silence all warnings from here on (this is a process-wide filter, not limited to this cell).
warnings.filterwarnings('ignore')
# Fit the one-hot + mean-encoding preprocessor on the training split and encode it;
# y_train is passed along because the mean encoder is fitted against the target.
new_X = preprocessor2.fit_transform(X_train, y_train)

In [None]:
# Encode the validation split with the preprocessor fitted on the training split above.
new_X_val = preprocessor2.transform(X_val)

In [None]:
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold

# Build the mean-encoding pipeline around a fresh, unfitted copy of the LightGBM
# estimator. Passing `model` itself would refit the very object that is also the
# final step of custom_pipeline1, leaving that earlier pipeline with a booster
# trained on a differently shaped feature matrix.
custom_pipeline = make_pipeline(
    preprocessor2,
    clone(model)
)

# NOTE(review): fit() re-fits preprocessor2 inside the pipeline, whereas new_X and
# new_X_val come from the earlier stand-alone fit_transform. If
# ValidatedStackedMeanEncoder is not deterministic, the eval sets may be encoded
# differently from the matrix the model is actually trained on - confirm.
custom_pipeline.fit(X_train, y_train, lgbmclassifier__eval_set=[(new_X, y_train), (new_X_val, y_val)])
# score() of a classifier pipeline reports the mean accuracy on the given data.
print("model score: %.3f" % custom_pipeline.score(X_val, y_val))

In [None]:
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    precision_recall_curve,
    roc_curve,
    roc_auc_score,
    log_loss,
)

# Precision / recall / F1 per class for the mean-encoding pipeline on the validation split.
mean_enc_val_predictions = custom_pipeline.predict(X_val)
print(classification_report(y_val, mean_enc_val_predictions))

In [None]:
# ROC-AUC of the mean-encoding pipeline; the second predict_proba column is used as the score.
mean_enc_val_scores = custom_pipeline.predict_proba(X_val)[:, 1]
print("AUC score: ", roc_auc_score(y_val, mean_enc_val_scores))

### Can we tune the parameters? HPO? - to be left as an exercise

In [None]:
!pwd

In [None]:
import pandas as pd
# NOTE(review): reads a CSV from a hard-coded absolute path in one user's home
# directory, so this cell only runs on that machine. It also appears unrelated to
# the Amazon example above - confirm whether these scratch cells should stay.
df = pd.read_csv('/Users/bima/Downloads/train.csv')
# List the column names of the loaded table.
df.columns

In [None]:
# Show a hand-picked subset of columns of `df` (raises KeyError if any of them is missing).
df[['factor','idhogar','agesq','Target', 'Index', 'Id']]

In [None]:
# NOTE(review): second CSV from another hard-coded absolute path on the same
# user's machine - not portable; confirm whether this cell should stay.
df2 = pd.read_csv('/Users/bima/Downloads/costa_rica/train.csv')
# List the column names of the second table.
df2.columns

In [None]:
# Join the two tables on 'Id' so that their 'Target' columns end up side by side
# (pandas suffixes the clashing column names with _x and _y).
df_new = df[['Id', 'Target']].merge(df2[['Id', 'Target']], on='Id')

In [None]:
# Show the 'Target' values from the first (Target_x) and second (Target_y) file next to each other.
df_new[['Target_x', 'Target_y']]