# Catbost classifier

In [3]:
from pipeline import *

In [6]:
X_train, X_test, y_train, y_test = get_train_test(fname="dataset_v1.csv", balanced=False)
X_train,X_val, y_train,y_val = train_test_split(X_train,y_train,test_size=0.2,random_state=rng)
X_train = pd.DataFrame(preprocessing_num.fit_transform(X_train,y_train),index=X_train.index,columns=preprocessing_num.get_feature_names_out())
X_val = pd.DataFrame(preprocessing_num.transform(X_val),index=X_val.index,columns=preprocessing_num.get_feature_names_out())


set()


In [5]:
from catboost import CatBoostClassifier
cat_features = [i for i in range(len(num_cols_basic+num_cols_imputate),33)]
from sklearn.metrics import make_scorer, f1_score
f1_class_0 = make_scorer(f1_score, pos_label=0)

from sklearn.metrics import confusion_matrix, \
                  classification_report,  precision_score, recall_score, f1_score, average_precision_score

f1_class_0_scorer = make_scorer(f1_score, pos_label=0)
f1_class_1_scorer = make_scorer(f1_score, pos_label=1)
recall_class_0_scorer = make_scorer(recall_score, pos_label=0)
precision_class_0_scorer = make_scorer(precision_score, pos_label=0)
average_precision_score_macro = make_scorer(average_precision_score, average='macro')



We search for the optimal parameters with Optuna

In [30]:
import numpy as np
import optuna
from optuna.integration import CatBoostPruningCallback

import catboost as cb
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split


def objective(trial: optuna.Trial) -> float:
    data, target = X_train, y_train
    train_x, valid_x, train_y, valid_y = train_test_split(data, target, test_size=0.25)
    
    param = {
        "objective": trial.suggest_categorical("objective", ["Logloss"]),
        "iterations": trial.suggest_int("iterations",300,1500),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1,12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Plain","Ordered"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", [ "Bernoulli", "MVS"]
        ),
        "auto_class_weights": trial.suggest_categorical("auto_class_weights", ["Balanced"]),
        "used_ram_limit": "25gb",
        "eval_metric": "PRAUC",
        "cat_features": cat_features,
        
    }

    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1, log=True)

    gbm = cb.CatBoostClassifier(**param)

    pruning_callback = CatBoostPruningCallback(trial, "PRAUC")
    gbm.fit(
        train_x,
        train_y,
        eval_set=[(valid_x, valid_y)],
        verbose=0,
        early_stopping_rounds=120,
        callbacks=[pruning_callback],
    )

    # evoke pruning manually.
    pruning_callback.check_pruned()

    preds = gbm.predict(valid_x)
    pred_labels = np.rint(preds)
    f1 = f1_score(valid_y, pred_labels,pos_label=0)

    return f1


if __name__ == "__main__":
    study = optuna.create_study(
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=5), direction="maximize"
    )
    study.optimize(objective, n_trials=300, timeout=700)

    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

[32m[I 2023-06-01 16:24:43,976][0m A new study created in memory with name: no-name-28f9b46a-c20b-410c-b627-5b05b4832c57[0m
  pruning_callback = CatBoostPruningCallback(trial, "PRAUC")
[32m[I 2023-06-01 16:24:53,319][0m Trial 0 finished with value: 0.5134196400378909 and parameters: {'objective': 'Logloss', 'iterations': 479, 'colsample_bylevel': 0.014090363231753176, 'depth': 7, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS', 'auto_class_weights': 'Balanced'}. Best is trial 0 with value: 0.5134196400378909.[0m
  pruning_callback = CatBoostPruningCallback(trial, "PRAUC")
[32m[I 2023-06-01 16:25:07,644][0m Trial 1 finished with value: 0.5204669260700389 and parameters: {'objective': 'Logloss', 'iterations': 758, 'colsample_bylevel': 0.015027530768162203, 'depth': 5, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS', 'auto_class_weights': 'Balanced'}. Best is trial 1 with value: 0.5204669260700389.[0m
  pruning_callback = CatBoostPruningCallback(trial, "PRAUC")
[32m[I 2023-0

Number of finished trials: 238
Best trial:
  Value: 0.5282476859240345
  Params: 
    objective: Logloss
    iterations: 548
    colsample_bylevel: 0.0466625323732928
    depth: 12
    boosting_type: Ordered
    bootstrap_type: MVS
    auto_class_weights: Balanced


We train a catboost classifier with the parameters found

In [10]:
from catboost import CatBoostClassifier
pipeline = imbPipeline(
    [
        ("classifier", CatBoostClassifier(random_state=rng, iterations=550,objective="Logloss",colsample_bylevel=0.0466,depth=12,boosting_type="Ordered",bootstrap_type="MVS",auto_class_weights="Balanced",subsample=0.661
                                         )),
    ]
)
cat_features = [i for i in range(len(num_cols_basic+num_cols_imputate),33)]
pipeline.fit(X_train, y_train, classifier__cat_features=cat_features,classifier__plot=True)



MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.089966
0:	learn: 0.6786690	total: 49.1ms	remaining: 27s
1:	learn: 0.6586176	total: 581ms	remaining: 2m 39s
2:	learn: 0.6457912	total: 680ms	remaining: 2m 3s
3:	learn: 0.6331633	total: 780ms	remaining: 1m 46s
4:	learn: 0.6208888	total: 1.31s	remaining: 2m 22s
5:	learn: 0.6110197	total: 1.84s	remaining: 2m 46s
6:	learn: 0.6110197	total: 1.86s	remaining: 2m 24s
7:	learn: 0.6110196	total: 1.87s	remaining: 2m 6s
8:	learn: 0.6045797	total: 2s	remaining: 2m
9:	learn: 0.6014278	total: 2.06s	remaining: 1m 51s
10:	learn: 0.5957031	total: 2.18s	remaining: 1m 46s
11:	learn: 0.5902666	total: 2.37s	remaining: 1m 46s
12:	learn: 0.5896985	total: 2.39s	remaining: 1m 38s
13:	learn: 0.5851644	total: 3.03s	remaining: 1m 55s
14:	learn: 0.5818355	total: 3.18s	remaining: 1m 53s
15:	learn: 0.5818355	total: 3.2s	remaining: 1m 46s
16:	learn: 0.5795386	total: 3.25s	remaining: 1m 41s
17:	learn: 0.5769619	total: 3.31s	remaining: 1m 37s
18:	learn: 0.5741536	total: 3.71s	remaining: 1m 43s
19:	

In [11]:

y_pred = pipeline.predict(X_val)
print(classification_report(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))
from sklearn.metrics import roc_auc_score
print(roc_auc_score(y_val,y_pred))

              precision    recall  f1-score   support

           0       0.39      0.72      0.51      2318
           1       0.91      0.70      0.79      8760

    accuracy                           0.71     11078
   macro avg       0.65      0.71      0.65     11078
weighted avg       0.80      0.71      0.73     11078

[[1678  640]
 [2601 6159]]
0.7134910527497724
