# CATBOOST CLASSIFIER ------------------------------------------------------------------------------------

### Get Train, Valid, Test data

In [3]:
import pandas as pd
from my_libs import lib_tools as pt

# Notebook switches.
#   run_type         : dataset variant handed to the loader and used in the
#                      pickle / model file names ('dev' is the alternative).
#   gen_sample       : True  -> recompute the resampled training set,
#                      False -> reload it from the pickles written earlier.
#   find_best_params : True  -> run the Optuna search and the cells gated on it.
run_type = 'prd'
gen_sample, find_best_params = True, True

# The loader returns the six splits in this fixed order.
splits = pt.get_train_valid_test_data(run_type)
X_train, y_train, X_valid, y_valid, X_test, y_test = splits
print("Train, valid and Test data loaded")

Train, valid and Test data loaded


### Resample data with SMOTEN()

In [4]:
from pathlib import Path

# Cache files for the resampled training set (one pair per run_type).
x_cache_path = f'./pickles/X_train_smote_{run_type}.pkl'
y_cache_path = f'./pickles/y_train_smote_{run_type}.pkl'

if gen_sample:
    # Create the cache directory *before* resampling: the resampling step is
    # slow (about 20 minutes in the recorded run) and to_pickle() does not
    # create missing directories, so a missing folder would waste the whole run.
    Path(x_cache_path).parent.mkdir(parents=True, exist_ok=True)

    # Replace the training split by its resampled version
    # (presumably SMOTEN oversampling, per the section title -- see my_libs.lib_tools).
    X_train, y_train = pt.get_data_resampled(X=X_train, y=y_train, verbose=1)

    # Save data generated
    X_train.to_pickle(x_cache_path)
    y_train.to_pickle(y_cache_path)
else:
    # Load data previously generated
    X_train = pd.read_pickle(x_cache_path)
    y_train = pd.read_pickle(y_cache_path)



--- Smote applied in 1239.2710390090942 seconds ---
Classes cardinality after resampling :
0    97764
1    97764
Name: grav, dtype: int64
X shape : (128000, 27) -> (195528, 27)
y shape : (128000,) -> (195528,)


### Find best hyperparameters for model with Optuna

In [5]:
import time
import optuna
from sklearn.model_selection import cross_val_score
from catboost import CatBoostClassifier

# Hyperparameter search for CatBoost with Optuna (skipped when find_best_params is False).
if find_best_params:

    # Fixed sampler seed so that re-running the cell explores the same trials.
    OPTUNA_SEED = 42

    start_time = time.time()

    def objective(trial):
        """Return the mean 3-fold cross-validated ROC AUC for one Optuna trial.

        The trial samples `iterations` in [50, 300] and `learning_rate` in
        [0.001, 0.1]; every column of X_train is declared as a categorical
        feature to CatBoost.
        """
        dt_iterations    = trial.suggest_int('iterations', 50, 300)
        dt_learning_rate = trial.suggest_float('learning_rate', 0.001, 0.1)

        classifier_obj = CatBoostClassifier(iterations=dt_iterations, learning_rate=dt_learning_rate, cat_features=list(X_train.columns), verbose=0)

        # NOTE(review): X_train may already be oversampled by the resampling cell;
        # cross-validating on resampled rows can give optimistic scores -- consider
        # scoring on X_valid / y_valid instead. TODO confirm.
        fold_scores = cross_val_score(classifier_obj, X_train, y_train, cv=3, scoring="roc_auc", verbose=1)
        mean_auc = fold_scores.mean()

        return mean_auc

    study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=OPTUNA_SEED))
    study.optimize(objective, n_trials=5)

    print("--- CatBoost Classifier - Optimization with Optuna performed in %s seconds ---" % (time.time() - start_time))
    print(f"Best params : {study.best_params}")

[32m[I 2023-02-26 09:37:37,559][0m A new study created in memory with name: no-name-0fbdf5a4-edd0-4481-8d30-761e75ab0625[0m
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  4.5min finished
[32m[I 2023-02-26 09:42:09,445][0m Trial 0 finished with value: 0.854862300202781 and parameters: {'iterations': 246, 'learning_rate': 0.09650720626063021}. Best is trial 0 with value: 0.854862300202781.[0m
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  5.0min finished
[32m[I 2023-02-26 09:47:11,123][0m Trial 1 finished with value: 0.854304193403134 and parameters: {'iterations': 272, 'learning_rate': 0.07402691830589449}. Best is trial 0 with value: 0.854862300202781.[0m
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   51.4s finished


--- CatBoost Classifier - Optimization with Optuna performed in 734.0666670799255 seconds ---
Best params : {'iterations': 246, 'learning_rate': 0.09650720626063021}


In [6]:
if find_best_params:
    from optuna.visualization import plot_optimization_history

    # The figure is created inside an `if` body, so it is not the cell's last
    # top-level expression and IPython would not render it automatically:
    # display it explicitly.
    history_fig = plot_optimization_history(study)
    history_fig.show()

### Confusion Matrix

In [7]:
from my_libs.model_evaluator import ModelEvaluator

if find_best_params:

    # Best Optuna hyperparameters, plus the declaration of every training
    # column as a categorical feature.
    params = {**study.best_params, 'cat_features': list(X_train.columns)}

    # Project helper that is handed the train and test splits and returns a
    # model from evaluate() (see my_libs.model_evaluator for what it reports).
    evaluator = ModelEvaluator(
        model_type='CatBoostClassifier',
        params=params,
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
    )
    model = evaluator.evaluate()

0:	learn: 0.6639727	total: 464ms	remaining: 1m 53s
1:	learn: 0.6402824	total: 1.06s	remaining: 2m 9s
2:	learn: 0.6198956	total: 1.61s	remaining: 2m 10s
3:	learn: 0.6023591	total: 2.27s	remaining: 2m 17s
4:	learn: 0.5904001	total: 2.83s	remaining: 2m 16s
5:	learn: 0.5784106	total: 3.43s	remaining: 2m 17s
6:	learn: 0.5691879	total: 4.04s	remaining: 2m 18s
7:	learn: 0.5597514	total: 4.59s	remaining: 2m 16s
8:	learn: 0.5517199	total: 5.07s	remaining: 2m 13s
9:	learn: 0.5448881	total: 5.59s	remaining: 2m 11s
10:	learn: 0.5360229	total: 6.12s	remaining: 2m 10s
11:	learn: 0.5312422	total: 6.6s	remaining: 2m 8s
12:	learn: 0.5247815	total: 7.13s	remaining: 2m 7s
13:	learn: 0.5198117	total: 7.69s	remaining: 2m 7s
14:	learn: 0.5144803	total: 8.22s	remaining: 2m 6s
15:	learn: 0.5086136	total: 8.74s	remaining: 2m 5s
16:	learn: 0.5048967	total: 9.23s	remaining: 2m 4s
17:	learn: 0.5002208	total: 9.74s	remaining: 2m 3s
18:	learn: 0.4967693	total: 10.3s	remaining: 2m 3s
19:	learn: 0.4933537	total: 10.9

Classe prédite,0,1
Classe réelle,Unnamed: 1_level_1,Unnamed: 2_level_1
0,28769,3610
1,4100,3520



Classification report :
              precision    recall  f1-score   support

           0       0.88      0.89      0.88     32379
           1       0.49      0.46      0.48      7620

    accuracy                           0.81     39999
   macro avg       0.68      0.68      0.68     39999
weighted avg       0.80      0.81      0.80     39999



### Fit best model - Plot Train and Validation learning curves

In [None]:
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
%matplotlib inline

# Train the best model.
# NOTE: `study` is only created by the Optuna cell when find_best_params is True,
# so this cell requires that search to have been run in the current kernel.
params = study.best_params
params['cat_features'] = list(X_train.columns)
model = CatBoostClassifier(**params)
model.fit(X_train, y_train)

# Compute the learning curve (5-fold cross-validation, scored with ROC AUC)
train_sizes, train_scores, valid_scores = learning_curve(model, X_train, y_train, cv=5, scoring='roc_auc', n_jobs=-1)

# Calculate the mean and standard deviation of the training and validation scores
# (axis=1 aggregates over the cross-validation folds for each training size)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std  = np.std(train_scores, axis=1)
valid_scores_mean = np.mean(valid_scores, axis=1)
valid_scores_std  = np.std(valid_scores, axis=1)

# Plot the learning curve: mean score per training size with a +/- 1 std band
plt.figure(figsize=(8, 6))
plt.title('Catboost Classifier Learning Curve')
plt.xlabel('Training Set Size')
# The curve is scored with scoring='roc_auc', so label the axis accordingly
plt.ylabel('ROC AUC Score')
plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color='r')
plt.fill_between(train_sizes, valid_scores_mean - valid_scores_std, valid_scores_mean + valid_scores_std, alpha=0.1, color='g')
plt.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training Score')
plt.plot(train_sizes, valid_scores_mean, 'o-', color='g', label='Validation Score')
plt.legend(loc='best')
plt.show()

### Plot ROC curves (from estimator & from predictions)

In [None]:
from sklearn.metrics import RocCurveDisplay

# ROC curve computed by scikit-learn directly from the fitted estimator.
RocCurveDisplay.from_estimator(model, X_test, y_test)
plt.plot([0, 1], [0, 1])  # chance-level diagonal over the full [0, 1] range
plt.title('Catboost - ROC Curve from estimator')

# ROC curve from explicit scores. from_predictions() expects continuous scores
# (e.g. the probability of the positive class); hard 0/1 labels from predict()
# would collapse the curve to a single operating point.
positive_class_scores = model.predict_proba(X_test)[:, 1]
RocCurveDisplay.from_predictions(y_test, positive_class_scores)
plt.plot([0, 1], [0, 1])  # chance-level diagonal
plt.title('Catboost - ROC Curve from predictions');

### Save model to h5

In [None]:
from joblib import dump
from pathlib import Path

# Persist the fitted model with joblib.
# NOTE: despite the `.h5` suffix the file is a joblib (pickle-based) dump, not an
# HDF5 file -- reload it with joblib.load(). The name is kept unchanged so that
# existing consumers of the file still find it.
model_path = f'h5_models/model_cb_{run_type}_200000.h5'

# dump() does not create missing directories, so make sure the folder exists.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
dump(model, model_path)