# CATBOOST CLASSIFIER ---------------------------------------------------------------------------

### Get Train, Valid, Test data

In [None]:
import pandas as pd
from my_libs import lib_tools as pt

# Run profile: forwarded to the project data loader and embedded in the pickle
# and model file names used further down. Swap the two lines to run in 'dev'.
# run_type = 'dev'
run_type = 'prd'
# True  -> regenerate the resampled training set and pickle it.
# False -> load a previously pickled resampled training set.
gen_sample = False
# True  -> run the Optuna search, its history plot and the ModelEvaluator cell.
# False -> skip those cells.
find_best_params = False

# Load the train / validation / test splits through the project helper.
X_train, y_train, X_valid, y_valid, X_test, y_test = pt.get_train_valid_test_data(run_type)
print("Train, Valid and Test data loaded")

### Resample data with SMOTEN()

In [None]:
import os

# Cache file paths are keyed by run type and by the row count of the ORIGINAL
# (non-resampled) training set. Built once so the save and load branches can
# never drift apart.
pickle_dir = './pickles'
X_train_rs_path = f'{pickle_dir}/X_train_smote_{run_type}_{X_train.shape[0]}.pkl'
y_train_rs_path = f'{pickle_dir}/y_train_smote_{run_type}_{X_train.shape[0]}.pkl'

if gen_sample:
    X_train_rs, y_train_rs = pt.get_data_resampled(X=X_train, y=y_train, verbose=1)
    # Save data generated. `to_pickle` does not create missing directories, so
    # make sure the target folder exists before writing.
    os.makedirs(pickle_dir, exist_ok=True)
    X_train_rs.to_pickle(X_train_rs_path)
    y_train_rs.to_pickle(y_train_rs_path)
else:
    # Load data previously generated
    X_train_rs = pd.read_pickle(X_train_rs_path)
    y_train_rs = pd.read_pickle(y_train_rs_path)

# Compare the target before and after resampling (project plotting helper).
pt.plot_data_augmentation(y_train, y_train_rs)

### Find best hyperparameters for model with Optuna

In [None]:
import time
import optuna
from sklearn.model_selection import cross_val_score
from catboost import CatBoostClassifier

if find_best_params:

    def objective(trial):
        """Return the mean 5-fold cross-validated F1 score for one Optuna trial.

        The trial samples ``iterations`` and ``learning_rate`` (both on a log
        scale), builds a CatBoostClassifier that treats every column of
        ``X_train`` as categorical, and scores it on the resampled training set.
        """
        sampled_params = {
            'iterations': trial.suggest_int('iterations', 50, 300, log=True),
            'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        }
        candidate = CatBoostClassifier(
            cat_features=list(X_train.columns),
            verbose=0,
            **sampled_params,
        )
        fold_scores = cross_val_score(
            candidate, X_train_rs, y_train_rs, cv=5, scoring="f1", verbose=1
        )
        return fold_scores.mean()

    # Time the whole search (5 trials, maximising the objective above).
    search_start = time.time()
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=5)
    elapsed_seconds = time.time() - search_start

    print("--- CatBoost Classifier - Optimization with Optuna performed in %s seconds ---" % elapsed_seconds)
    print(f"Best params : {study.best_params}")

In [None]:
if find_best_params:
    # Plot the objective value (mean cross-validated F1) reached by each Optuna
    # trial. `study` is created by the hyperparameter search cell, which runs
    # under the same flag.
    from optuna.visualization import plot_optimization_history
    fig = plot_optimization_history(study)
    fig.show()

### Confusion Matrix

In [None]:
from my_libs.model_evaluator import ModelEvaluator

if find_best_params:

    # Best hyperparameters found by the Optuna study, completed with the
    # categorical feature list (every column of the training frame).
    params = {**study.best_params, 'cat_features': list(X_train.columns)}

    # Project evaluator: receives the resampled training set and the untouched
    # test set; `evaluate()` hands back the model kept in `model`.
    evaluator = ModelEvaluator(
        model_type='CatBoostClassifier',
        params=params,
        X_train=X_train_rs,
        y_train=y_train_rs,
        X_test=X_test,
        y_test=y_test,
    )
    model = evaluator.evaluate()

### Fit best model - Plot Train and Test learning curves

In [None]:
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
%matplotlib inline

# Train the best model.
# `study` only exists once the Optuna cell has run (find_best_params=True).
# Without it, fall back to CatBoost's default hyperparameters instead of
# stopping the notebook with a NameError.
try:
    params = dict(study.best_params)
except NameError:
    print("No Optuna study available - using CatBoost default hyperparameters")
    params = {}
# Every column of the training frame is declared categorical.
params['cat_features'] = list(X_train.columns)
model = CatBoostClassifier(**params)
# NOTE(review): the model is fitted on the original training set, whereas the
# Optuna search, ModelEvaluator and the learning curve below use the resampled
# set (X_train_rs / y_train_rs) - confirm this is intended.
model.fit(X_train, y_train)

# Compute the learning curve (5-fold CV, ROC AUC) on the resampled training set
train_sizes, train_scores, valid_scores = learning_curve(model, X_train_rs, y_train_rs, cv=5, scoring='roc_auc', n_jobs=-1)

# Calculate the mean and standard deviation of the training and validation
# scores across the CV folds (axis 1) for each training-set size
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std  = np.std(train_scores, axis=1)
valid_scores_mean = np.mean(valid_scores, axis=1)
valid_scores_std  = np.std(valid_scores, axis=1)

# Plot the learning curve: mean score per training size with a +/- 1 std band
plt.figure(figsize=(8, 6))
plt.title('Catboost Classifier Learning Curve')
plt.xlabel('Training Set Size')
# The curve is scored with scoring='roc_auc', so label the axis accordingly.
plt.ylabel('ROC AUC Score')
plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color='r')
plt.fill_between(train_sizes, valid_scores_mean - valid_scores_std, valid_scores_mean + valid_scores_std, alpha=0.1, color='g')
plt.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training Score')
plt.plot(train_sizes, valid_scores_mean, 'o-', color='g', label='Validation Score')
plt.legend(loc='best')
plt.show()

### Plot ROC curves (from estimator & from predictions)

In [None]:
from sklearn.metrics import RocCurveDisplay

# ROC curve computed by scikit-learn directly from the fitted estimator.
RocCurveDisplay.from_estimator(model, X_test, y_test)
# Chance-level diagonal from (0, 0) to (1, 1) for reference.
plt.plot([0, 1], [0, 1])
plt.title('Catboost - ROC Curve from estimator')

# ROC curve from explicit predictions. `from_predictions` expects scores
# (e.g. positive-class probabilities); hard labels from `model.predict` would
# collapse the curve to a single operating point.
positive_proba = model.predict_proba(X_test)[:, 1]
RocCurveDisplay.from_predictions(y_test, positive_proba)
plt.plot([0, 1], [0, 1])
plt.title('Catboost - ROC Curve from predictions');

### Change proba threshold to improve f1-score

In [None]:
import numpy as np
# `plot_roc_curve` is intentionally not imported: it was unused here and was
# removed in scikit-learn 1.2 (RocCurveDisplay replaces it), so importing it
# makes this cell fail on recent versions.
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score

# Candidate decision thresholds for the positive class.
thresholds = np.arange(0.20, 0.80, 0.01)

# Positive-class probabilities do not depend on the threshold: predict once
# instead of once per loop iteration.
positive_proba = model.predict_proba(X_test)[:, 1]

# One F1 / precision / recall value per threshold, in the same order.
scores_f1 = []
scores_prec = []
scores_recall = []
for k in thresholds:
    y_pred = (positive_proba >= k).astype(bool)
    scores_f1.append(f1_score(y_test, y_pred))
    scores_prec.append(precision_score(y_test, y_pred))
    scores_recall.append(recall_score(y_test, y_pred))

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
fig, axs = plt.subplots(2, 1, figsize=(7, 10))

# Grids are switched on explicitly: a bare `grid()` call toggles visibility,
# so stacking several of them makes the result depend on the call count and on
# the active matplotlib style.

# Top panel: F1 score alone as a function of the decision threshold.
axs[0].plot(thresholds, scores_f1, label='f1')
axs[0].set_title("Valeur du f1-score en fonction du seuil de probabilité d'attribution des classes")
axs[0].grid(True)
axs[0].legend()

# Bottom panel: F1, recall and precision together.
axs[1].plot(thresholds, scores_f1, label='f1')
axs[1].plot(thresholds, scores_recall, label='recall')
axs[1].plot(thresholds, scores_prec, label='precision')
axs[1].set_title("Valeur des métriques en fonction du seuil de probabilité d'attribution des classes")
axs[1].grid(True)
axs[1].legend();

In [None]:
from sklearn.metrics import classification_report
# Classify as positive when the positive-class probability reaches 0.42
# (presumably chosen from the threshold plots - confirm).
positive_proba = model.predict_proba(X_test)[:, 1]
y_pred = (positive_proba >= 0.42).astype(bool)

# Confusion table: actual classes in rows, predicted classes in columns.
confusion_table = pd.crosstab(
    y_test,
    y_pred,
    rownames=['Classe réelle'],
    colnames=['Classe prédite'],
)
display(confusion_table)

print("\nClassification report :")
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
from shapash import SmartExplainer
from my_libs import ref_labels
import pickle
import shap
# Initialise SHAP's JavaScript support for notebook rendering.
shap.initjs()

# Build the shapash explainer around the trained model. Class labels and the
# preprocessing / feature-name dictionaries come from the project's
# `ref_labels` module.
xpl = SmartExplainer(
    model=model,
    label_dict={0: 'Indemne - Blessé léger', 1: 'Tué - Blessé hospitalisé'},
    preprocessing=ref_labels.dic_preproc,
    features_dict=ref_labels.dic_features,  # Optional parameter
    # preprocessing=encoder, # Optional: compile step can use inverse_transform method
    # postprocessing=postprocess # Optional: see tutorial postprocessing
)

# Overwrite y_test's index in place with X_test's index - presumably so that
# shapash can align targets with feature rows; this mutates y_test for any
# code that runs afterwards.
y_test.index = X_test.index

# Compile the explainer on the test set, passing the true targets alongside.
xpl.compile(
    x=X_test,
    # y_pred=y_pred, # Optional: for your own prediction (by default: model.predict)
    y_target=y_test, # Optional: allows to display True Values vs Predicted Values
)

# Start the shapash app and keep the handle so it can be stopped later
# (see the commented `app.kill()` in the next cell).
app = xpl.run_app()

In [None]:
# app.kill()

### Save model with joblib (`.h5` file name)

In [None]:
import os
from joblib import dump

# Save the fitted model with joblib.
# NOTE: despite the `.h5` extension the file is a joblib pickle, not HDF5;
# reload it with `joblib.load`. The name is kept unchanged for existing readers.
model_dir = 'h5_models'
# Create the output directory if needed so a finished training run is not lost
# to a missing folder.
os.makedirs(model_dir, exist_ok=True)
dump(model, f'{model_dir}/model_cb_{run_type}_{X_train.shape[0]}.h5')