# CATBOOST CLASSIFIER ---------------------------------------------------------------------------

### Get Train, Valid, Test data

In [None]:
import pandas as pd
from my_libs import lib_tools as pt

# Run configuration ---------------------------------------------------------
# run_type is passed to the data loader and reused in the names of the files
# written by this notebook. Use 'dev' instead of 'prd' to switch profile.
run_type = 'prd'
# True: regenerate the resampled training set; False: reload it from disk.
gen_sample = True
# True: run the Optuna search; False: train with fixed hyper-parameters.
find_best_params = False

# Feature columns requested from the data loader.
columns = [
    'catv', 'agg', 'dep', 'col', 'catr', 'catu',
    'trajet', 'locp', 'circ', 'situ', 'lum', 'age_cls',
]

# The loader returns the six splits in this fixed order.
(
    X_train, y_train,
    X_valid, y_valid,
    X_test, y_test,
) = pt.get_train_valid_test_data(run_type, columns)
print("Train, Valid and Test data loaded")

### Resample data with SMOTEN()

In [None]:
# Cache files are keyed by the run type and by the number of rows of the
# original (non-resampled) training set.
x_rs_path = f'./pickles/X_train_smote_{run_type}_{X_train.shape[0]}.pkl'
y_rs_path = f'./pickles/y_train_smote_{run_type}_{X_train.shape[0]}.pkl'

if not gen_sample:
    # Reload the resampled training set written by an earlier run.
    X_train_rs = pd.read_pickle(x_rs_path)
    y_train_rs = pd.read_pickle(y_rs_path)
else:
    # Resample the training data, then store the result for later runs.
    X_train_rs, y_train_rs = pt.get_data_resampled(X=X_train, y=y_train, verbose=1)
    X_train_rs.to_pickle(x_rs_path)
    y_train_rs.to_pickle(y_rs_path)

# Plot the original targets next to the resampled ones.
pt.plot_data_augmentation(y_train, y_train_rs)

### Find best hyperparameters for model with Optuna (skipped when `find_best_params` is False)

In [None]:
import time
import optuna
from sklearn.model_selection import cross_val_score
from catboost import CatBoostClassifier

if find_best_params:

    start_time = time.time()

    def objective(trial):
        """Return the mean 5-fold cross-validated F1 score for one Optuna trial.

        The trial samples ``iterations`` (50 to 2000) and ``learning_rate``
        (0.001 to 0.1), both on a log scale, builds a CatBoostClassifier with
        them and evaluates it on the resampled training data.
        """
        n_iterations = trial.suggest_int('iterations', 50, 2000, log=True)
        learning_rate = trial.suggest_float('learning_rate', 0.001, 0.1, log=True)

        classifier_obj = CatBoostClassifier(
            iterations=n_iterations,
            learning_rate=learning_rate,
            # Take the categorical columns from the frame that is actually
            # fitted, as the training cell does.
            cat_features=list(X_train_rs.columns),
            verbose=0,
        )
        fold_scores = cross_val_score(classifier_obj, X_train_rs, y_train_rs, cv=5, scoring="f1", verbose=1)

        # The optimised metric is F1 (scoring="f1"), not accuracy.
        mean_f1 = fold_scores.mean()
        return mean_f1

    # Higher F1 is better, hence direction='maximize'.
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=5)

    print("--- CatBoost Classifier - Optimization with Optuna performed in %s seconds ---" % (time.time() - start_time))
    print(f"Best params : {study.best_params}")

    # Imported here so the plotting dependency is only needed when a search runs.
    from optuna.visualization import plot_optimization_history
    fig = plot_optimization_history(study)
    fig.show()

### Load and Train model

In [None]:
from my_libs.model_evaluator import ModelEvaluator

if find_best_params:
    # Hyper-parameters selected by the Optuna search. Work on a copy so the
    # key added below never touches the dict held by the study.
    params = dict(study.best_params)
    params['cat_features'] = list(X_train_rs.columns)
else:
    # Fixed hyper-parameters; fewer iterations when run_type is not 'prd'.
    n_iter = 1400 if run_type == 'prd' else 200
    l_r = 0.0811
    params = {'iterations': n_iter, 'learning_rate': l_r, 'custom_loss': ["Accuracy", "AUC"],
              'cat_features': list(X_train_rs.columns)}

model = CatBoostClassifier(**params, verbose=0)
# Monitor training on the validation split, not the test split. When an
# eval_set is given, CatBoost by default keeps the best iteration measured on
# it (use_best_model), so passing the test data would select the model on the
# very set used for the final metrics.
model.fit(X_train_rs, y_train_rs, eval_set=(X_valid, y_valid), plot=True)

### Confusion Matrix

In [None]:
from sklearn.metrics import classification_report
# Predictions of the fitted model on the test split.
y_pred = model.predict(X_test)

# Cross-tabulate true classes (rows) against predicted classes (columns).
confusion = pd.crosstab(
    y_test,
    y_pred,
    rownames=['Classe réelle'],
    colnames=['Classe prédite'],
)
display(confusion)

print("\nClassification report -------------------------------\n")
report = classification_report(y_test, y_pred)
print(report)

### Plot loss curve

In [None]:
# Load the per-iteration training log (tab-separated) from catboost_info/.
learn_log = pd.read_csv('catboost_info/learn_error.tsv', sep='\t')

import seaborn as sns
sns.set_theme(style="darkgrid")

# Draw the Logloss value against the iteration number, with y limited to [0, 1].
ax = sns.lineplot(data=learn_log, x="iter", y="Logloss")
ax.set_ylim(0, 1)
ax.set_title('Logloss during Catboost training');

### Plot ROC curves (from estimator & from predictions)

In [None]:
from sklearn.metrics import RocCurveDisplay
import matplotlib.pyplot as plt
import numpy as np

# ROC curve built by scikit-learn directly from the fitted estimator.
roc_from_estimator = RocCurveDisplay.from_estimator(model, X_test, y_test)
# Chance-level diagonal, drawn on the display's own axes and spanning [0, 1].
roc_from_estimator.ax_.plot([0, 1], [0, 1])
roc_from_estimator.ax_.set_title('Catboost - ROC Curve from estimator')

# ROC curve built from explicit scores. from_predictions expects scores (here
# the probability in column 1 of predict_proba), not hard class labels: labels
# would collapse the curve to a single operating point.
positive_proba = model.predict_proba(X_test)[:, 1]
roc_from_predictions = RocCurveDisplay.from_predictions(y_test, positive_proba)
roc_from_predictions.ax_.plot([0, 1], [0, 1])
roc_from_predictions.ax_.set_title('Catboost - ROC Curve from predictions');

### Change proba threshold to improve f1-score

In [None]:
import numpy as np
from sklearn.metrics import classification_report, plot_roc_curve, f1_score, precision_score, recall_score

thresholds = np.arange(0.20, 0.80, 0.01)
scores_f1 = []
scores_prec = []
scores_recall = []
for k in thresholds:
    y_pred = (model.predict_proba(X_test)[:,1] >= k).astype(bool)
    scores_f1.append(f1_score(y_test, y_pred))
    scores_prec.append(precision_score(y_test, y_pred))
    scores_recall.append(recall_score(y_test, y_pred))
    
import matplotlib.pyplot as plt
%matplotlib inline
fig, axs = plt.subplots(2,1, figsize=(7,10))
plt.grid()
axs[0].plot(thresholds, scores_f1, label='f1')
axs[0].set_title("Valeur du f1-score en fonction du seuil de probabilité d'attribution des classes")
axs[1].plot(thresholds, scores_f1, label='f1')
axs[1].plot(thresholds, scores_recall, label='recall')
axs[1].plot(thresholds, scores_prec, label='precision')
axs[1].set_title("Valeur des métriques en fonction du seuil de probabilité d'attribution des classes")
axs[1].grid()
plt.legend();

In [None]:
from sklearn.metrics import classification_report
# Predict the positive class when its probability (column 1 of predict_proba)
# is at least 0.42.
# NOTE(review): 0.42 is presumably read off the threshold sweep - confirm.
proba_pos = model.predict_proba(X_test)[:, 1]
y_pred = (proba_pos >= 0.42).astype(bool)

# True classes in rows, thresholded predictions in columns.
confusion = pd.crosstab(
    y_test,
    y_pred,
    rownames=['Classe réelle'],
    colnames=['Classe prédite'],
)
display(confusion)

print("\nClassification report :")
report = classification_report(y_test, y_pred)
print(report)

In [None]:
from shapash import SmartExplainer
from my_libs import ref_labels
import pickle
import shap
# Initialise SHAP's JavaScript support for rendering in the notebook.
shap.initjs()

# Explainer settings: the fitted model plus the label / preprocessing /
# feature dictionaries defined in ref_labels.
# Other optional arguments left unused here: an encoder as `preprocessing`
# (lets the compile step use inverse_transform) and `postprocessing`.
explainer_options = {
    'model': model,
    'label_dict': ref_labels.dic_target,
    'preprocessing': ref_labels.dic_preproc,
    'features_dict': ref_labels.dic_features,  # Optional parameter
}
xpl = SmartExplainer(**explainer_options)

# Give the targets the same index as the features before compiling.
y_test.index = X_test.index

# y_pred is not supplied, so the explainer uses model.predict by default.
# y_target is optional: it allows displaying True Values vs Predicted Values.
xpl.compile(x=X_test, y_target=y_test)

# Start the Shapash web application and keep its handle.
app = xpl.run_app()

In [None]:
# Uncomment to stop the web app started by xpl.run_app():
# app.kill()

### Save model with joblib (`.h5` file name)

In [None]:
from joblib import dump
# Persist the fitted model with joblib. The file name encodes the run type and
# the number of rows of the original training set.
# NOTE: joblib writes a pickle-based file; the '.h5' suffix is only a name and
# the file is not HDF5, so reload it with joblib.load.
model_path = f'h5_models/model_cb_{run_type}_{X_train.shape[0]}.h5'
dump(model, model_path)