In [None]:
import pandas as pd
from my_libs import lib_tools as pt

# --- Run configuration -------------------------------------------------------
# run_smote: True  -> resample the training set with SMOTEN and cache the result
#            False -> reuse the previously cached resampled training set
run_smote = False
run_type = 'dev'
# run_type = 'prd'

# Train / test pickle locations for each supported run type.
dataset_files = {
    'dev': ('pickles/df-dev-train.pkl', 'pickles/df-dev-test.pkl'),
    'prd': ('pickles/df-prd-train.pkl', 'pickles/df-prd-test.pkl'),
}
# Fail fast on a typo in run_type instead of hitting a NameError further down
# because the file names were never assigned.
if run_type not in dataset_files:
    raise ValueError(
        f"Unknown run_type {run_type!r}; expected one of {sorted(dataset_files)}"
    )
filename_train, filename_test = dataset_files[run_type]

# Candidate feature columns.
columns = ['place', 'catu', 'sexe', 'trajet', 'locp', 'actp', 'etatp', 'mois',
           'lum', 'agg', 'int', 'atm', 'col', 'dep', 'catr', 'circ', 'nbv', 'vosp',
           'prof', 'plan', 'surf', 'infra', 'situ', 'senc', 'catv', 'age_cls',
           'joursem']
# 'senc' is excluded from the feature set used in this notebook.
columns.remove('senc')

# NOTE(review): presumably (train, validation, final test) splits, judging by
# the names -- confirm against my_libs.lib_tools.get_train_valid_test_data.
X_train, y_train, X_test, y_test, X_test_final, y_test_final = pt.get_train_valid_test_data(
    filename_train, filename_test, columns
)

In [None]:
import pandas as pd
import time
from imblearn.over_sampling import SMOTEN

# Cache files for the resampled training set.
# NOTE(review): these names do not depend on run_type, so a cache written
# during a 'dev' run is silently reused by a 'prd' run (and vice versa) when
# run_smote is False -- verify the cache matches the current run_type.
x_train_cache = './pickles/X_train_cb_no_senc.pkl'
y_train_cache = './pickles/y_train_cb_no_senc.pkl'

if run_smote:
    # Seeded so that the oversampled training set is reproducible from one
    # run to the next (same seed value as the model's random_seed).
    sampler = SMOTEN(random_state=123)

    start_time = time.time()
    X_train, y_train = sampler.fit_resample(X_train, y_train)
    print(f"X_train : {X_train.shape} - y_train : {y_train.shape}")

    # Cast these two columns back to integers after resampling.
    X_train = X_train.astype({'actp': 'int', 'dep': 'int'})

    print(f"--- Smote applied in {time.time() - start_time} seconds ---")

    # Persist the resampled data so later runs can skip the resampling step.
    X_train.to_pickle(x_train_cache)
    y_train.to_pickle(y_train_cache)
else:
    # Reuse the resampled training set written by an earlier run_smote=True run.
    X_train = pd.read_pickle(x_train_cache)
    y_train = pd.read_pickle(y_train_cache)

In [None]:
from my_libs.model_evaluator import ModelEvaluator

# CatBoost settings; every column of X_train is declared as a categorical feature.
params = dict(
    iterations=150,
    learning_rate=0.0811,
    random_seed=123,
    cat_features=X_train.columns.tolist(),
)

# Evaluate on the X_test_final / y_test_final split and keep the model
# returned by the evaluator.
evaluator = ModelEvaluator(
    model_type='CatBoostClassifier',
    params=params,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test_final,
    y_test=y_test_final,
)
model = evaluator.evaluate()


In [None]:
import numpy as np
from sklearn.metrics import classification_report, plot_roc_curve, f1_score, precision_score, recall_score
# plot_roc_curve(model, X_train, y_train)

# Candidate decision thresholds for the positive class.
# NOTE(review): with a float step, whether np.arange includes the stop value
# depends on rounding; np.linspace would make the number of points explicit.
thresholds = np.arange(0.20, 0.80, 0.01)

# The predicted probabilities do not depend on the threshold, so compute them
# once instead of re-running the model on X_test for every threshold.
proba_positive = model.predict_proba(X_test)[:, 1]

# F1, precision and recall on (X_test, y_test), one entry per threshold.
scores_f1 = []
scores_prec = []
scores_recall = []
for threshold in thresholds:
    y_pred = proba_positive >= threshold
    scores_f1.append(f1_score(y_test, y_pred))
    scores_prec.append(precision_score(y_test, y_pred))
    scores_recall.append(recall_score(y_test, y_pred))

In [None]:
# Hard class labels on X_test using a 0.4 cut-off on the positive-class probability.
positive_proba = model.predict_proba(X_test)[:, 1]
y_pred = (positive_proba >= 0.4).astype(bool)

# Actual vs. predicted counts.
confusion = pd.crosstab(
    y_test,
    y_pred,
    rownames=['Classe réelle'],
    colnames=['Classe prédite'],
)
display(confusion)

# Per-class precision / recall / F1 at the same cut-off.
print("\nClassification report :", classification_report(y_test, y_pred), sep="\n")

In [None]:
# ROC curve of the fitted model on the X_test_final / y_test_final split.
# NOTE(review): plot_roc_curve was deprecated in scikit-learn 1.0 and removed
# in 1.2; RocCurveDisplay.from_estimator is its replacement -- migrate this
# call (and the corresponding import) when upgrading scikit-learn.
plot_roc_curve(estimator=model, X=X_test_final, y=y_test_final)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Two stacked panels: F1 alone on top, F1 / recall / precision together below.
fig, axs = plt.subplots(2, 1, figsize=(7, 10))
ax_f1, ax_metrics = axs

ax_f1.plot(thresholds, scores_f1, label='f1')
ax_f1.set_title("Valeur du f1-score en fonction du seuil de probabilité d'attribution des classes")
# grid(True) rather than a bare grid(): called without arguments, grid()
# toggles visibility, so repeated bare calls on the same axes only leave the
# grid on when they happen to be made an odd number of times.
ax_f1.grid(True)

ax_metrics.plot(thresholds, scores_f1, label='f1')
ax_metrics.plot(thresholds, scores_recall, label='recall')
ax_metrics.plot(thresholds, scores_prec, label='precision')
ax_metrics.set_title("Valeur des métriques en fonction du seuil de probabilité d'attribution des classes")
ax_metrics.grid(True)
# Attach the legend to the multi-curve panel explicitly instead of relying on
# pyplot's notion of the "current" axes.
ax_metrics.legend();

In [None]:
from shapash import SmartExplainer
import pickle
import shap
# Initialise SHAP's JavaScript support in the notebook.
shap.initjs()

# Explainer built around the fitted model. The optional SmartExplainer
# arguments (features_dict, preprocessing, postprocessing) are left unset.
xpl = SmartExplainer(model=model)

# Give the targets the same index as the features before compiling.
y_test_final.index = X_test_final.index

# y_pred is not supplied, so the explainer's default prediction is used;
# y_target lets the app show true values next to predicted ones.
xpl.compile(x=X_test_final, y_target=y_test_final)

# Launch the Shapash web app and keep a handle on it.
app = xpl.run_app()

In [None]:
# app.kill()