In [11]:
import pandas as pd

quick = True             # work with sampled data to reduce computing time
run_gridSearchCV = False # run or not hyperparameters optimization with GridSearchCV()
run_optuna = True        # run or not hyperparameters optimization with Optuna

filename = 'df-light.pkl' if quick else 'df-full.pkl'

df = pd.read_pickle(f'./{filename}')
data = df.iloc[:, 1:]
target = df['grav']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=222)

In [12]:
from catboost import CatBoostClassifier
import shap
shap.initjs()

model = CatBoostClassifier(iterations=100, learning_rate=0.1, random_seed=123, cat_features=list(X_train.columns))
model.fit(X_train, y_train, verbose=True, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6439660	total: 148ms	remaining: 14.6s
1:	learn: 0.6065889	total: 265ms	remaining: 13s
2:	learn: 0.5747479	total: 396ms	remaining: 12.8s
3:	learn: 0.5475979	total: 526ms	remaining: 12.6s
4:	learn: 0.5306069	total: 594ms	remaining: 11.3s
5:	learn: 0.5125277	total: 698ms	remaining: 10.9s
6:	learn: 0.4992046	total: 795ms	remaining: 10.6s
7:	learn: 0.4890783	total: 885ms	remaining: 10.2s
8:	learn: 0.4809162	total: 982ms	remaining: 9.93s
9:	learn: 0.4735064	total: 1.08s	remaining: 9.71s
10:	learn: 0.4665910	total: 1.17s	remaining: 9.44s
11:	learn: 0.4611538	total: 1.26s	remaining: 9.22s
12:	learn: 0.4568157	total: 1.36s	remaining: 9.08s
13:	learn: 0.4535445	total: 1.44s	remaining: 8.86s
14:	learn: 0.4500882	total: 1.53s	remaining: 8.7s
15:	learn: 0.4451768	total: 1.62s	remaining: 8.49s
16:	learn: 0.4414710	total: 1.7s	remaining: 8.28s
17:	learn: 0.4386444	total: 1.77s	remaining: 8.08s
18:	learn: 0.4360188	total: 1.86s	remaining: 7.92s
19:	learn: 0.4341003	total: 1.95s	remaining: 

<catboost.core.CatBoostClassifier at 0x1aa45e3e3d0>

In [13]:
import time
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
from imblearn.metrics import classification_report_imbalanced

shap.initjs()

if True:
    start_time = time.time()

    model = CatBoostClassifier(cat_features=list(X_train.columns))

    params = {
        'iterations' : [100, 200],
        'learning_rate' : [0.1, 0.5]
    }

    grid = GridSearchCV(estimator=model, param_grid=params, cv = 3, verbose=10, scoring="f1")


    grid.fit(X_train, y_train)

    print('Best score  : ', grid.best_score_)
    print('Best params : ', grid.best_params_)

    y_pred = grid.predict(X_test)
    print(classification_report_imbalanced(y_test, y_pred))

    print(f"model  : {model}")
    print(f"params : {params}")
    print("--- Optimization with GridSearchCV performed in %s seconds ---" % (time.time() - start_time))

    feats = {}
    for feature, importance in zip(X_train.columns, grid.best_estimator_.feature_importances_):
        feats[feature] = importance

    importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-importance'})

    # 8 variables les plus importantes
    importances.sort_values(by='Gini-importance', ascending=False).head(20)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV 1/3; 1/4] START iterations=100, learning_rate=0.01..........................
0:	learn: 0.6879045	total: 125ms	remaining: 12.3s
1:	learn: 0.6829798	total: 218ms	remaining: 10.7s
2:	learn: 0.6779545	total: 333ms	remaining: 10.8s
3:	learn: 0.6727990	total: 429ms	remaining: 10.3s
4:	learn: 0.6677469	total: 512ms	remaining: 9.73s
5:	learn: 0.6630801	total: 611ms	remaining: 9.57s
6:	learn: 0.6585883	total: 702ms	remaining: 9.32s
7:	learn: 0.6541355	total: 771ms	remaining: 8.87s
8:	learn: 0.6497975	total: 854ms	remaining: 8.63s
9:	learn: 0.6453209	total: 937ms	remaining: 8.43s
10:	learn: 0.6414543	total: 1.01s	remaining: 8.21s
11:	learn: 0.6374165	total: 1.11s	remaining: 8.14s
12:	learn: 0.6332440	total: 1.21s	remaining: 8.08s
13:	learn: 0.6293920	total: 1.3s	remaining: 8.02s
14:	learn: 0.6258803	total: 1.39s	remaining: 7.89s
15:	learn: 0.6222061	total: 1.5s	remaining: 7.89s
16:	learn: 0.6186091	total: 1.57s	remaining: 7.68s
17:	

In [14]:
import time
import optuna
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

if run_optuna:

    start_time = time.time()

    # 1. Define an objective function to be maximized.
    def objective(trial):

        # 2. Suggest values for the hyperparameters using a trial object.
        classifier_name = trial.suggest_categorical('classifier', ['CatBoostClassifier'])
        if classifier_name == 'CatBoostClassifier':
            dt_iterations    = trial.suggest_int('iterations', 100, 200)
            dt_learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1)
            classifier_obj = CatBoostClassifier(iterations=dt_iterations, learning_rate=dt_learning_rate, cat_features=list(X_train.columns))

            score = cross_val_score(classifier_obj, X_train, y_train, cv=3, scoring="f1", verbose=1)
            accuracy = score.mean()

        return accuracy

    # 3. Create a study object and optimize the objective function.
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=10)

    print("--- Optimization with Optuna performed in %s seconds ---" % (time.time() - start_time))

    fig = optuna.visualization.plot_param_importances(study)
    fig.show()

[32m[I 2023-02-17 10:08:36,139][0m A new study created in memory with name: no-name-ac16a0d7-5e18-4ab5-b2ba-3db7a58c6fbe[0m
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0:	learn: 0.6774184	total: 7.79s	remaining: 13m 53s
1:	learn: 0.6633496	total: 8.19s	remaining: 7m 14s
2:	learn: 0.6490210	total: 8.29s	remaining: 4m 50s
3:	learn: 0.6364045	total: 8.39s	remaining: 3m 38s
4:	learn: 0.6245986	total: 8.49s	remaining: 2m 54s
5:	learn: 0.6130130	total: 8.59s	remaining: 2m 26s
6:	learn: 0.6028802	total: 8.67s	remaining: 2m 5s
7:	learn: 0.5919433	total: 8.78s	remaining: 1m 49s
8:	learn: 0.5829650	total: 8.9s	remaining: 1m 37s
9:	learn: 0.5750328	total: 8.99s	remaining: 1m 28s
10:	learn: 0.5671221	total: 9.09s	remaining: 1m 20s
11:	learn: 0.5595223	total: 9.18s	remaining: 1m 13s
12:	learn: 0.5518143	total: 9.27s	remaining: 1m 7s
13:	learn: 0.5449359	total: 9.37s	remaining: 1m 2s
14:	learn: 0.5386183	total: 9.46s	remaining: 58.6s
15:	learn: 0.5326726	total: 9.54s	remaining: 54.9s
16:	learn: 0.5265198	total: 9.63s	remaining: 51.6s
17:	learn: 0.5214536	total: 9.73s	remaining: 48.7s
18:	learn: 0.5167583	total: 9.82s	remaining: 46s
19:	learn: 0.5123642	total: 9.91

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   51.1s finished
[32m[I 2023-02-17 10:09:27,319][0m Trial 0 finished with value: 0.3862953910138946 and parameters: {'classifier': 'CatBoostClassifier', 'iterations': 108, 'learning_rate': 0.03042234373843996}. Best is trial 0 with value: 0.3862953910138946.[0m
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0:	learn: 0.6780092	total: 77.2ms	remaining: 12.1s
1:	learn: 0.6644303	total: 140ms	remaining: 10.9s
2:	learn: 0.6511778	total: 203ms	remaining: 10.5s
3:	learn: 0.6381573	total: 272ms	remaining: 10.5s
4:	learn: 0.6259944	total: 347ms	remaining: 10.6s
5:	learn: 0.6150152	total: 431ms	remaining: 10.9s
6:	learn: 0.6044423	total: 509ms	remaining: 11s
7:	learn: 0.5940791	total: 584ms	remaining: 11s
8:	learn: 0.5848390	total: 667ms	remaining: 11s
9:	learn: 0.5766232	total: 739ms	remaining: 10.9s
10:	learn: 0.5686004	total: 831ms	remaining: 11.1s
11:	learn: 0.5605570	total: 924ms	remaining: 11.2s
12:	learn: 0.5531063	total: 1.01s	remaining: 11.3s
13:	learn: 0.5468861	total: 1.11s	remaining: 11.4s
14:	learn: 0.5408889	total: 1.2s	remaining: 11.5s
15:	learn: 0.5340935	total: 1.29s	remaining: 11.5s
16:	learn: 0.5290209	total: 1.37s	remaining: 11.4s
17:	learn: 0.5241807	total: 1.44s	remaining: 11.2s
18:	learn: 0.5190805	total: 1.53s	remaining: 11.2s
19:	learn: 0.5145066	total: 1.61s	remaining: 11

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   45.8s finished
[32m[I 2023-02-17 10:10:13,268][0m Trial 1 finished with value: 0.4129966820999483 and parameters: {'classifier': 'CatBoostClassifier', 'iterations': 158, 'learning_rate': 0.02925619689930871}. Best is trial 1 with value: 0.4129966820999483.[0m
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0:	learn: 0.6800367	total: 129ms	remaining: 20.8s
1:	learn: 0.6681627	total: 219ms	remaining: 17.5s
2:	learn: 0.6564862	total: 327ms	remaining: 17.3s
3:	learn: 0.6448894	total: 420ms	remaining: 16.6s
4:	learn: 0.6339456	total: 527ms	remaining: 16.5s
5:	learn: 0.6239786	total: 635ms	remaining: 16.5s
6:	learn: 0.6142714	total: 740ms	remaining: 16.4s
7:	learn: 0.6046991	total: 833ms	remaining: 16s
8:	learn: 0.5960611	total: 931ms	remaining: 15.8s
9:	learn: 0.5883138	total: 1.01s	remaining: 15.3s
10:	learn: 0.5809002	total: 1.09s	remaining: 15s
11:	learn: 0.5731510	total: 1.19s	remaining: 14.9s
12:	learn: 0.5663799	total: 1.31s	remaining: 15s
13:	learn: 0.5596923	total: 1.41s	remaining: 14.9s
14:	learn: 0.5529461	total: 1.51s	remaining: 14.8s
15:	learn: 0.5477924	total: 1.59s	remaining: 14.5s
16:	learn: 0.5419274	total: 1.69s	remaining: 14.4s
17:	learn: 0.5376139	total: 1.76s	remaining: 14.1s
18:	learn: 0.5325379	total: 1.85s	remaining: 13.9s
19:	learn: 0.5280411	total: 1.94s	remaining: 13

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   48.5s finished
[32m[I 2023-02-17 10:11:01,844][0m Trial 2 finished with value: 0.40350495327442865 and parameters: {'classifier': 'CatBoostClassifier', 'iterations': 162, 'learning_rate': 0.025268571509935674}. Best is trial 1 with value: 0.4129966820999483.[0m
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0:	learn: 0.6649711	total: 4.97s	remaining: 10m 1s
1:	learn: 0.6412785	total: 5.5s	remaining: 5m 29s
2:	learn: 0.6183570	total: 5.75s	remaining: 3m 47s
3:	learn: 0.5984431	total: 5.93s	remaining: 2m 54s
4:	learn: 0.5818970	total: 6.05s	remaining: 2m 21s
5:	learn: 0.5669276	total: 6.18s	remaining: 1m 59s
6:	learn: 0.5533910	total: 6.29s	remaining: 1m 43s
7:	learn: 0.5407766	total: 6.41s	remaining: 1m 31s
8:	learn: 0.5299729	total: 6.53s	remaining: 1m 21s
9:	learn: 0.5201804	total: 6.63s	remaining: 1m 14s
10:	learn: 0.5113755	total: 6.74s	remaining: 1m 8s
11:	learn: 0.5040605	total: 6.85s	remaining: 1m 2s
12:	learn: 0.4983990	total: 6.95s	remaining: 58.3s
13:	learn: 0.4925796	total: 7.05s	remaining: 54.4s
14:	learn: 0.4876326	total: 7.16s	remaining: 51.1s
15:	learn: 0.4831307	total: 7.26s	remaining: 48.1s
16:	learn: 0.4786805	total: 7.37s	remaining: 45.5s
17:	learn: 0.4744224	total: 7.46s	remaining: 43.1s
18:	learn: 0.4707623	total: 7.55s	remaining: 41s
19:	learn: 0.4667157	total: 7.65s	

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   57.0s finished
[32m[I 2023-02-17 10:11:58,947][0m Trial 3 finished with value: 0.4346656247301099 and parameters: {'classifier': 'CatBoostClassifier', 'iterations': 122, 'learning_rate': 0.05545088379864137}. Best is trial 3 with value: 0.4346656247301099.[0m
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0:	learn: 0.6669174	total: 122ms	remaining: 23.1s
1:	learn: 0.6446410	total: 201ms	remaining: 19s
2:	learn: 0.6230070	total: 287ms	remaining: 18s
3:	learn: 0.6050258	total: 372ms	remaining: 17.4s
4:	learn: 0.5890453	total: 465ms	remaining: 17.3s
5:	learn: 0.5740678	total: 555ms	remaining: 17.1s
6:	learn: 0.5604667	total: 650ms	remaining: 17.1s
7:	learn: 0.5478009	total: 732ms	remaining: 16.8s
8:	learn: 0.5370236	total: 816ms	remaining: 16.5s
9:	learn: 0.5271934	total: 896ms	remaining: 16.2s
10:	learn: 0.5195417	total: 968ms	remaining: 15.8s
11:	learn: 0.5124045	total: 1.03s	remaining: 15.4s
12:	learn: 0.5062227	total: 1.11s	remaining: 15.2s
13:	learn: 0.5000822	total: 1.19s	remaining: 15s
14:	learn: 0.4946689	total: 1.27s	remaining: 14.9s
15:	learn: 0.4901392	total: 1.35s	remaining: 14.8s
16:	learn: 0.4852349	total: 1.44s	remaining: 14.7s
17:	learn: 0.4804605	total: 1.52s	remaining: 14.6s
18:	learn: 0.4761590	total: 1.6s	remaining: 14.5s
19:	learn: 0.4730243	total: 1.68s	remaining: 14.

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   55.7s finished
[32m[I 2023-02-17 10:12:54,759][0m Trial 4 finished with value: 0.44578206675781945 and parameters: {'classifier': 'CatBoostClassifier', 'iterations': 191, 'learning_rate': 0.05147763460734666}. Best is trial 4 with value: 0.44578206675781945.[0m
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0:	learn: 0.6838532	total: 96.5ms	remaining: 17.2s
1:	learn: 0.6752838	total: 180ms	remaining: 15.9s
2:	learn: 0.6666949	total: 312ms	remaining: 18.3s
3:	learn: 0.6580356	total: 441ms	remaining: 19.3s
4:	learn: 0.6497117	total: 549ms	remaining: 19.1s
5:	learn: 0.6421837	total: 675ms	remaining: 19.5s
6:	learn: 0.6350623	total: 819ms	remaining: 20.1s
7:	learn: 0.6281441	total: 946ms	remaining: 20.2s
8:	learn: 0.6214874	total: 1.13s	remaining: 21.4s
9:	learn: 0.6152430	total: 1.25s	remaining: 21.1s
10:	learn: 0.6089026	total: 1.36s	remaining: 20.7s
11:	learn: 0.6030361	total: 1.44s	remaining: 20s
12:	learn: 0.5970233	total: 1.53s	remaining: 19.6s
13:	learn: 0.5907804	total: 1.64s	remaining: 19.3s
14:	learn: 0.5850039	total: 1.73s	remaining: 18.9s
15:	learn: 0.5796254	total: 1.83s	remaining: 18.7s
16:	learn: 0.5744694	total: 1.93s	remaining: 18.4s
17:	learn: 0.5699067	total: 2.02s	remaining: 18.1s
18:	learn: 0.5653113	total: 2.11s	remaining: 17.8s
19:	learn: 0.5603968	total: 2.21s	remainin

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   53.1s finished
[32m[I 2023-02-17 10:13:47,936][0m Trial 5 finished with value: 0.38282053495515395 and parameters: {'classifier': 'CatBoostClassifier', 'iterations': 179, 'learning_rate': 0.017821906966143577}. Best is trial 4 with value: 0.44578206675781945.[0m
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0:	learn: 0.6748986	total: 105ms	remaining: 19.9s
1:	learn: 0.6587732	total: 197ms	remaining: 18.6s
2:	learn: 0.6432902	total: 309ms	remaining: 19.3s
3:	learn: 0.6282611	total: 404ms	remaining: 18.9s
4:	learn: 0.6144398	total: 519ms	remaining: 19.3s
5:	learn: 0.6024504	total: 625ms	remaining: 19.3s
6:	learn: 0.5907181	total: 729ms	remaining: 19.2s
7:	learn: 0.5793841	total: 826ms	remaining: 18.9s
8:	learn: 0.5694989	total: 917ms	remaining: 18.5s
9:	learn: 0.5607924	total: 996ms	remaining: 18s
10:	learn: 0.5526721	total: 1.09s	remaining: 17.8s
11:	learn: 0.5445582	total: 1.19s	remaining: 17.8s
12:	learn: 0.5375025	total: 1.3s	remaining: 17.8s
13:	learn: 0.5306731	total: 1.4s	remaining: 17.7s
14:	learn: 0.5252100	total: 1.49s	remaining: 17.5s
15:	learn: 0.5200833	total: 1.6s	remaining: 17.6s
16:	learn: 0.5149972	total: 1.72s	remaining: 17.6s
17:	learn: 0.5109178	total: 1.79s	remaining: 17.2s
18:	learn: 0.5055152	total: 1.9s	remaining: 17.2s
19:	learn: 0.5015393	total: 2.07s	remaining: 17

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   55.9s finished
[32m[I 2023-02-17 10:14:43,963][0m Trial 6 finished with value: 0.43512658785899694 and parameters: {'classifier': 'CatBoostClassifier', 'iterations': 191, 'learning_rate': 0.03541775185906862}. Best is trial 4 with value: 0.44578206675781945.[0m
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0:	learn: 0.6853027	total: 105ms	remaining: 12.6s
1:	learn: 0.6780212	total: 172ms	remaining: 10.3s
2:	learn: 0.6706887	total: 256ms	remaining: 10.1s
3:	learn: 0.6632365	total: 329ms	remaining: 9.61s
4:	learn: 0.6560225	total: 407ms	remaining: 9.44s
5:	learn: 0.6494605	total: 477ms	remaining: 9.14s
6:	learn: 0.6432027	total: 548ms	remaining: 8.92s
7:	learn: 0.6370787	total: 607ms	remaining: 8.58s
8:	learn: 0.6308324	total: 673ms	remaining: 8.38s
9:	learn: 0.6248651	total: 746ms	remaining: 8.28s
10:	learn: 0.6193745	total: 822ms	remaining: 8.22s
11:	learn: 0.6137809	total: 891ms	remaining: 8.09s
12:	learn: 0.6083926	total: 963ms	remaining: 8s
13:	learn: 0.6035032	total: 1.03s	remaining: 7.88s
14:	learn: 0.5988876	total: 1.11s	remaining: 7.84s
15:	learn: 0.5940841	total: 1.19s	remaining: 7.83s
16:	learn: 0.5891829	total: 1.28s	remaining: 7.81s
17:	learn: 0.5845844	total: 1.36s	remaining: 7.78s
18:	learn: 0.5801532	total: 1.44s	remaining: 7.71s
19:	learn: 0.5757723	total: 1.52s	remaining:

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   33.2s finished
[32m[I 2023-02-17 10:15:17,280][0m Trial 7 finished with value: 0.33262305099111716 and parameters: {'classifier': 'CatBoostClassifier', 'iterations': 121, 'learning_rate': 0.015013690202062911}. Best is trial 4 with value: 0.44578206675781945.[0m
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0:	learn: 0.6535066	total: 112ms	remaining: 16.6s
1:	learn: 0.6221389	total: 183ms	remaining: 13.4s
2:	learn: 0.5934279	total: 269ms	remaining: 13.1s
3:	learn: 0.5693660	total: 345ms	remaining: 12.5s
4:	learn: 0.5493582	total: 426ms	remaining: 12.3s
5:	learn: 0.5331454	total: 509ms	remaining: 12.1s
6:	learn: 0.5181883	total: 600ms	remaining: 12.2s
7:	learn: 0.5088831	total: 674ms	remaining: 11.9s
8:	learn: 0.4993976	total: 745ms	remaining: 11.6s
9:	learn: 0.4907854	total: 819ms	remaining: 11.4s
10:	learn: 0.4838918	total: 899ms	remaining: 11.3s
11:	learn: 0.4773836	total: 978ms	remaining: 11.2s
12:	learn: 0.4725316	total: 1.05s	remaining: 11s
13:	learn: 0.4682581	total: 1.13s	remaining: 10.9s
14:	learn: 0.4642811	total: 1.2s	remaining: 10.7s
15:	learn: 0.4599661	total: 1.26s	remaining: 10.5s
16:	learn: 0.4557137	total: 1.34s	remaining: 10.4s
17:	learn: 0.4532390	total: 1.42s	remaining: 10.3s
18:	learn: 0.4505801	total: 1.49s	remaining: 10.2s
19:	learn: 0.4485984	total: 1.56s	remaining:

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   41.5s finished
[32m[I 2023-02-17 10:15:58,902][0m Trial 8 finished with value: 0.45061691904405365 and parameters: {'classifier': 'CatBoostClassifier', 'iterations': 149, 'learning_rate': 0.07933512937492866}. Best is trial 8 with value: 0.45061691904405365.[0m
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0:	learn: 0.6559036	total: 117ms	remaining: 15.3s
1:	learn: 0.6260459	total: 216ms	remaining: 13.9s
2:	learn: 0.5985634	total: 328ms	remaining: 14s
3:	learn: 0.5756740	total: 465ms	remaining: 14.7s
4:	learn: 0.5562154	total: 616ms	remaining: 15.5s
5:	learn: 0.5403988	total: 751ms	remaining: 15.6s
6:	learn: 0.5278824	total: 827ms	remaining: 14.6s
7:	learn: 0.5157335	total: 941ms	remaining: 14.5s
8:	learn: 0.5065471	total: 1.03s	remaining: 14s
9:	learn: 0.4985124	total: 1.11s	remaining: 13.5s
10:	learn: 0.4911333	total: 1.2s	remaining: 13.1s
11:	learn: 0.4835220	total: 1.29s	remaining: 12.8s
12:	learn: 0.4771424	total: 1.38s	remaining: 12.5s
13:	learn: 0.4716920	total: 1.47s	remaining: 12.3s
14:	learn: 0.4673224	total: 1.56s	remaining: 12.1s
15:	learn: 0.4639411	total: 1.65s	remaining: 11.8s
16:	learn: 0.4598250	total: 1.72s	remaining: 11.6s
17:	learn: 0.4565777	total: 1.81s	remaining: 11.3s
18:	learn: 0.4545656	total: 1.89s	remaining: 11.1s
19:	learn: 0.4523697	total: 1.98s	remaining: 1

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   38.7s finished
[32m[I 2023-02-17 10:16:37,747][0m Trial 9 finished with value: 0.4459149628521702 and parameters: {'classifier': 'CatBoostClassifier', 'iterations': 131, 'learning_rate': 0.07427113928232822}. Best is trial 8 with value: 0.45061691904405365.[0m


--- Optimization with Optuna performed in 481.60798382759094 seconds ---


In [18]:
model = CatBoostClassifier(iterations=131, learning_rate=0.74, cat_features=list(X_train.columns))

model.fit(X_train, y_train)

# print('Best score  : ', grid.best_score_)
# print('Best params : ', grid.best_params_)

y_pred = model.predict(X_test)
print(classification_report_imbalanced(y_test, y_pred))

print(f"model  : {model}")
# print(f"params : {params}")
# print("--- Optimization with GridSearchCV performed in %s seconds ---" % (time.time() - start_time))

# feats = {}
# for feature, importance in zip(X_train.columns, grid.best_estimator_.feature_importances_):
#     feats[feature] = importance
#
# importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-importance'})

# # 8 variables les plus importantes
# importances.sort_values(by='Gini-importance', ascending=False).head(20)

0:	learn: 0.4865895	total: 151ms	remaining: 19.6s
1:	learn: 0.4476654	total: 376ms	remaining: 24.2s
2:	learn: 0.4316433	total: 576ms	remaining: 24.6s
3:	learn: 0.4238667	total: 781ms	remaining: 24.8s
4:	learn: 0.4203675	total: 1.03s	remaining: 25.9s
5:	learn: 0.4172850	total: 1.18s	remaining: 24.6s
6:	learn: 0.4155140	total: 1.33s	remaining: 23.6s
7:	learn: 0.4137883	total: 1.48s	remaining: 22.8s
8:	learn: 0.4125356	total: 1.63s	remaining: 22.1s
9:	learn: 0.4115785	total: 1.77s	remaining: 21.5s
10:	learn: 0.4109725	total: 1.9s	remaining: 20.8s
11:	learn: 0.4101182	total: 2.03s	remaining: 20.2s
12:	learn: 0.4095141	total: 2.16s	remaining: 19.6s
13:	learn: 0.4090497	total: 2.29s	remaining: 19.2s
14:	learn: 0.4086276	total: 2.43s	remaining: 18.8s
15:	learn: 0.4073673	total: 2.59s	remaining: 18.6s
16:	learn: 0.4069760	total: 2.79s	remaining: 18.7s
17:	learn: 0.4065173	total: 3.1s	remaining: 19.4s
18:	learn: 0.4055986	total: 3.28s	remaining: 19.3s
19:	learn: 0.4046619	total: 3.52s	remaining

In [None]:
from catboost import Pool



explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(Pool(X_train, y_train, cat_features=list(X_train.columns)))

In [None]:
# visualize the first prediction's explanation
shap.force_plot(explainer.expected_value, shap_values[0,:], X_train.iloc[0,:])

In [None]:
# visualize the training set predictions
shap.force_plot(explainer.expected_value, shap_values[0:20, :], X_train.iloc[0:20, :])

In [None]:
# create a SHAP dependence plot to show the effect of a single feature across the whole dataset
# shap.dependence_plot("catv", shap_values, X_train)

In [None]:
# summarize the effects of all the features
shap.summary_plot(shap_values, X_train)

In [None]:
test_objects = [X_train.iloc[0:1], X_train.iloc[91:92]]

for obj in test_objects:
    print('Probability of class 1 = {:.4f}'.format(model.predict_proba(obj)[0][1]))
    print('Formula raw prediction = {:.4f}'.format(model.predict(obj, prediction_type='RawFormulaVal')[0]))
    print('\n')