In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split

# --- Run configuration ------------------------------------------------------
quick = True             # work with the sampled ("light") data to reduce computing time
run_gridSearchCV = False # whether to run hyperparameter optimization with GridSearchCV()
run_optuna = True        # whether to run hyperparameter optimization with Optuna

# `quick` selects which pickled DataFrame is loaded.
filename = 'df-light.pkl' if quick else 'df-full.pkl'

# --- Load data --------------------------------------------------------------
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
df = pd.read_pickle(f'./{filename}')

# Features are every column except the first one; the target is the 'grav' column.
data = df.iloc[:, 1:]
target = df['grav']

# The positional slice above removes the target only when 'grav' is the first
# column. Fail loudly instead of silently training on the label itself.
assert 'grav' not in data.columns, (
    "Target leakage: 'grav' is still among the feature columns "
    "(df.iloc[:, 1:] assumes 'grav' is the first column of df)."
)

# --- Train / test split -----------------------------------------------------
# 80/20 split with a fixed seed so that re-running the cell gives the same partition.
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=222)

In [None]:
from catboost import CatBoostClassifier
import shap

# Set up SHAP's JavaScript support in the notebook (used by the interactive plots below).
shap.initjs()

# Every feature column is declared as categorical to CatBoost.
categorical_columns = list(X_train.columns)

# Small fixed-seed classifier: 100 boosting iterations, learning rate 0.1.
model = CatBoostClassifier(iterations=100, learning_rate=0.1, random_seed=123)
model.fit(
    X_train,
    y_train,
    cat_features=categorical_columns,
    verbose=True,
    plot=True,
)

In [None]:
from catboost import Pool

# Wrap the training data in a CatBoost Pool, declaring every column as
# categorical, exactly as was done when fitting the model.
train_pool = Pool(X_train, y_train, cat_features=list(X_train.columns))

# SHAP values of the fitted model, computed on the training pool.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(train_pool)

In [None]:
# Force plot explaining a single prediction: the first row of the training set.
first_row_shap = shap_values[0,:]
first_row_features = X_train.iloc[0,:]
shap.force_plot(explainer.expected_value, first_row_shap, first_row_features)

In [None]:
# Stacked force plot for the first 20 rows of the training set only
# (not the whole training set).
first_rows = slice(0, 20)
shap.force_plot(explainer.expected_value, shap_values[first_rows, :], X_train.iloc[first_rows, :])

In [None]:
# SHAP dependence plot showing the effect of a single feature ("catv") across the training set (call currently disabled):
# shap.dependence_plot("catv", shap_values, X_train)

In [None]:
# Summary plot of the SHAP values of every feature, over all training rows.
shap.summary_plot(shap_values, features=X_train)

In [None]:
# Sanity check on two training rows (positions 0 and 91): print the predicted
# probability next to the raw formula value for each of them.
# NOTE(review): index [1] of predict_proba is the second entry of the model's
# class list; labelling it "class 1" assumes a binary 0/1 target — confirm.
sample_positions = (0, 91)
test_objects = [X_train.iloc[pos:pos + 1] for pos in sample_positions]  # one-row DataFrames

for obj in test_objects:
    class1_probability = model.predict_proba(obj)[0][1]
    raw_formula_value = model.predict(obj, prediction_type='RawFormulaVal')[0]
    print('Probability of class 1 = {:.4f}'.format(class1_probability))
    print('Formula raw prediction = {:.4f}'.format(raw_formula_value))
    print('\n')