In [1]:
import shap
import warnings
import numpy as np
import packaging.version as pv

from pandas import DataFrame
from tqdm import tqdm
from lightgbm import LGBMRegressor
from onnxruntime import InferenceSession
from skl2onnx import to_onnx, update_registered_converter
from skl2onnx.common.shape_calculator import (
    calculate_linear_regressor_output_shapes,
)  # noqa
from onnxmltools import __version__ as oml_version
from onnxmltools.convert.lightgbm.operator_converters.LightGbm import (
    convert_lightgbm,
)  # noqa

from sklearn import datasets

  from .autonotebook import tqdm as notebook_tqdm


Fontes:
- Conversão de LightGBM para ONNX: https://onnx.ai/sklearn-onnx/auto_tutorial/plot_gexternal_lightgbm_reg.html

In [2]:
def skl2onnx_convert_lightgbm(scope, operator, container):
    """Converter hook that forwards the optional ``split`` option.

    Reads the options that ``scope`` holds for ``operator.raw_operator``,
    stores the value of ``"split"`` on ``operator.split`` (``None`` when the
    option is absent) and then delegates to ``convert_lightgbm``.

    A warning is emitted when ``"split"`` is present but the installed
    onnxmltools is older than 1.9.2; ``operator.split`` is still assigned in
    that case.
    """
    opts = scope.get_options(operator.raw_operator)
    has_split = "split" in opts

    # The version check only matters when the caller actually asked for "split".
    if has_split and pv.Version(oml_version) < pv.Version("1.9.2"):
        message = (
            "Option split was released in version 1.9.2 but %s is "
            "installed. It will be ignored." % oml_version
        )
        warnings.warn(message)

    operator.split = opts["split"] if has_split else None
    convert_lightgbm(scope, operator, container)


# Register `skl2onnx_convert_lightgbm` with skl2onnx for `LGBMRegressor`, so the
# later `to_onnx(reg, ...)` call can convert the LightGBM model.
# Argument roles below follow the sklearn-onnx LightGBM tutorial linked in the
# notes at the top of this notebook (model class, alias, shape calculator,
# converter) — confirm against the skl2onnx API reference if they are changed.
# `options={"split": None}` declares the "split" option that the converter
# reads from `scope.get_options(...)`.
update_registered_converter(
    LGBMRegressor,
    "LightGbmLGBMRegressor",
    calculate_linear_regressor_output_shapes,
    skl2onnx_convert_lightgbm,
    options={"split": None},
)

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin

class OnnxWrapper(BaseEstimator, TransformerMixin):
    """Expose an ONNX model through a ``predict`` method and as a callable.

    The wrapper lets tools that expect either an estimator-like object or a
    plain function (for example ``shap.Explainer``) run an ONNX model with
    onnxruntime on the CPU.

    Parameters
    ----------
    onnx_model :
        Object providing ``SerializeToString()`` (the value returned by
        ``to_onnx`` in this notebook).

    Notes
    -----
    The onnxruntime session is created lazily on the first prediction and
    reused afterwards. Explainers call the model many times, so rebuilding
    the session (and re-serialising the model) on every call is wasted work.
    """

    def __init__(self, onnx_model):
        # Only store the constructor argument here; the session is built on
        # first use so that the estimator stays cheap to construct and clone.
        self.onnx_model = onnx_model

    def _get_session(self):
        """Return an ``InferenceSession`` for the current ``onnx_model``.

        The session is cached on the instance and rebuilt only when
        ``onnx_model`` has been replaced by a different object.
        """
        session = getattr(self, "_session", None)
        built_from = getattr(self, "_session_model", None)
        if session is None or built_from is not self.onnx_model:
            session = InferenceSession(
                self.onnx_model.SerializeToString(),
                providers=["CPUExecutionProvider"],
            )
            self._session = session
            self._session_model = self.onnx_model
        return session

    def __getstate__(self):
        """Drop the cached session so pickling/deep-copying keeps working."""
        state = super().__getstate__()
        state.pop("_session", None)
        state.pop("_session_model", None)
        return state

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Run the model on ``X`` and return the first output flattened to 1-D.

        Parameters
        ----------
        X :
            Array-like input; it is converted to a ``float32`` numpy array
            before being fed to the session.

        Returns
        -------
        numpy.ndarray
            The first output of the ONNX session, passed through ``ravel()``.
        """
        session = self._get_session()
        # Use the model's own first input name instead of assuming "X".
        input_name = session.get_inputs()[0].name
        X32 = np.asarray(X, dtype=np.float32)
        return session.run(None, {input_name: X32})[0].ravel()

    def __call__(self, X: np.ndarray) -> np.ndarray:
        """Make the wrapper usable wherever a plain prediction function is expected."""
        return self.predict(X)

In [4]:
# Load the diabetes dataset directly as a (features, target) pair.
X_train, y_train = datasets.load_diabetes(return_X_y=True)

# Build and train the LightGBM regressor in one step; `fit` hands back the
# estimator, which is what ends up bound to `reg`.
reg = LGBMRegressor(n_estimators=1000).fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000076 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 682
[LightGBM] [Info] Number of data points in the train set: 442, number of used features: 10
[LightGBM] [Info] Start training from score 152.133484


In [6]:
# Convert the fitted LightGBM regressor to ONNX (this relies on the converter
# registered earlier in the notebook).
# The second argument is one training row cast to float32 — presumably used by
# skl2onnx as a sample to infer the input type/shape; confirm in the skl2onnx docs.
# `target_opset` pins the opset for the default domain ("") and for "ai.onnx.ml".
onnx_model = to_onnx(
    reg, X_train[:1].astype(np.float32), target_opset={"": 14, "ai.onnx.ml": 2}
)

In [8]:
# Pick the single record to explain; a length-one slice (instead of integer
# indexing) keeps the result two-dimensional, i.e. one row with all columns.
X_input = X_train[5:6]
X_input

array([[-0.09269548, -0.04464164, -0.04069594, -0.01944183, -0.06899065,
        -0.07928784,  0.04127682, -0.0763945 , -0.04117617, -0.09634616]])

In [9]:
%%time
# Reference explanation: TreeExplainer on the native LightGBM model, with the
# training data supplied as the second argument (background data — see the
# shap TreeExplainer documentation).
explainer = shap.TreeExplainer(reg, X_train)
# Calling the explainer on the selected record yields the explanation object
# displayed below (.values, .base_values, .data).
original_shap_values = explainer(X_input)
original_shap_values

CPU times: user 384 ms, sys: 3.78 ms, total: 388 ms
Wall time: 386 ms


.values =
array([[  7.06523254,  12.10109805, -24.39586175,  -9.85503942,
         10.12580085,  -5.87424749, -15.34937774,   5.02595222,
        -30.06721312,  -2.29335377]])

.base_values =
array([151.56347135])

.data =
array([[-0.09269548, -0.04464164, -0.04069594, -0.01944183, -0.06899065,
        -0.07928784,  0.04127682, -0.0763945 , -0.04117617, -0.09634616]])

In [11]:
%%time
# Explain the ONNX model: shap only needs something it can call, so the model
# is wrapped in OnnxWrapper and handed to the generic shap.Explainer.
wrapper_model = OnnxWrapper(onnx_model=onnx_model)

# Independent masker built from the training data; the seed is fixed so the
# run is repeatable.
masker = shap.maskers.Independent(X_train)
# NOTE: this rebinds `explainer`, replacing the TreeExplainer created in the
# previous cell.
explainer = shap.Explainer(wrapper_model, masker=masker, seed=42)
onnx_shap_values = explainer(X_input)
onnx_shap_values

CPU times: user 3.38 s, sys: 722 μs, total: 3.38 s
Wall time: 859 ms


.values =
array([[  7.0652272 ,  12.10109902, -24.39585419,  -9.8550451 ,
         10.12579065,  -5.8742469 , -15.34938626,   5.02594593,
        -30.06721906,  -2.29336548]])

.base_values =
array([151.56347141])

.data =
array([[-0.09269548, -0.04464164, -0.04069594, -0.01944183, -0.06899065,
        -0.07928784,  0.04127682, -0.0763945 , -0.04117617, -0.09634616]])

# Observações #

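O TreeExplainer não funciona com o wrapper nem com o modelo ONNX em si: como mostra a célula abaixo, ele rejeita o `ModelProto` com `InvalidModelError`.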

In [12]:
%%time
explainer = shap.TreeExplainer(onnx_model, X_train)
original_shap_values = explainer(X_input)
original_shap_values

InvalidModelError: Model type not yet supported by TreeExplainer: <class 'onnx.onnx_ml_pb2.ModelProto'>

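Se os dados de treino não forem passados ao TreeExplainer do modelo original, os valores SHAP ficam um pouco diferentes. Vale entender o motivo disso. Acredito que seja porque o base_value é calculado de maneira diferente: segundo a documentação do shap, sem dados de background o TreeExplainer usa `feature_perturbation="tree_path_dependent"` (estatísticas guardadas nas próprias árvores), enquanto com dados de background ele usa o modo `interventional`. Note que o base_value abaixo (152.13) coincide com o score inicial reportado pelo LightGBM no treino, e não com o 151.56 obtido com os dados de background.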

In [13]:
# Same native model, but this time no background data is passed to
# TreeExplainer, to compare against the earlier result.
# NOTE: this rebinds both `explainer` and `original_shap_values`, so the
# earlier values are no longer reachable under these names after this cell.
explainer = shap.TreeExplainer(reg)
original_shap_values = explainer(X_input)
original_shap_values

.values =
array([[  4.87720016,  11.12880686, -19.92972008,  -8.21268509,
          9.95869733,  -6.00093473, -13.27200151,   4.94822778,
        -34.24172816,  -3.34288258]])

.base_values =
array([152.13348416])

.data =
array([[-0.09269548, -0.04464164, -0.04069594, -0.01944183, -0.06899065,
        -0.07928784,  0.04127682, -0.0763945 , -0.04117617, -0.09634616]])