In [0]:
#only necessary if shap is not installed yet:

!pip install shap

This notebook computes the SHAP values for each tree of a random forest separately. Comparing the average of these per-tree values with the SHAP values computed for the whole forest shows agreement up to floating-point precision.

In [0]:
# Libraries used throughout the notebook.
import numpy as np
import shap
from sklearn import datasets
from sklearn.ensemble import RandomForestRegressor
# Disabled imports (not needed by the cells below):
#import pandas as pd
#import matplotlib.pyplot as plt

# Seed NumPy's global random number generator.
np.random.seed(0)

# Diabetes data set, returned directly as a (features, target) pair.
diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True)

In [4]:
# Random forest of 100 trees; each split considers 2 candidate features.
rf = RandomForestRegressor(
  n_estimators=100,
  max_depth=50,
  max_features=2,
  random_state=0,
)
# Last expression of the cell: fit() hands back the estimator, so its repr is displayed.
rf.fit(diabetes_X, diabetes_y)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=50, max_features=2, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

Get the SHAP values for each individual tree:

In [0]:
# SHAP values of every individual tree of the fitted forest.
# Array layout: (n rows of diabetes_X, p columns of diabetes_X, one slice per tree).
n, p = diabetes_X.shape
shap_values_IndTrees = np.zeros((n, p, rf.n_estimators))
# enumerate() provides the slice index k, replacing a hand-incremented counter.
for k, tree in enumerate(rf.estimators_):
  # The predictions are not used below; the call is kept because the notebook
  # later discusses its effect on the message emitted by shap.TreeExplainer.
  tree_preds = tree.predict(diabetes_X)
  explainer = shap.TreeExplainer(tree)
  shap_values_IndTrees[:, :, k] = explainer.shap_values(diabetes_X)

Get the SHAP values for the forest:

In [6]:
# Explain the whole forest at once: build the explainer, then evaluate it on the data.
forest_explainer = shap.TreeExplainer(rf)
shap_values = forest_explainer.shap_values(diabetes_X)

Setting feature_perturbation = "tree_path_dependent" because no background data was given.


Compare the average of the per-tree SHAP values with the SHAP values of the forest:

In [0]:
# Average over the last axis (one slice per tree) of the per-tree SHAP array.
shap_averages = shap_values_IndTrees.mean(axis=2)

In [8]:
# Dimensions of the averaged SHAP array.
np.shape(shap_averages)

(442, 10)

In [9]:
# Preview: first five rows, first nine columns of the averaged per-tree values.
shap_averages[:5, :9]

array([[  2.44335946,  -3.00956769,  18.36632084,  -4.52236466,
         -6.68071779,  -0.09489346,   2.41334275,   2.42628273,
          9.55394142],
       [ -4.72060762,   2.63365264, -15.83532914,  -4.47230068,
         -2.49039149,  -2.38146477, -11.47550156,  -6.49119954,
        -20.93121356],
       [  3.47316568,  -3.15843022,  16.64062786, -11.95558607,
         -5.64077853,   0.4712413 ,   7.07770315,   3.18009745,
         -0.56640426],
       [  3.12227647,   5.00219994,  -7.25117706,  -4.69872581,
          4.55956987,   4.53644078,   8.28755635,  10.30790057,
         21.48217336],
       [ -2.49409811,   5.21729804, -20.13254819,   3.97585958,
          1.91013783,  -0.09446261,  -0.1879328 ,   3.60192769,
        -12.58971653]])

In [10]:
# Mean absolute element-wise difference between the forest's SHAP values
# and the average of the per-tree SHAP values.
np.abs(shap_values - shap_averages).mean()

2.5155671365092662e-14

In [11]:
# Preview: first five rows, first nine columns of the forest's SHAP values.
shap_values[:5, :9]

array([[  2.44335946,  -3.00956769,  18.36632084,  -4.52236466,
         -6.68071779,  -0.09489346,   2.41334275,   2.42628273,
          9.55394142],
       [ -4.72060762,   2.63365264, -15.83532914,  -4.47230068,
         -2.49039149,  -2.38146477, -11.47550156,  -6.49119954,
        -20.93121356],
       [  3.47316568,  -3.15843022,  16.64062786, -11.95558607,
         -5.64077853,   0.4712413 ,   7.07770315,   3.18009745,
         -0.56640426],
       [  3.12227647,   5.00219994,  -7.25117706,  -4.69872581,
          4.55956987,   4.53644078,   8.28755635,  10.30790057,
         21.48217336],
       [ -2.49409811,   5.21729804, -20.13254819,   3.97585958,
          1.91013783,  -0.09446261,  -0.1879328 ,   3.60192769,
        -12.58971653]])

The following observation is puzzling: if I explicitly call `tree.predict()` inside the loop, `shap.TreeExplainer(tree)` prints its message "*Setting feature_perturbation ...*" on every iteration of the for loop, which "proves" to me that it is being executed each time. When the `tree.predict()` call is commented out, however, the message is printed only once, and I am worried that `shap.TreeExplainer(tree)` is not really executed on every iteration.

In [14]:
# Minimal reproduction: construct a TreeExplainer for every tree of the forest.
# The explainer is not used further in this cell; only the emitted message is of interest.
for tree in rf.estimators_:
  # The tree.predict() call is intentionally disabled for this run:
  #tree_preds = tree.predict(diabetes_X)
  explainer = shap.TreeExplainer(tree)

Setting feature_perturbation = "tree_path_dependent" because no background data was given.
