In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
#Import Tree Models from scratch functions
import sys
import os

# Make the sibling TreeModelsFromScratch package importable from this notebook.
module_path = os.path.abspath(os.path.join('..'))
package_path = os.path.join(module_path, "TreeModelsFromScratch")
# Guard on the directory that is actually appended. Checking the parent
# directory instead adds a duplicate sys.path entry on every re-run of this
# cell and skips the append entirely if the parent is already on sys.path.
if package_path not in sys.path:
    sys.path.append(package_path)

from DecisionTree import DecisionTree
from RandomForest import RandomForest

In [3]:
import numpy as np
import pandas as pd
import scipy
import shap
import sklearn
import graphviz

# Regression

## Example of loading a custom tree model into SHAP
https://shap.readthedocs.io/en/latest/example_notebooks/tabular_examples/tree_based_models/Example%20of%20loading%20a%20custom%20tree%20model%20into%20SHAP.html

In [None]:
# NOTE(review): this call triggers scikit-learn's `load_boston` deprecation
# warning (deprecated in 1.0, removal announced for 1.2), so the cell will stop
# working on newer versions. The warning itself suggests the California housing
# dataset as a replacement.
# X and y are rebound to the breast-cancer data in the Classification section,
# so re-run this cell before re-running any of the regression cells.
X,y = shap.datasets.boston()

# Shallow sklearn reference tree whose arrays are fed to SHAP by hand below.
orig_model = sklearn.tree.DecisionTreeRegressor(max_depth=2, random_state=42)
orig_model.fit(X, y)

In [None]:
dot_data = sklearn.tree.export_graphviz(orig_model, node_ids=True, out_file=None, filled=True, rounded=True, special_characters=True)
graph = graphviz.Source(dot_data)
graph

In [None]:
# Pull the flat array representation out of the fitted sklearn tree
sk_tree = orig_model.tree_
children_left, children_right = sk_tree.children_left, sk_tree.children_right
children_default = children_right.copy()  # sklearn does not use missing values, so reuse the right child
features, thresholds = sk_tree.feature, sk_tree.threshold
values = sk_tree.value.reshape(sk_tree.value.shape[0], 1)
node_sample_weight = sk_tree.weighted_n_node_samples

# Show every array; negative child ids mean the node is a leaf
for label, array in (
    ("     children_left", children_left),
    ("    children_right", children_right),
    ("  children_default", children_default),
    ("          features", features),
    ("        thresholds", thresholds.round(3)),
    ("            values", values.round(3)),
    ("node_sample_weight", node_sample_weight),
):
    print(label, array)

In [None]:
# Bundle the arrays into the dictionary layout passed to shap.TreeExplainer;
# the model is an ensemble holding this single tree.
tree_dict = dict(
    children_left=children_left,
    children_right=children_right,
    children_default=children_default,
    features=features,
    thresholds=thresholds,
    values=values,
    node_sample_weight=node_sample_weight,
)
model = dict(trees=[tree_dict])

In [None]:
explainer = shap.TreeExplainer(model)

In [None]:
# Make sure that the ingested SHAP model (a TreeEnsemble object) makes the
# same predictions as the original model
assert np.abs(explainer.model.predict(X) - orig_model.predict(X)).max() < 1e-4

In [None]:
# make sure the SHAP values sum up to the model output (this is the local accuracy property)
assert np.abs(explainer.expected_value + explainer.shap_values(X).sum(1) - orig_model.predict(X)).max() < 1e-4

## Transfer to `TreeModelsFromScratch`

In [None]:
# Train DecisionTree from scratch
reg_tree = DecisionTree(max_depth=2, treetype="regression",random_state=42)
reg_tree.fit(X, y)

In [None]:
reg_tree.node_id_dict

In [None]:
# Plan for converting the from-scratch tree into SHAP's array format:
# - children_left / children_right: child node ids, with -1 where a node has no child (leaf)
# - features: replace None in leaf nodes with -2
# - values: 2-D array of shape (no_nodes, 1)

In [None]:
chi_left, chi_right = [], []

# Walk all nodes once and record the id of each child; -1 stands for "no child" (leaf)
for node in reg_tree.node_list:
    chi_left.append(-1 if node.left is None else node.left.id)
    chi_right.append(-1 if node.right is None else node.right.id)

chi_left, chi_right = np.array(chi_left), np.array(chi_right)
# The default branch simply mirrors the right child
chi_default = chi_right.copy()

In [None]:
print(chi_left)
print(children_left)

In [None]:
print(chi_default)
print(children_default)

In [None]:
# Leaf nodes carry no split feature (None); encode them as -2
feats = np.array([node.feature if node.feature is not None else -2 for node in reg_tree.node_list])

In [None]:
print(feats)
print(features)

In [None]:
# Leaf nodes carry no split threshold (None); encode them as -2
thres = np.array([node.threshold if node.threshold is not None else -2 for node in reg_tree.node_list])

In [None]:
print(thres)
print(thresholds)

In [None]:
# values in array of arrays of shape (no_nodes, 1)
vals = np.array([node.value for node in reg_tree.node_list])
vals = vals.reshape(vals.shape[0], 1)

In [None]:
vals

In [None]:
samples = np.array([float(node.samples) for node in reg_tree.node_list])

In [None]:
samples

In [None]:
node_sample_weight

Let's put it in a function:

In [None]:
def export_tree_for_SHAP(tree):
    """Convert a fitted from-scratch tree into the dict format accepted by ``shap.TreeExplainer``.

    Parameters
    ----------
    tree : object
        Fitted tree exposing ``node_list`` (iterable of nodes) and ``treetype``
        (``"regression"`` or ``"classification"``). Each node must provide
        ``id``, ``left``, ``right``, ``feature``, ``threshold`` and ``samples``,
        plus ``value`` (regression) or ``clf_prob_dis`` (classification).

    Returns
    -------
    dict
        ``{"trees": [tree_dict]}`` where ``tree_dict`` holds the numpy arrays
        ``children_left``, ``children_right``, ``children_default``,
        ``features``, ``thresholds``, ``values`` (shape ``(no_nodes, 1)``) and
        ``node_sample_weight``.

    Raises
    ------
    ValueError
        If ``tree.treetype`` is neither ``"regression"`` nor ``"classification"``.
    """
    nodes = tree.node_list

    # Child ids per node; -1 marks a missing child (leaf).
    # NOTE(review): presumably node.id equals the node's position in node_list,
    # since the ids are exported as-is -- confirm against DecisionTree.
    children_left = np.array([node.left.id if node.left is not None else -1 for node in nodes])
    children_right = np.array([node.right.id if node.right is not None else -1 for node in nodes])
    # The default branch mirrors the right child
    children_default = children_right.copy()

    # Leaves have no split feature / threshold (None); encode both as -2
    features = np.array([node.feature if node.feature is not None else -2 for node in nodes])
    thresholds = np.array([node.threshold if node.threshold is not None else -2 for node in nodes])

    # One scalar per node
    if tree.treetype == "regression":
        values = np.array([node.value for node in nodes])
    elif tree.treetype == "classification":
        # Only entry 1 of the node's class distribution is exported
        values = np.array([node.clf_prob_dis[1] for node in nodes])
    else:
        raise ValueError(
            "Unsupported treetype %r; expected 'regression' or 'classification'" % (tree.treetype,)
        )
    # Reshape using the array built above. The previous code read the
    # notebook-level global `vals`, which does not exist on a fresh kernel and
    # has the wrong length for any other tree.
    values = values.reshape(values.shape[0], 1)

    # Number of samples per node, as floats
    samples = np.array([float(node.samples) for node in nodes])

    # Custom tree model: an ensemble containing this single tree
    tree_dict = {
        "children_left": children_left,
        "children_right": children_right,
        "children_default": children_default,
        "features": features,
        "thresholds": thresholds,
        "values": values,
        "node_sample_weight": samples
    }
    model = {
        "trees": [tree_dict]
    }

    return model

In [None]:
export_model = export_tree_for_SHAP(reg_tree)

In [None]:
# test export model
explainer = shap.TreeExplainer(export_model)

In [None]:
# Make sure that the ingested SHAP model (a TreeEnsemble object) makes the
# same predictions as the original model
assert np.abs(explainer.model.predict(X) - reg_tree.predict(X)).max() < 1e-4

In [None]:
# make sure the SHAP values sum up to the model output (this is the local accuracy property)
assert np.abs(explainer.expected_value + explainer.shap_values(X).sum(1) - reg_tree.predict(X)).max() < 1e-4

In [None]:
def verify_shap_model(reg_tree, explainer, data=None):
    """Check that a SHAP explainer faithfully reproduces a regression tree.

    Parameters
    ----------
    reg_tree : object
        Fitted tree exposing ``predict(data)``.
    explainer : object
        SHAP explainer exposing ``model.predict``, ``expected_value`` and
        ``shap_values``.
    data : array-like, optional
        Feature matrix to verify on. When omitted, the notebook-level ``X`` is
        used (the original behaviour). Pass it explicitly to avoid depending on
        whichever dataset ``X`` currently points to.

    Raises
    ------
    AssertionError
        If either maximum absolute deviation is not below ``1e-4``.
    """
    if data is None:
        # Backward compatible fallback to the notebook global
        data = X

    # Computed once and reused by both checks
    tree_pred = reg_tree.predict(data)

    # The ingested SHAP model (a TreeEnsemble object) must make the same predictions as the tree
    prediction_gap = np.abs(explainer.model.predict(data) - tree_pred).max()
    assert prediction_gap < 1e-4, "SHAP model predictions deviate from the tree (max abs diff %g)" % prediction_gap

    # Local accuracy: expected value plus the SHAP values of a row must sum to its prediction
    shap_total = explainer.expected_value + explainer.shap_values(data).sum(1)
    additivity_gap = np.abs(shap_total - tree_pred).max()
    assert additivity_gap < 1e-4, "SHAP values do not sum to the tree output (max abs diff %g)" % additivity_gap

In [None]:
verify_shap_model(reg_tree, explainer)

# Classification

## Example of loading a custom tree model into SHAP
https://shap.readthedocs.io/en/latest/example_notebooks/tabular_examples/tree_based_models/Example%20of%20loading%20a%20custom%20tree%20model%20into%20SHAP.html

In [None]:
# NOTE: this rebinds X and y (previously the regression data) to the breast-cancer data.
X,y = sklearn.datasets.load_breast_cancer(return_X_y=True, as_frame=True)

# NOTE(review): a DecisionTreeRegressor is fitted on the class labels although
# the variable is named *_clf -- presumably so that every node stores a single
# value comparable to a class-1 probability; confirm this is intended.
# random_state is fixed, as for the regression baseline, so that ties between
# equally good splits are resolved the same way on every run.
orig_model_clf = sklearn.tree.DecisionTreeRegressor(max_depth=2, random_state=42)
orig_model_clf.fit(X, y)

In [None]:
dot_data = sklearn.tree.export_graphviz(orig_model_clf, feature_names=X.columns, node_ids=True, out_file=None, filled=True, rounded=True, special_characters=True)
graph = graphviz.Source(dot_data)
graph

In [None]:
# Pull the flat array representation out of the fitted sklearn tree
sk_tree_clf = orig_model_clf.tree_
children_left, children_right = sk_tree_clf.children_left, sk_tree_clf.children_right
children_default = children_right.copy()  # sklearn does not use missing values, so reuse the right child
features, thresholds = sk_tree_clf.feature, sk_tree_clf.threshold
# One row per node, one column per entry of n_classes[0]
values = sk_tree_clf.value.reshape(sk_tree_clf.value.shape[0], sk_tree_clf.n_classes[0])
node_sample_weight = sk_tree_clf.weighted_n_node_samples

# Show every array; negative child ids mean the node is a leaf
for label, array in (
    ("     children_left", children_left),
    ("    children_right", children_right),
    ("  children_default", children_default),
    ("          features", features),
    ("        thresholds", thresholds.round(3)),
    ("            values", values.round(3)),
    ("node_sample_weight", node_sample_weight),
):
    print(label, array)

In [None]:
# Bundle the arrays into the dictionary layout passed to shap.TreeExplainer;
# the model is an ensemble holding this single tree.
tree_dict = dict(
    children_left=children_left,
    children_right=children_right,
    children_default=children_default,
    features=features,
    thresholds=thresholds,
    values=values,
    node_sample_weight=node_sample_weight,
)
model = dict(trees=[tree_dict])

In [None]:
explainer = shap.TreeExplainer(model, model_output="raw")

In [None]:
# Make sure that the ingested SHAP model (a TreeEnsemble object) makes the
# same predictions as the original model
assert np.abs(explainer.model.predict(X) - orig_model_clf.predict(X)).max() < 1e-4

In [None]:
# make sure the SHAP values sum up to the model output (this is the local accuracy property)
assert np.abs(explainer.expected_value + explainer.shap_values(X).sum(1) - orig_model_clf.predict(X)).max() < 1e-4

## Transfer to `TreeModelsFromScratch`

In [None]:
# Train DecisionTree from scratch
clf_tree = DecisionTree(max_depth=2, treetype="classification", random_state=42)
clf_tree.fit(X, y)

In [None]:
clf_tree.node_id_dict

In [None]:
dot_data = sklearn.tree.export_graphviz(orig_model_clf, feature_names=X.columns, node_ids=True, out_file=None, filled=True, rounded=True, special_characters=True)
graph = graphviz.Source(dot_data)
graph

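Only `values` differs for classification: each node exports entry 1 of its class probability distribution (`clf_prob_dis[1]`) instead of the regression value.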

In [None]:
def export_tree_for_SHAP(tree):
    """Convert a fitted from-scratch tree into the dict format accepted by ``shap.TreeExplainer``.

    Parameters
    ----------
    tree : object
        Fitted tree exposing ``node_list`` (iterable of nodes) and ``treetype``
        (``"regression"`` or ``"classification"``). Each node must provide
        ``id``, ``left``, ``right``, ``feature``, ``threshold`` and ``samples``,
        plus ``value`` (regression) or ``clf_prob_dis`` (classification).

    Returns
    -------
    dict
        ``{"trees": [tree_dict]}`` where ``tree_dict`` holds the numpy arrays
        ``children_left``, ``children_right``, ``children_default``,
        ``features``, ``thresholds``, ``values`` (shape ``(no_nodes, 1)``) and
        ``node_sample_weight``.

    Raises
    ------
    ValueError
        If ``tree.treetype`` is neither ``"regression"`` nor ``"classification"``.
        Previously this case surfaced as an UnboundLocalError on ``values``.
    """
    # How to read the exported scalar off a node, per tree type
    value_getters = {
        "regression": lambda node: node.value,
        # Only entry 1 of the node's class distribution is exported
        "classification": lambda node: node.clf_prob_dis[1],
    }
    if tree.treetype not in value_getters:
        raise ValueError(
            "Unsupported treetype %r; expected 'regression' or 'classification'" % (tree.treetype,)
        )
    get_value = value_getters[tree.treetype]

    # Children: collect the child ids of every node, -1 where there is none (leaf)
    children_left = []
    children_right = []
    for node in tree.node_list:
        children_left.append(-1 if node.left is None else node.left.id)
        children_right.append(-1 if node.right is None else node.right.id)

    children_left = np.array(children_left)
    children_right = np.array(children_right)
    # The default branch mirrors the right child
    children_default = children_right.copy()

    # Features / thresholds: None in leaf nodes is encoded as -2
    features = np.array([-2 if node.feature is None else node.feature for node in tree.node_list])
    thresholds = np.array([-2 if node.threshold is None else node.threshold for node in tree.node_list])

    # Values as a 2-D array of shape (no_nodes, 1)
    values = np.array([get_value(node) for node in tree.node_list])
    values = values.reshape(values.shape[0], 1)

    # Number of samples per node, as floats
    samples = np.array([float(node.samples) for node in tree.node_list])

    # Custom tree model: an ensemble containing this single tree
    tree_dict = {
        "children_left": children_left,
        "children_right": children_right,
        "children_default": children_default,
        "features": features,
        "thresholds": thresholds,
        "values": values,
        "node_sample_weight": samples
    }
    model = {
        "trees": [tree_dict]
    }

    return model

In [None]:
export_model_clf = export_tree_for_SHAP(clf_tree)

In [None]:
# test export model
explainer_clf = shap.TreeExplainer(export_model_clf, model_output="raw")

In [None]:
def verify_shap_model(tree, explainer, data=None):
    """Check that a SHAP explainer faithfully reproduces a classification tree.

    Parameters
    ----------
    tree : object
        Fitted tree exposing ``predict_proba(data)``.
        NOTE(review): assumes ``predict_proba`` returns an array with the same
        shape as ``explainer.model.predict(data)`` -- confirm against DecisionTree.
    explainer : object
        SHAP explainer exposing ``model.predict``, ``expected_value`` and
        ``shap_values``.
    data : array-like, optional
        Feature matrix to verify on. When omitted, the notebook-level ``X`` is
        used (the original behaviour). Pass it explicitly to avoid depending on
        whichever dataset ``X`` currently points to.

    Raises
    ------
    AssertionError
        If either maximum absolute deviation is not below ``1e-4``.
    """
    if data is None:
        # Backward compatible fallback to the notebook global
        data = X

    # Computed once and reused by both checks
    tree_proba = tree.predict_proba(data)

    # The ingested SHAP model (a TreeEnsemble object) must make the same predictions as the tree
    prediction_gap = np.abs(explainer.model.predict(data) - tree_proba).max()
    assert prediction_gap < 1e-4, "SHAP model predictions deviate from the tree (max abs diff %g)" % prediction_gap

    # Local accuracy: expected value plus the SHAP values of a row must sum to its prediction
    shap_total = explainer.expected_value + explainer.shap_values(data).sum(1)
    additivity_gap = np.abs(shap_total - tree_proba).max()
    assert additivity_gap < 1e-4, "SHAP values do not sum to the tree output (max abs diff %g)" % additivity_gap

In [None]:
verify_shap_model(clf_tree, explainer_clf)

# Test functionality of implemented class functions in package

## Regression

In [5]:
X,y = shap.datasets.boston()

Function load_boston is deprecated; `load_boston` is deprecated in 1.0 and will be removed in 1.2.

    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np

        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    datase

In [6]:
# Train DecisionTree from scratch
reg_tree = DecisionTree(max_depth=2, treetype="regression",random_state=42)
reg_tree.fit(X, y)

In [7]:
export_model = reg_tree.export_tree_for_SHAP()

In [8]:
# Create explainer
explainer = shap.TreeExplainer(export_model)

In [9]:
reg_tree.verify_shap_model(explainer, X)

works :) 

## Classification

In [10]:
X_cancer,y_cancer = sklearn.datasets.load_breast_cancer(return_X_y=True, as_frame=True)

In [11]:
# Train DecisionTree from scratch
clf_tree = DecisionTree(max_depth=2, treetype="classification", random_state=42)
clf_tree.fit(X_cancer,y_cancer)

In [12]:
export_model_clf = clf_tree.export_tree_for_SHAP()

In [13]:
# Create explainer
explainer_clf = shap.TreeExplainer(export_model_clf)

In [15]:
clf_tree.verify_shap_model(explainer_clf, X_cancer)

Also works :) 