## Step 1: Import libraries

In [2]:
# Some auxiliary imports for the tutorial
import sys
import random
import numpy as np
from pprint import pprint
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import shap
import os

In [3]:
# Seed NumPy's global random number generator so repeated full runs of the
# notebook draw the same random numbers
np.random.seed(seed=123456)

In [4]:
# Main Contextual AI imports
import xai
from xai.explainer import ExplainerFactory

## Step 2: Train a model on a sample dataset

In [5]:
# Load the breast-cancer dataset and prepare training and test sets.
raw_data = datasets.load_breast_cancer()
X, y = raw_data['data'], raw_data['target']
# Hold out 20% of the rows for evaluation. An explicit random_state makes the
# split identical every time this cell runs, even when it is re-executed or
# run out of order relative to the global np.random.seed call.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123456
)

## Step 3: Train and evaluate the classifier

In [6]:
# Instantiate a classifier, train it, and evaluate it on the test set.
# A fixed random_state keeps the fitted forest (and therefore its score) stable
# across re-runs of this cell instead of depending on the global NumPy RNG state.
clf = RandomForestClassifier(random_state=123456)
clf.fit(X_train, y_train)
# Last expression of the cell: the score on the held-out data is shown as output
clf.score(X_test, y_test)

0.9649122807017544

In [7]:
# Inspect which fields the loaded dataset object exposes
raw_data.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])

## Step 4: Instantiate and build the explainer

In [9]:
# Ask the ExplainerFactory for an explainer for the tabular domain that uses
# the SHAP algorithm.
# NOTE(review): the concrete class returned is presumably SHAPTabularExplainer;
# confirm against the xai documentation.
explainer = ExplainerFactory.get_explainer(
    domain=xai.DOMAIN.TABULAR,
    algorithm=xai.ALG.SHAP,
)

In [10]:
# Build the explainer (note: the parameters differ from those of the LIME explainer)
explainer.build_explainer(
    training_data=X_train,                     # training split, used as the data the explainer is built on
    feature_names=raw_data['feature_names'],   # column names used to label each feature in the explanation
    predict_fn=clf.predict_proba,              # the trained classifier's class-probability function
)

Using 455 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


In [8]:
# Display the dataset's feature names (the same array passed to build_explainer as feature_names)
raw_data['feature_names']

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

## Step 5: Generate some explanations

In [11]:
# Class probabilities the model assigns to the first test sample; slicing
# (rather than indexing) keeps the 2-D, single-row shape predict_proba expects
clf.predict_proba(X_test[0:1])

array([[0., 1.]])

In [12]:
# Explain the model's prediction for the first test sample
exp = explainer.explain_instance(
    instance=X_test[0],
    num_features=10,
    num_samples=None,  # None -> the explainer reports using SHAP's default ("auto") sample count
)

# Pretty-print the nested explanation dictionary
pprint(exp)

SHAP default number of samples[auto]


  0%|          | 0/1 [00:00<?, ?it/s]

{0: {'explanation': [{'feature': 'area error = 12.26',
                      'score': -0.051542587386196326},
                     {'feature': 'worst radius = 10.62',
                      'score': -0.08044701524856596},
                     {'feature': 'worst perimeter = 66.53',
                      'score': -0.08421252061536297},
                     {'feature': 'worst area = 342.9',
                      'score': -0.08011734391966127},
                     {'feature': 'worst concave points = 0.0',
                      'score': -0.07554866469834468}],
     'prediction': 0.0},
 1: {'explanation': [{'feature': 'area error = 12.26',
                      'score': 0.05154258738619644},
                     {'feature': 'worst radius = 10.62',
                      'score': 0.08044701524856623},
                     {'feature': 'worst perimeter = 66.53',
                      'score': 0.08421252061536338},
                     {'feature': 'worst area = 342.9',
                      'scor