In [None]:
%pylab inline

In [None]:
import dataiku
from dataiku import pandasutils as pdu
import pandas as pd
import numpy as np
import shap

# Parameters

You can get the `deployed_model_id` in the URL of the deployed model:

[DSS_URL]/projects/{project_key}/savedmodels/**{deployed_model_id}**/p/{full_id}

Example: `'8OBZXffh'`





In [None]:
# Identifier of the deployed (saved) model to explain; fill it in before running.
deployed_model_id = ''
# Maximum number of dataset rows to load; None to interpret all rows.
read_limit = 1000
# Passed as `max_display` to the SHAP summary plots.
max_variables_used_to_explain = 15

# Load deployed model

In [None]:
# Fail fast with an actionable message when the parameter was left at its empty default,
# instead of surfacing an obscure error when the model is first queried.
if not deployed_model_id:
    raise ValueError("deployed_model_id is empty: set it in the Parameters section before loading the model")

# Handle on the saved model, looked up in the project given by the 'projectKey' custom variable.
model = dataiku.Model(
    lookup=deployed_model_id,
    project_key=dataiku.get_custom_variables()['projectKey'],
)

## Show available versions

In [None]:
def show_versions(model, selected_idx=None):
    """Print a summary of the versions of a saved model.

    Args:
        model: Saved-model handle exposing ``get_name()``, ``list_versions()``,
            ``project_key`` and ``lookup``.
        selected_idx: Index of the single version to print. When ``None``
            (the default), every version is printed.

    Each version entry is read as a dict with the keys ``versionId``,
    ``active`` and ``snippet`` (the latter providing ``userMeta.name`` and
    ``fullModelId``).
    """
    model_name = model.get_name()
    print("{}\n{}\n".format(model_name, '=' * len(model_name)))

    # Read the entries from list_versions() rather than from the `model.versions`
    # attribute, so the function does not depend on an earlier call having
    # populated that attribute (e.g. when called first with `selected_idx` set).
    versions = model.list_versions()
    indices = range(len(versions)) if selected_idx is None else [selected_idx]

    for idx in indices:
        version = versions[idx]
        snippet = version['snippet']

        model_version_name = snippet['userMeta']['name']
        model_version_id = version['versionId']
        version_is_active = version['active']

        # `[DSS_URL]` is printed literally; the reader substitutes the instance URL.
        version_url = '[DSS_URL]/projects/{proj}/savedmodels/{lookup}/p/{full_id}/#summary'.format(
            proj=model.project_key,
            lookup=model.lookup,
            full_id=snippet['fullModelId'])

        print('INDEX:   {}\nActive:  {}\nProject: {}\nLookup:  {}\nName:    {}\nVersion: {}'.format(
            idx, version_is_active, model.project_key, model.lookup, model_version_name, model_version_id))
        print('URL:     {}\n'.format(version_url))
        
# No index given: print every version of the saved model.
show_versions(model)

To use a different version, replace the value of `active_version_idx` below with the `INDEX` of the version you want, as printed above.
By default, the `active` version of the saved model is selected.

In [None]:
# Index of the first version flagged as active in the saved model's version list.
active_version_idx = [
    position
    for position, candidate in enumerate(model.list_versions())
    if candidate['active']
][0]

In [None]:
# Print only the version that will be used in the rest of the notebook.
print("Selected version...\n")
show_versions(model, selected_idx=active_version_idx)

## Get predictor

In [None]:
# Predictor for the selected version, identified by its `versionId`.
predictor = model.get_predictor(
    version_id=model.versions[active_version_idx]['versionId'],
)

# Load and process the dataset

In [None]:
# NOTE(review): '__INPUT_DATASET_SMART_NAME__' looks like a template placeholder that is
# substituted with the real dataset name -- confirm before editing it by hand.
dku_dataset = dataiku.Dataset('__INPUT_DATASET_SMART_NAME__')
# Load at most `read_limit` rows as a DataFrame.
df = dku_dataset.get_dataframe(limit=read_limit)

In [None]:
# Apply the predictor's preprocessing to the loaded rows; element 0 of the result is used
# as the data and is labelled with the predictor's feature names.
df_processed = pd.DataFrame(
    data=predictor.preprocess(df)[0],
    columns=predictor.get_features(),
)

# Interpret

Create a tree explainer and compute the SHAP values.

In [None]:
# NOTE(review): `_clf` is a private attribute of the predictor, presumably the underlying
# trained estimator -- confirm it is a tree-based model, since a TreeExplainer is built on it.
tree_explainer = shap.TreeExplainer(predictor._clf)
# SHAP values computed on the preprocessed features.
shap_values = tree_explainer.shap_values(df_processed)

## Global interpretation

In [None]:
# Normalise the SHAP output to a list so that a single array and a per-class list of
# arrays are handled by the same loop.
shap_values_list = shap_values if isinstance(shap_values, list) else [shap_values]

# A missing (None) or empty class list is treated as a regression model.
_model_classes = predictor.get_classes()
classes = _model_classes if _model_classes is not None and len(_model_classes) else None

for idx, values in enumerate(shap_values_list):
    if classes is None:                    # Regression
        print('Explaining regression\n{}'.format('='*21))
        _current_class = []
    else:                                  # Classification
        if len(classes) == 2 and len(shap_values_list) == 1:
            # Binary model with a single SHAP array: labelled with the second class
            # (presumably the positive class -- TODO confirm).
            _current_class = classes[1]
        else:
            # One SHAP array per class: multi-class models, and binary models whose
            # explainer returns one array per class.
            _current_class = classes[idx]
        # Underline sized from the formatted title, so non-string class labels work too.
        _title = 'Explaining class: {}'.format(_current_class)
        print('{}\n{}'.format(_title, '='*len(_title)))
    
    shap.summary_plot(values, df_processed, max_display=max_variables_used_to_explain, plot_type="bar", class_names=_current_class)
    shap.summary_plot(values, df_processed, max_display=max_variables_used_to_explain, plot_type="dot", class_names=_current_class)
    #shap.summary_plot(values, df_processed, max_display=max_variables_used_to_explain, plot_type="violin", class_names=_current_class)


## Sample interpretation

Interpret sample `0`. Change `sample_idx` in the cell below to interpret a different row.

In [None]:
sample_idx = 0

# Load the SHAP JavaScript library into the notebook; force plots need it to render.
shap.initjs()

# One expected (base) value per SHAP array. np.atleast_1d accepts a scalar, a list or a
# numpy array, whereas an isinstance(..., list) check mishandles the numpy-array case.
expected_values = np.atleast_1d(tree_explainer.expected_value)

# Only classification outputs are mapped through the logit link; regression outputs are
# displayed on their own scale.
_link = "identity" if classes is None else "logit"

sample_plot = []
for idx, values in enumerate(shap_values_list):
    if classes is None:                    # Regression
        print('INDEX {} = Regression explanation'.format(idx))
        _current_class = []
    else:                                  # Classification
        if len(classes) == 2 and len(shap_values_list) == 1:
            # Binary model with a single SHAP array: labelled with the second class
            # (presumably the positive class -- TODO confirm).
            _current_class = classes[1]
        else:
            # One SHAP array per class: multi-class models, and binary models whose
            # explainer returns one array per class.
            _current_class = classes[idx]
        print('INDEX {} = {}'.format(idx, _current_class))
    
    # Force plot for the chosen row: its SHAP values against its preprocessed feature values.
    sample_plot.append(shap.force_plot(expected_values[idx], values[sample_idx,:], df_processed.iloc[sample_idx,:], link=_link))

print('\n\nExecute sample_plot[INDEX] to see result')

In [None]:
# Display the force plot stored at INDEX 0; use another INDEX from the listing printed above.
sample_plot[0]