## Explain a Model

In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Load the diabetes dataset from the local data folder
print("Loading Data...")
data = pd.read_csv('data/diabetes.csv')

# Separate features and labels
features = ['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']
labels = ['not-diabetic', 'diabetic']
X, y = data[features].values, data['Diabetic'].values

# Split data into training set (70%) and test set (30%); the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train a decision tree model.
# random_state is fixed because scikit-learn permutes features at each split,
# so an unseeded tree can give different metrics on every run.
print('Training a decision tree model')
model = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)

# Calculate accuracy as the fraction of test predictions that match the labels
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)

# Calculate AUC from the second probability column
# (presumably the 'diabetic' class, i.e. Diabetic == 1 -- confirm against model.classes_)
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test, y_scores[:,1])
print('AUC: ' + str(auc))

## Get an explainer for the model
Use a Tabular Explainer, which is a "black box" explainer that can be used to explain many kinds of models by invoking an appropriate SHAP model explainer.

In [2]:
# TabularExplainer lives in the interpret.ext.blackbox module
# (provided by the interpret-community package, which must be installed in the kernel's environment)
from interpret.ext.blackbox import TabularExplainer

# Build an explainer around the trained model, using the training data as background.
# "features" and "classes" are optional; they attach readable names to the explanation output.
tab_explainer = TabularExplainer(model,
                             X_train,
                             features=features,
                             classes=labels)

ModuleNotFoundError: No module named 'interpret'

## Rank Feature Importance

In [3]:
# Compute a global explanation; either the training data or the test data can be passed here
global_tab_explanation = tab_explainer.explain_global(X_train)

# Fetch the feature -> importance mapping and print one line per feature
global_tab_feature_importance = global_tab_explanation.get_feature_importance_dict()
for name in global_tab_feature_importance:
    print(name, ":", global_tab_feature_importance[name])

NameError: name 'tab_explainer' is not defined

## Adding explainability to Azure ML Models
The procedure includes connecting to the workspace, training a model, and getting its feature importance.

In [4]:
import os, shutil
from azureml.core import Experiment

# Folder that holds the files for the experiment; created if it does not exist yet
experiment_folder = 'diabetes_train_and_explain'
os.makedirs(experiment_folder, exist_ok=True)

# Copy the data file into the experiment folder
data_copy_path = os.path.join(experiment_folder, "diabetes.csv")
shutil.copy('data/diabetes.csv', data_copy_path)

Failure while loading azureml_run_type_providers. Failed to load entrypoint automl = azureml.train.automl.run:AutoMLRun._from_run_dto with exception unknown locale: UTF-8.


FileNotFoundError: [Errno 2] No such file or directory: 'data/diabetes.csv'

In [5]:
%%writefile $experiment_folder/diabetes_training.py
# Import libraries
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Import Azure ML run library
from azureml.core.run import Run

# Import libraries for model explanation
from azureml.contrib.interpret.explanation.explanation_client import ExplanationClient
from interpret.ext.blackbox import TabularExplainer

# Get the experiment run context so metrics and explanations can be attached to this run
run = Run.get_context()

# Load the diabetes dataset (read from the script's working directory)
print("Loading Data...")
data = pd.read_csv('diabetes.csv')

features = ['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']
labels = ['not-diabetic', 'diabetic']

# Separate features and labels
X, y = data[features].values, data['Diabetic'].values

# Split data into training set (70%) and test set (30%); the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train a decision tree model.
# random_state is fixed so repeated runs of the experiment log comparable metrics.
print('Training a decision tree model')
model = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)

# Calculate accuracy.
# The built-in float() is used because the np.float alias was removed in NumPy 1.24.
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
run.log('Accuracy', float(acc))

# Calculate AUC from the second probability column
# (presumably the 'diabetic' class -- confirm against model.classes_)
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test, y_scores[:,1])
run.log('AUC', float(auc))

# Save the trained model.
# Note: files saved in the outputs folder are automatically uploaded into the experiment record.
os.makedirs('outputs', exist_ok=True)
joblib.dump(value=model, filename='outputs/diabetes.pkl')

# Build a global explanation of the model, evaluated on the test set
explainer = TabularExplainer(model, X_train, features=features, classes=labels)
explanation = explainer.explain_global(X_test)

# Get an Explanation Client and upload the explanation to the run
explain_client = ExplanationClient.from_run(run)
explain_client.upload_model_explanation(explanation, comment='Tabular Explanation')

# Complete the run
run.complete()

Writing diabetes_train_and_explain/diabetes_training.py


In [None]:
from azureml.train.estimator import Estimator
from azureml.core import Environment, Workspace
from azureml.core.conda_dependencies import CondaDependencies
from azureml.widgets import RunDetails

# Connect to the workspace. No earlier cell defines `ws`, so load it here from the
# saved workspace configuration (a config.json file -- assumed to be present; confirm for your setup).
ws = Workspace.from_config()

# Create a Python environment for the experiment
env = Environment('diabetes-interpret-env')
env.python.user_managed_dependencies = False # Let Azure ML manage dependencies
env.docker.enabled = True # Use a docker container

# Create a set of package dependencies (including the azureml-contrib-interpret package)
packages = CondaDependencies.create(conda_packages=['scikit-learn','pandas'],
                                    pip_packages=['azureml-defaults','azureml-interpret','azureml-contrib-interpret'])

# Add the dependencies to the environment
env.python.conda_dependencies = packages

# Create an estimator that runs the training script from the experiment folder
estimator = Estimator(source_directory=experiment_folder,
              compute_target = 'local', # Use local compute
              environment_definition = env,
              entry_script='diabetes_training.py')

# Run the experiment, show the run widget, and block until the run finishes
experiment = Experiment(workspace = ws, name = 'diabetes_train_and_explain')
run = experiment.submit(config=estimator)
RunDetails(run).show()
run.wait_for_completion()

## Retrieve Feature Importance Values
You can also view the feature importance in Azure ML studio: tabular_explanation → global importance → summary importance → local feature importance.

In [None]:
from azureml.contrib.interpret.explanation.explanation_client import ExplanationClient

# Download the model explanation attached to the run and extract the feature -> importance mapping
client = ExplanationClient.from_run(run)
engineered_explanations = client.download_model_explanation()
feature_importances = engineered_explanations.get_feature_importance_dict()

# Print a header row, then one row per feature with its overall importance
print('Feature\tImportance')
for name in feature_importances:
    print(name, '\t', feature_importances[name])