## Setup a classification experiment

In [1]:
import warnings
import pandas as pd
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split

df = pd.read_csv(
    "./data/cs-training-resampled.csv",
    header='infer')

df.head(10)

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,0,0.010493,79,0,0.005852,3075.0,7,0,0,0,0.0
1,0,0.88033,43,0,1.163256,2100.0,6,0,0,0,3.0
2,1,0.132325,76,0,0.150979,10166.0,8,0,2,1,0.0
3,1,1.10842,48,0,0.16522,2880.0,4,2,0,2,2.0
4,1,0.917729,55,2,1.130248,3500.0,13,0,1,1,1.0
5,0,1.0,61,0,0.181652,12600.0,4,0,3,0,1.0
6,0,0.348783,49,0,0.418033,8783.0,12,0,1,0,0.0
7,0,0.091503,27,0,0.119876,9000.0,9,0,0,0,0.0
8,1,0.989609,60,1,0.595626,2148.0,6,0,1,0,0.0
9,0,0.584367,48,0,0.386846,9000.0,14,0,2,0,2.0


In [2]:
X = df[df.columns[1:]]
y = df[df.columns[0]]

feature_names = list(X.columns)

seed = 1  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)

X_train.head(10)

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
8962,0.155966,43,0,0.655653,4236.0,13,0,1,0,2.0
9870,1.0,42,0,0.0,3660.0,0,0,0,0,1.0
14280,1.0,29,96,0.0,2800.0,0,96,0,96,2.0
12883,1.0,40,98,0.010939,3290.0,0,98,0,98,3.0
13670,0.208463,64,0,0.042128,2800.0,2,0,0,0,0.0
11492,1.122503,34,2,1.800499,400.0,3,1,0,2,1.0
9313,0.405316,29,0,1.086655,576.0,7,0,0,0,0.0
3765,0.72666,47,5,0.208055,9583.0,8,1,1,0,2.0
9305,0.968335,48,7,0.439157,3319.0,11,2,0,0,0.0
717,0.667666,46,0,1.762159,1500.0,8,0,1,0,0.0


In [3]:
y_train.head(10)

8962     0
9870     1
14280    1
12883    1
13670    0
11492    1
9313     0
3765     1
9305     1
717      0
Name: SeriousDlqin2yrs, dtype: int64

## Train a blackbox classification system

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

#Blackbox system can include preprocessing, not just a classifier!
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1)

blackbox_model = Pipeline([('rf', rf)])
blackbox_model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('rf',
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=100, n_jobs=-1,
                                        oob_score=False, random_state=None,
                                        verbose=0, warm_start=False))],
         verbose=False)

## Show blackbox model performance

In [5]:
from interpret import show
from interpret.perf import ROC

blackbox_perf = ROC(blackbox_model.predict_proba).explain_perf(X_test, y_test, name='Blackbox')
show(blackbox_perf)

## Local Explanations: How an individual prediction was made

In [6]:
from interpret.blackbox import LimeTabular
from interpret import show

#Blackbox explainers need a predict function, and optionally a dataset
lime = LimeTabular(predict_fn=blackbox_model.predict_proba, data=X_train, random_state=1)

#Pick the instances to explain, optionally pass in labels if you have them
lime_local = lime.explain_local(X_test[:10], y_test[:10], name='LIME')

show(lime_local)

In [7]:
from interpret.blackbox import ShapKernel
import numpy as np

background_val = np.median(X_train, axis=0).reshape(1, -1)
shap = ShapKernel(predict_fn=blackbox_model.predict_proba, data=background_val, feature_names=feature_names)
shap_local = shap.explain_local(X_test[:10], y_test[:10], name='SHAP')
show(shap_local)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




## Global Explanations: How the model behaves overall

In [8]:
from interpret.blackbox import MorrisSensitivity

sensitivity = MorrisSensitivity(predict_fn=blackbox_model.predict_proba, data=X_train)
sensitivity_global = sensitivity.explain_global(name="Global Sensitivity")

show(sensitivity_global)

In [9]:
from interpret.blackbox import PartialDependence

pdp = PartialDependence(predict_fn=blackbox_model.predict_proba, data=X_train)
pdp_global = pdp.explain_global(name='Partial Dependence')

show(pdp_global)

## Compare them all in the Dashboard

In [10]:
show([blackbox_perf, lime_local, shap_local, sensitivity_global, pdp_global])