## Set up a classification experiment

In [4]:
import warnings
import pandas as pd
# Silences every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# `load_boston` is not used by the cells shown here and was removed in
# scikit-learn 1.2, where this import raises ImportError. Guard it so the cell
# still runs on current releases while keeping the name on older ones.
try:
    from sklearn.datasets import load_boston
except ImportError:
    load_boston = None
from sklearn.model_selection import train_test_split

# Read the resampled training CSV; the path is relative to the working directory.
df = pd.read_csv(
    "./cs-training-resampled.csv",
    header='infer')

df.head(10)

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,0,0.010493,79,0,0.005852,3075.0,7,0,0,0,0.0
1,0,0.88033,43,0,1.163256,2100.0,6,0,0,0,3.0
2,1,0.132325,76,0,0.150979,10166.0,8,0,2,1,0.0
3,1,1.10842,48,0,0.16522,2880.0,4,2,0,2,2.0
4,1,0.917729,55,2,1.130248,3500.0,13,0,1,1,1.0
5,0,1.0,61,0,0.181652,12600.0,4,0,3,0,1.0
6,0,0.348783,49,0,0.418033,8783.0,12,0,1,0,0.0
7,0,0.091503,27,0,0.119876,9000.0,9,0,0,0,0.0
8,1,0.989609,60,1,0.595626,2148.0,6,0,1,0,0.0
9,0,0.584367,48,0,0.386846,9000.0,14,0,2,0,2.0


In [5]:
# Column 0 is used as the label; every remaining column is a feature.
y = df.iloc[:, 0]
X = df.iloc[:, 1:]

feature_names = X.columns.tolist()

# Fixed seed so the 75/25 train/test partition is repeatable.
seed = 1
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=seed,
)

X_train.head(10)

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
8962,0.155966,43,0,0.655653,4236.0,13,0,1,0,2.0
9870,1.0,42,0,0.0,3660.0,0,0,0,0,1.0
14280,1.0,29,96,0.0,2800.0,0,96,0,96,2.0
12883,1.0,40,98,0.010939,3290.0,0,98,0,98,3.0
13670,0.208463,64,0,0.042128,2800.0,2,0,0,0,0.0
11492,1.122503,34,2,1.800499,400.0,3,1,0,2,1.0
9313,0.405316,29,0,1.086655,576.0,7,0,0,0,0.0
3765,0.72666,47,5,0.208055,9583.0,8,1,1,0,2.0
9305,0.968335,48,7,0.439157,3319.0,11,2,0,0,0.0
717,0.667666,46,0,1.762159,1500.0,8,0,1,0,0.0


In [6]:
# First ten training labels, in the same row order as X_train.
y_train.iloc[:10]

8962     0
9870     1
14280    1
12883    1
13670    0
11492    1
9313     0
3765     1
9305     1
717      0
Name: SeriousDlqin2yrs, dtype: int64

## Explore the dataset

In [7]:
from interpret import show
from interpret.data import ClassHistogram

# Build a data explanation of the training features against the label and render it.
hist = ClassHistogram().explain_data(
    X_train,
    y_train,
    name='Train Data',
)
show(hist)

## Train the Explainable Boosting Machine (EBM)

In [8]:
from interpret.glassbox import (
    ExplainableBoostingClassifier,
    LogisticRegression,
    ClassificationTree,
    DecisionListClassifier,
)

# Seeded so repeated runs give the same model.
# fit() works on DataFrames and NumPy arrays alike.
ebm = ExplainableBoostingClassifier(random_state=seed)
ebm.fit(X_train, y_train)

ExplainableBoostingClassifier(binning_strategy='uniform', data_n_episodes=2000,
                              early_stopping_run_length=50,
                              early_stopping_tolerance=1e-05,
                              feature_names=['RevolvingUtilizationOfUnsecuredLines',
                                             'age',
                                             'NumberOfTime30-59DaysPastDueNotWorse',
                                             'DebtRatio', 'MonthlyIncome',
                                             'NumberOfOpenCreditLinesAndLoans',
                                             'NumberOfTimes90DaysLate',
                                             'NumberRealEstateLoansOrL...
                              feature_types=['continuous', 'continuous',
                                             'continuous', 'continuous',
                                             'continuous', 'continuous',
                                             'continuous

## Global Explanations: What the model learned overall

In [9]:
# Build the global explanation of the fitted EBM (labelled 'EBM') and render it.
ebm_global = ebm.explain_global(name='EBM')
show(ebm_global)

## Local Explanations: How an individual prediction was made

In [10]:
# Explain the first five held-out rows individually, alongside their true labels.
ebm_local = ebm.explain_local(
    X_test.head(5),
    y_test.head(5),
    name='EBM',
)
show(ebm_local)

## Evaluate EBM performance

In [11]:
from interpret.perf import ROC

# ROC explanation built from the EBM's predicted probabilities on the held-out split.
ebm_perf = ROC(ebm.predict_proba).explain_perf(
    X_test,
    y_test,
    name='EBM',
)
show(ebm_perf)

## Let's test out a few other Explainable Models

In [12]:
from interpret.glassbox import LogisticRegression, ClassificationTree

# We have to transform categorical variables to use Logistic Regression and Decision Tree
X_enc = pd.get_dummies(X, prefix_sep='.')
feature_names = list(X_enc.columns)

# Keep test_size and random_state identical to the first split (0.25, seed).
# This cell reassigns y_train / y_test, so a different test_size would leave them
# out of step with X_train / X_test and would score these models on different
# hold-out rows than the EBM.
X_train_enc, X_test_enc, y_train, y_test = train_test_split(
    X_enc, y, test_size=0.25, random_state=seed
)

# An L1 penalty needs a solver that supports it. scikit-learn's default 'lbfgs'
# (since 0.22) rejects penalty='l1', so name 'liblinear' explicitly; it was the
# default in older releases, so results there are unchanged.
# NOTE(review): assumes extra keyword arguments are forwarded to scikit-learn's
# LogisticRegression, as `penalty` and `random_state` already are - confirm.
lr = LogisticRegression(
    random_state=seed,
    feature_names=feature_names,
    penalty='l1',
    solver='liblinear',
)
lr.fit(X_train_enc, y_train)

tree = ClassificationTree()
tree.fit(X_train_enc, y_train)

<interpret.glassbox.decisiontree.ClassificationTree at 0x271ea6e37b8>

## Compare performance using the Dashboard

In [13]:
# ROC explanations for the two baseline models on the encoded hold-out features.
lr_perf = ROC(lr.predict_proba).explain_perf(
    X_test_enc,
    y_test,
    name='Logistic Regression',
)
tree_perf = ROC(tree.predict_proba).explain_perf(
    X_test_enc,
    y_test,
    name='Classification Tree',
)

# Render each curve in turn, ending with the EBM's for comparison.
show(lr_perf)
show(tree_perf)
show(ebm_perf)

## Glassbox: All of our models have global and local explanations

In [14]:
# Global explanations for the two baseline models.
tree_global = tree.explain_global(name='Tree')
lr_global = lr.explain_global(name='LR')

# Render all three global explanations, baselines first.
show(lr_global)
show(tree_global)
show(ebm_global)

## Dashboard: look at everything at once

In [15]:
# Passing a list to show() opens the InterpretML Dashboard with every explanation at once.

show(
    [
        hist,
        lr_global,
        lr_perf,
        tree_global,
        tree_perf,
        ebm_global,
        ebm_perf,
    ],
    share_tables=True,
)