In [39]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

from eli5 import show_prediction, show_weights
from eli5.sklearn import PermutationImportance
from eli5.permutation_importance import get_score_importances

# Read the resampled training CSV into a DataFrame. header="infer" lets
# pandas take the column names from the first row of the file.
csv_path = "./data/cs-training-resampled.csv"
df = pd.read_csv(csv_path, header="infer")

# Preview the first ten rows as the cell's rich output.
df.head(n=10)

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,0,0.010493,79,0,0.005852,3075.0,7,0,0,0,0.0
1,0,0.88033,43,0,1.163256,2100.0,6,0,0,0,3.0
2,1,0.132325,76,0,0.150979,10166.0,8,0,2,1,0.0
3,1,1.10842,48,0,0.16522,2880.0,4,2,0,2,2.0
4,1,0.917729,55,2,1.130248,3500.0,13,0,1,1,1.0
5,0,1.0,61,0,0.181652,12600.0,4,0,3,0,1.0
6,0,0.348783,49,0,0.418033,8783.0,12,0,1,0,0.0
7,0,0.091503,27,0,0.119876,9000.0,9,0,0,0,0.0
8,1,0.989609,60,1,0.595626,2148.0,6,0,1,0,0.0
9,0,0.584367,48,0,0.386846,9000.0,14,0,2,0,2.0


In [19]:
# Pick the label by name instead of by column position: a positional split
# silently trains on the wrong target if the CSV column order ever changes,
# whereas a missing name fails loudly with a KeyError.
target_name = "SeriousDlqin2yrs"
X = df.drop(columns=[target_name])  # every remaining column, original order kept
y = df[target_name]

# Column labels are reused by the eli5 explanation cells further down.
feature_names = list(X.columns)

# One seed shared by every stochastic step so the notebook is reproducible.
seed = 1
# Hold out 25% of the rows for evaluation / explanation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=seed
)

X_train.head(10)

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
8962,0.155966,43,0,0.655653,4236.0,13,0,1,0,2.0
9870,1.0,42,0,0.0,3660.0,0,0,0,0,1.0
14280,1.0,29,96,0.0,2800.0,0,96,0,96,2.0
12883,1.0,40,98,0.010939,3290.0,0,98,0,98,3.0
13670,0.208463,64,0,0.042128,2800.0,2,0,0,0,0.0
11492,1.122503,34,2,1.800499,400.0,3,1,0,2,1.0
9313,0.405316,29,0,1.086655,576.0,7,0,0,0,0.0
3765,0.72666,47,5,0.208055,9583.0,8,1,1,0,2.0
9305,0.968335,48,7,0.439157,3319.0,11,2,0,0,0.0
717,0.667666,46,0,1.762159,1500.0,8,0,1,0,0.0


In [20]:
# Show the first ten training labels; the index lines up with X_train's rows.
y_train.iloc[:10]

8962     0
9870     1
14280    1
12883    1
13670    0
11492    1
9313     0
3765     1
9305     1
717      0
Name: SeriousDlqin2yrs, dtype: int64

In [31]:
# Seed the forest with the same `seed` used for the train/test split; without
# random_state the bootstrap samples and feature draws differ on every run,
# so the importances and contributions shown below would not be reproducible.
blackbox_model = RandomForestClassifier(n_estimators=10, random_state=seed)

# fit() returns the estimator itself, so `my_trained_model` and
# `blackbox_model` are two names for the same fitted forest.
my_trained_model = blackbox_model.fit(X_train, y_train)

In [35]:
# Permutation importance: how much accuracy drops when one feature column is
# shuffled. `seed` (== 1) fixes the shuffles so the table is reproducible.
# NOTE(review): the importance is measured on the training split here; if the
# goal is to rank features by their effect on generalization, consider fitting
# on (X_test, y_test) instead - confirm the intent.
perm = PermutationImportance(
    blackbox_model,
    scoring="accuracy",
    random_state=seed,
).fit(X_train, y_train)

# Call show_weights through its direct import: the bare name `eli5` is never
# bound in this notebook, so `eli5.show_weights` raised NameError on a fresh
# kernel.
show_weights(perm, feature_names=feature_names)

Weight,Feature
0.2274  ± 0.0073,RevolvingUtilizationOfUnsecuredLines
0.1382  ± 0.0069,NumberOfTime30-59DaysPastDueNotWorse
0.1333  ± 0.0067,DebtRatio
0.1228  ± 0.0053,age
0.1204  ± 0.0036,MonthlyIncome
0.1172  ± 0.0035,NumberOfTimes90DaysLate
0.1124  ± 0.0028,NumberOfOpenCreditLinesAndLoans
0.0714  ± 0.0022,NumberOfTime60-89DaysPastDueNotWorse
0.0574  ± 0.0015,NumberRealEstateLoansOrLines
0.0430  ± 0.0028,NumberOfDependents


In [46]:
# Explain the model's prediction for the first held-out row, listing each
# feature's contribution next to the feature's value.
first_test_row = X_test.iloc[0]
show_prediction(
    my_trained_model,
    first_test_row,
    show_feature_values=True,
    feature_names=feature_names,
)

Contribution?,Feature,Value
0.494,<BIAS>,1.0
0.175,NumberOfTime30-59DaysPastDueNotWorse,1.0
0.109,DebtRatio,0.773
0.06,NumberRealEstateLoansOrLines,1.0
0.017,NumberOfDependents,4.0
-0.022,age,49.0
-0.031,MonthlyIncome,1760.0
-0.043,NumberOfTime60-89DaysPastDueNotWorse,0.0
-0.063,RevolvingUtilizationOfUnsecuredLines,0.384
-0.078,NumberOfTimes90DaysLate,0.0
