In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from interpret import show
from interpret.glassbox import ExplainableBoostingClassifier
import warnings

In [25]:
warnings.filterwarnings('ignore')

This work uses publicly available data from LendingClub.com. The dataset contains 9,578 3-year loans that were funded through the LendingClub.com platform between May 2007 and February 2010. It has 14 columns: 13 features plus the target variable, "not.fully.paid". The target indicates that the loan was not paid back in full (i.e., the borrower either defaulted or the loan was "charged off," meaning the borrower was deemed unlikely to ever pay it back).

In [26]:
# Location of the raw LendingClub export, relative to the notebook.
loans_csv = 'loans (1).csv'
data = pd.read_csv(loans_csv)

In [27]:
# Preview the loaded loans table (rendered as the cell output).
data

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.10,11.350407,19.48,737,5639.958333,28854,52.1,0.0,0.0,0.0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.000000,33623,76.7,0.0,0.0,0.0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.000000,3511,25.6,1.0,0.0,0.0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.10,712,2699.958333,33667,73.2,1.0,0.0,0.0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.000000,4740,39.5,0.0,1.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9573,0,all_other,0.1461,344.76,12.180755,10.39,672,10474.000000,215372,82.1,2.0,0.0,0.0,1
9574,0,all_other,0.1253,257.70,11.141862,0.21,722,4380.000000,184,1.1,5.0,0.0,0.0,1
9575,0,debt_consolidation,0.1071,97.81,10.596635,13.09,687,3450.041667,10036,82.9,8.0,0.0,0.0,1
9576,0,home_improvement,0.1600,351.58,10.819778,19.18,692,1800.000000,0,3.2,5.0,0.0,0.0,1


In [28]:
# Summarise column dtypes and non-null counts to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 14 columns):
credit.policy        9578 non-null int64
purpose              9578 non-null object
int.rate             9578 non-null float64
installment          9578 non-null float64
log.annual.inc       9574 non-null float64
dti                  9578 non-null float64
fico                 9578 non-null int64
days.with.cr.line    9549 non-null float64
revol.bal            9578 non-null int64
revol.util           9516 non-null float64
inq.last.6mths       9549 non-null float64
delinq.2yrs          9549 non-null float64
pub.rec              9549 non-null float64
not.fully.paid       9578 non-null int64
dtypes: float64(9), int64(4), object(1)
memory usage: 1.0+ MB


Before applying any model to the data, we inspected it and found that some values are missing.

In [29]:
# Count the missing entries in each column and display the tally.
missing_per_column = data.isna().sum()
missing_per_column

credit.policy         0
purpose               0
int.rate              0
installment           0
log.annual.inc        4
dti                   0
fico                  0
days.with.cr.line    29
revol.bal             0
revol.util           62
inq.last.6mths       29
delinq.2yrs          29
pub.rec              29
not.fully.paid        0
dtype: int64

Because the number of rows with missing values is small compared to the total number of rows (9,578 rows before removal, 9,516 after), these rows were simply removed.

In [30]:
# Drop every row that has at least one missing value. The explicit .copy()
# gives `data` its own independent frame, so columns can later be added or
# overwritten on it without pandas treating it as a slice of the original
# frame (the situation SettingWithCopyWarning is meant to flag).
data = data.dropna().copy()

In [31]:
# Re-count missing entries per column to confirm none are left.
remaining_missing = data.isna().sum()
remaining_missing

credit.policy        0
purpose              0
int.rate             0
installment          0
log.annual.inc       0
dti                  0
fico                 0
days.with.cr.line    0
revol.bal            0
revol.util           0
inq.last.6mths       0
delinq.2yrs          0
pub.rec              0
not.fully.paid       0
dtype: int64

Since we have one categorical feature (purpose), we transformed it into a numerical one by assigning each of its seven categories an integer code from 0 to 6.

In [32]:
# List the distinct loan purposes present in the data.
purpose_levels = data['purpose'].unique()
purpose_levels

array(['debt_consolidation', 'credit_card', 'all_other',
       'home_improvement', 'small_business', 'major_purchase',
       'educational'], dtype=object)

In [33]:
# Re-type the loan purpose column as a pandas categorical. assign() returns
# a new frame with the replaced column, which is bound back to `data`.
data = data.assign(purpose=data['purpose'].astype('category'))

In [34]:
# Replace each purpose category with its integer code (cat.codes). assign()
# returns a new frame with the replaced column, which is bound back to `data`.
data = data.assign(purpose=data['purpose'].cat.codes)

In [14]:
# Inspect the encoded column: it should now hold small integer codes.
data['purpose']

0       2
1       1
2       2
3       2
4       1
       ..
9573    0
9574    0
9575    2
9576    4
9577    2
Name: purpose, Length: 9516, dtype: int8

Now that the data is cleaned, we can split the dataset and prepare it for use with the models.

In [35]:
# Separate the label column from the predictor columns.
target_col = 'not.fully.paid'
X = data.drop(columns=target_col)
y = data[target_col]

In [36]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# shuffle reproducible between runs.
split_parts = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=1,
)
X_train, X_test, y_train, y_test = split_parts

In [21]:
# Train an Explainable Boosting Machine with its default settings on the
# training split, then echo the fitted estimator as the cell output.
ebm = ExplainableBoostingClassifier().fit(X_train, y_train)
ebm

ExplainableBoostingClassifier(binning_strategy='quantile', data_n_episodes=2000,
                              early_stopping_run_length=50,
                              early_stopping_tolerance=1e-05,
                              feature_names=['credit.policy', 'purpose',
                                             'int.rate', 'installment',
                                             'log.annual.inc', 'dti', 'fico',
                                             'days.with.cr.line', 'revol.bal',
                                             'revol.util', 'inq.last.6mths',
                                             'delinq.2yrs', 'pub.rec'],
                              feature_step_n_inner_bags=0,...
                                             'continuous', 'continuous',
                                             'continuous', 'continuous',
                                             'continuous', 'continuous',
                                             'continuous', 'conti

In [22]:
# Build the model-wide (global) explanation object and render it with
# interpret's show().
ebm_global = ebm.explain_global()
show(ebm_global)

In [23]:
# Build per-sample (local) explanations for the held-out test rows, passing
# the true labels alongside, and render them with interpret's show().
ebm_local = ebm.explain_local(X_test, y_test)
show(ebm_local)