In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('data/heloc_dataset_v1.csv')

# Zakodowanie zmiennej celu jako 1/0

In [3]:
data['RiskPerformance'] = np.where(data.RiskPerformance=='Bad',1,0)

In [4]:
data.head(2)

Unnamed: 0,RiskPerformance,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,...,PercentInstallTrades,MSinceMostRecentInqexcl7days,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance
0,1,55,144,4,84,20,3,0,83,2,...,43,0,0,0,33,-8,8,1,1,69
1,1,61,58,15,41,2,4,4,100,-7,...,67,0,0,0,0,-8,0,-8,-8,0


In [5]:
data.shape

(10459, 24)

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score

In [7]:
from sklearn.feature_selection import SelectKBest

# Podział na train/test

In [8]:
X_train, X_test, y_train, y_test = \
train_test_split(data.drop('RiskPerformance', axis =1), data.RiskPerformance, test_size=.33, random_state=42)

# Feature selection na podstawie ANOVA

In [9]:
seletor_f_classif = SelectKBest(k='all')
seletor_f_classif.fit_transform(X_train, y_train)

array([[ 70, 153,   7, ...,   1,   1,  89],
       [ 64, 103,   1, ...,   3,   2,  58],
       [ 83, 210,   3, ...,   2,   0,  40],
       ...,
       [ 87, 230,   1, ...,   2,   0,  42],
       [ 68, 132,   4, ...,   2,   0,  88],
       [ 70, 130,   5, ...,   1,   0,  75]])

In [10]:
cols_f_classif = pd.DataFrame({
    'column':X_train.columns, 
    'score': seletor_f_classif.scores_, 
    'p_value':seletor_f_classif.pvalues_
}).sort_values('score', ascending = False)

In [11]:
column_list = cols_f_classif.query("score>10").column.values

# Podział na train/test wybranych kolumn

In [12]:
X_train, X_test, y_train, y_test = \
train_test_split(data[column_list], data.RiskPerformance, test_size=.33, random_state=42)

# Model

In [13]:
clf_win = AdaBoostClassifier(
    base_estimator = DecisionTreeClassifier(
        max_depth=1,
        random_state=123
    ),
    n_estimators=150,
    learning_rate=0.27,
    algorithm='SAMME.R',
    random_state=123
)
clf_win.fit(X_train, y_train)
y_pred = clf_win.predict_proba(X_test)[:,1]
y_pred2 = clf_win.predict(X_test)
print(" AUC = ",round(roc_auc_score( y_test, y_pred)*100,2),
      " ACC = ", round(accuracy_score( y_test, y_pred2)*100,2), 
      " F1 = ", round(f1_score( y_test, y_pred2)*100,2), 
      " Precision = ", round(precision_score( y_test, y_pred2)*100,2), 
      " Recall = ", round(recall_score( y_test, y_pred2)*100,2))

 AUC =  78.97  ACC =  71.61  F1 =  73.91  Precision =  71.66  Recall =  76.31


# Wyjaśnienie

In [14]:
selected_obs = X_test.loc[:7528]
pred = clf_win.predict_proba(selected_obs)
pred

array([[0.49981314, 0.50018686]])

In [17]:
from dalex import explainer

In [58]:
exp = explainer.Explainer(model = clf_win, data = X_test, y =y_pred, model_type= 'classification')

Preparation of a new explainer is initiated

  -> label             : not specified, model's class taken instead!
  -> data              : 3452 rows 19 cols
  -> target variable   : 3452 values
  -> predict function  : <function yhat.<locals>.<lambda> at 0x7f2fc6606ef0> will be used
  -> predicted values  : min = 0.4236225844739515, mean = 0.5006859230698417, max = 0.5778999655989345
  -> residual function : difference between y and yhat
  -> residuals         : min = -0.5017165492954616, mean = -0.2317242977383509, max = 0.48890482472679453
  -> model_info        : package sklearn

A new explainer has been created!


In [37]:
break_down = exp.predict_parts(selected_obs, type='break_down')
break_down.plot(max_vars=4)

In [35]:
selected_obs2 = X_test.loc[2473:2473]

In [43]:
break_down2 = exp.predict_parts(selected_obs2, type='break_down')
break_down2.plot(max_vars=10)

In [46]:
selected_obs3 = X_test.loc[5440:5440]
break_down3 = exp.predict_parts(selected_obs3, type='break_down')
break_down3.plot(max_vars=15)

# Comment

Poniżej widać, że dla dwóch obserwacji widzimy zupełnie różną "ważność" zmiennych.

In [60]:
break_down.plot(max_vars=4)
break_down3.plot(max_vars=4)

Dla dwoch obserwacji mozemy zauwazyc, że zmienna `PercentTradesNeverDelq` ma różny wpływ w zależności od obserwacji

In [61]:
break_down2.plot(max_vars=10)
break_down3.plot(max_vars=14)

## Model Neural net

In [47]:
from sklearn.neural_network import MLPClassifier

In [48]:
clf = MLPClassifier(alpha=1, max_iter=1000)

In [49]:
clf.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=1, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=1000,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [52]:
y_pred_nn = clf.predict_proba(X_test)

### Explain

In [55]:
exp_nn = explainer.Explainer(model = clf, data = X_test, y =y_pred_nn[:,1], model_type= 'classification')

Preparation of a new explainer is initiated

  -> label             : not specified, model's class taken instead!
  -> data              : 3452 rows 19 cols
  -> target variable   : 3452 values
  -> predict function  : <function yhat.<locals>.<lambda> at 0x7f2fc62b3cb0> will be used
  -> predicted values  : min = 0.0026217937051852603, mean = 0.2689616253314909, max = 0.9982424853352497
  -> residual function : difference between y and yhat
  -> residuals         : min = 0.0, mean = 0.0, max = 0.0
  -> model_info        : package sklearn

A new explainer has been created!


Nowy model

In [56]:
break_down_nn = exp_nn.predict_parts(selected_obs, type='break_down')
break_down_nn.plot(max_vars=4)

Stary model

In [59]:
break_down = exp.predict_parts(selected_obs, type='break_down')
break_down.plot(max_vars=4)

Roznice moga byc spowodowane tym, ze modele te są zupelnie inaczej budowane i oba modele nie maja wysokiej mocy predykcyjnej, wiec biora pod uwage inne zmienne.