In [39]:
import pandas as pd, numpy as np

import dalex as dx # version 0.2.0

from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Dropout

import warnings
warnings.filterwarnings('ignore')
from keras.wrappers.scikit_learn import KerasClassifier

In [40]:
# Load the dataset from 'diabetes.csv' (path is relative to the working directory).
df = pd.read_csv('diabetes.csv')

In [41]:
# A recorded 0 in these columns is treated as a missing measurement.
# The previous version built every mask twice (Outcome == 0 and Outcome == 1);
# together those cover every row of a binary label, so the split is not needed.
# NOTE(review): assumes Outcome only holds 0/1 - confirm against the CSV.
zero_means_missing = ["Glucose", "BloodPressure", "BMI"]

df = (
    df.assign(**{col: df[col].replace(0, np.nan) for col in zero_means_missing})
    # Remove every row that has a missing value in any remaining column.
    .dropna()
    # SkinThickness and Insulin are not among the features used later on.
    # errors="ignore" keeps the cell re-runnable once the columns are gone.
    .drop(columns=["SkinThickness", "Insulin"], errors="ignore")
)

In [42]:
# Separate the target column from the feature matrix.
y = df["Outcome"]
X = df.drop(columns=["Outcome"])

In [43]:
# Hold out 20% of the rows for evaluation, stratified on y so both parts keep
# the class ratio. random_state pins the shuffle so the split (and every
# result derived from it) is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0
)

In [44]:
# Numeric columns: fill missing values with the column median, then standardise.
numerical_features = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: the list is empty here, so this branch selects no columns.
# Missing values would be filled with the literal 'missing' and one-hot encoded.
categorical_features = []
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its own transformer.
column_routes = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [45]:
# scikit-learn multilayer perceptron with hidden layers of 150, 100 and 50 units.
mlpclassifier = MLPClassifier(
    hidden_layer_sizes=(150, 100, 50),
    max_iter=500,
    random_state=0,
)

In [46]:
# Build a simple, ad hoc model.
def create_model(input_dim=6):
    """Build and compile a small fully connected binary classifier.

    Args:
        input_dim: Number of input features fed to the first layer. The
            default of 6 is the value that used to be hard-coded.

    Returns:
        A compiled ``Sequential`` model: Dense layers of 32, 16, 8 and 4 ReLU
        units (with 10% dropout after the first two) followed by a single
        sigmoid unit, trained with binary cross-entropy and the Adam optimizer,
        reporting accuracy.
    """
    model = Sequential()
    # Only the first layer declares the input width; every later layer takes
    # its input size from the output of the layer before it.
    model.add(Dense(32, activation='relu', input_dim=input_dim, kernel_initializer='glorot_normal'))
    model.add(Dropout(0.1))
    model.add(Dense(16, activation='relu', kernel_initializer='glorot_normal'))
    model.add(Dropout(0.1))
    model.add(Dense(8, activation='relu', kernel_initializer='glorot_normal'))
    model.add(Dense(4, activation='relu', kernel_initializer='glorot_normal'))
    model.add(Dense(1, activation='sigmoid', kernel_initializer='glorot_normal'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Use KerasClassifier so that predict_proba returns the prediction as two columns.
# The legacy `nb_epoch=20` argument was removed: it conflicted with `epochs=200`.
# NOTE(review): the wrapper is believed to accept `nb_epoch` without forwarding
# it to training, so `epochs=200` was already the effective value - confirm.
kerasclassifier = KerasClassifier(build_fn=create_model, epochs=200, batch_size=32,
                                  verbose=False)

In [47]:
# LightGBM binary classifier, configured with the scikit-learn parameter names.
# - n_estimators=40 replaces the `num_iteration=40` alias, which was overriding
#   the wrapper's own n_estimators.
# - The misspelled `n_estimetors=701` is not a LightGBM parameter and is dropped.
#   NOTE(review): if 701 boosting rounds were intended, set n_estimators=701.
# - min_child_samples=6 replaces the `min_data_in_leaf=6` alias.
lgbmclassifier = LGBMClassifier(
    objective='binary',
    num_leaves=8,
    n_estimators=40,
    min_child_samples=6,
    min_child_weight=0.01,
    learning_rate=0.1,
)

In [48]:
def _with_preprocessing(classifier):
    """Return a two-step Pipeline: the shared preprocessor, then ``classifier``."""
    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', classifier),
    ])


# All three pipelines reference the same `preprocessor` object.
clf_mlp = _with_preprocessing(mlpclassifier)
clf_keras = _with_preprocessing(kerasclassifier)
clf_lgbm = _with_preprocessing(lgbmclassifier)

In [49]:
# Train the pipelines on the training split, in the order MLP, Keras, LightGBM.
for pipeline in (clf_mlp, clf_keras):
    pipeline.fit(X_train, y_train)
# Kept as the final expression so the cell still displays the fitted LightGBM pipeline.
clf_lgbm.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Pregnancies', 'Glucose',
                                                   'BloodPressure', 'BMI',
                                                   'DiabetesPedigreeFunction',
                                                   'Age']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                    

In [50]:
# One dalex explainer per fitted pipeline, all evaluated on the held-out test split.
labelled_pipelines = (
    (clf_mlp, "PimaIndians MLP Pipeline"),
    (clf_keras, "PimaIndians Keras Pipeline"),
    (clf_lgbm, "PimaIndians LightGBM Pipeline"),
)
exp_mlp, exp_keras, exp_lgbm = (
    dx.Explainer(pipeline, X_test, y_test, label=label)
    for pipeline, label in labelled_pipelines
)

Preparation of a new explainer is initiated

  -> data              : 145 rows 6 cols
  -> target variable   : Argument 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 145 values
  -> model_class       : sklearn.pipeline.Pipeline (default)
  -> label             : PimaIndians MLP Pipeline
  -> predict function  : <function yhat_proba_default at 0x7f628bbaa3b0> will be used (default)
  -> predicted values  : min = 3.05e-13, mean = 0.369, max = 1.0
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -1.0, mean = -0.0244, max = 1.0
  -> model_info        : package sklearn

A new explainer has been created!
Preparation of a new explainer is initiated

  -> data              : 145 rows 6 cols
  -> target variable   : Argument 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 145 values
  -> model_class       : sklearn.pipeline.Pipeline (default)
  -> label             : PimaIndians Kera

In [51]:
# Display the training features (the rendered table elides the middle rows).
X_train

Unnamed: 0,Pregnancies,Glucose,BloodPressure,BMI,DiabetesPedigreeFunction,Age
129,0,105.0,84.0,27.9,0.741,62
160,4,151.0,90.0,29.7,0.294,36
362,5,103.0,108.0,39.2,0.305,65
636,5,104.0,74.0,28.8,0.153,48
76,7,62.0,78.0,32.6,0.391,41
...,...,...,...,...,...,...
660,10,162.0,84.0,27.7,0.182,54
194,8,85.0,55.0,24.4,0.136,42
135,2,125.0,60.0,33.8,0.088,31
624,2,108.0,64.0,30.8,0.158,21


In [52]:
# A single hand-written observation, using the same six feature columns as X.
rks_values = {
    'Pregnancies': 6,
    'Glucose': 148,
    'BloodPressure': 72,
    'BMI': 33.6,
    'DiabetesPedigreeFunction': 0.627,
    'Age': 50,
}
rks = pd.DataFrame(rks_values, index=['rks'])
rks

Unnamed: 0,Pregnancies,Glucose,BloodPressure,BMI,DiabetesPedigreeFunction,Age
rks,6,148,72,33.6,0.627,50


In [53]:
# Build the dalex explainer for the MLP pipeline on the held-out test data
# (same call as the `exp_mlp` assignment made earlier in this notebook).
exp_mlp = dx.Explainer(clf_mlp, X_test, y_test, label = "PimaIndians MLP Pipeline")

Preparation of a new explainer is initiated

  -> data              : 145 rows 6 cols
  -> target variable   : Argument 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 145 values
  -> model_class       : sklearn.pipeline.Pipeline (default)
  -> label             : PimaIndians MLP Pipeline
  -> predict function  : <function yhat_proba_default at 0x7f628bbaa3b0> will be used (default)
  -> predicted values  : min = 3.05e-13, mean = 0.369, max = 1.0
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -1.0, mean = -0.0244, max = 1.0
  -> model_info        : package sklearn

A new explainer has been created!


In [55]:
# Compute the ceteris-paribus profile of the MLP pipeline for the single
# observation `rks`, then plot only the 'Pregnancies' variable.
exp_rks = exp_mlp.predict_profile(rks)
exp_rks.plot(variables = ['Pregnancies'])

Calculating ceteris paribus: 100%|██████████| 6/6 [00:00<00:00, 53.29it/s]
