In [None]:
import dalex as dx
import numpy as np
import pandas as pd
import sklearn
import xgboost
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
# Load the raw stroke dataset; the relative path is resolved against the
# kernel's current working directory.
brain_stroke = pd.read_csv('brain_stroke.csv')

In [None]:
def preprocess_dataset(old_dataset, target='stroke'):
  """Encode the feature columns numerically and split off the target.

  Every column except ``target`` is inspected:

  * numeric columns (numeric dtype, or strings consisting only of digits)
    are kept unchanged;
  * non-numeric columns with at most one distinct value are dropped;
  * non-numeric columns with exactly two distinct values become a 0/1
    column, where 1 marks 'yes' (matched case-insensitively) when present,
    otherwise the value that appears first in the column;
  * every other non-numeric column is one-hot encoded as ``<col>_<value>``
    and appended at the end of the frame.

  Args:
    old_dataset: input DataFrame; it is copied, never modified.
    target: name of the label column (defaults to 'stroke').

  Returns:
    Tuple ``(X, y)``: the encoded feature DataFrame without ``target`` and
    the target column cast to float64.

  Raises:
    KeyError: if ``target`` is not a column of ``old_dataset``.
  """
  dataset = old_dataset.copy()

  # Select features by name rather than by position, so the target does not
  # have to be the last column.
  feature_columns = [col for col in dataset.columns if col != target]
  for col in feature_columns:
    # Trust the dtype first: `str.isnumeric` alone rejects floats and
    # negative numbers ("67.0", "-1"), which would send genuinely numeric
    # columns down the categorical branches below.
    is_numeric = (pd.api.types.is_numeric_dtype(dataset[col])
                  or dataset[col].astype(str).str.isnumeric().all())
    if is_numeric:
      continue

    values = dataset[col].unique()
    if len(values) <= 1:
      # A constant column carries no information for the model.
      dataset = dataset.drop(columns=[col])
    elif len(values) == 2:
      # Prefer 'yes' (any casing) as the positive class; otherwise fall back
      # to the first value encountered, which depends on row order.
      positive = next((v for v in values if str(v).lower() == 'yes'), values[0])
      dataset[col] = np.where(dataset[col] == positive, 1, 0)
    else:
      # Explicit integer dtype: the default dummy dtype differs between
      # pandas versions (uint8 vs. bool).
      dummies = pd.get_dummies(dataset[[col]], prefix=col, dtype=np.uint8)
      dataset = pd.concat([dataset.drop(columns=[col]), dummies], axis=1)
  return dataset.drop(columns=[target]), dataset[target].astype(np.float64)


# Split the raw frame into the encoded feature matrix X and the float-valued
# target y.
X, y = preprocess_dataset(brain_stroke)

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible. `train_test_split` is imported explicitly because a bare
# `import sklearn` does not guarantee that `sklearn.model_selection` is loaded.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Hyper-parameters for the classifier. They have to be handed to the
# estimator's constructor: a dict that is only defined has no effect on
# training.
params = {
    "max_depth": 5,
    "objective": "binary:logistic",
    "eval_metric": "auc"
}

model = xgboost.XGBClassifier(**params)
model.fit(X_train, y_train)

In [None]:
pred_test = model.predict(X_test)
# Spot-check the first few predictions against the ground truth. A plain loop
# (instead of a list comprehension used for its side effects) avoids echoing
# a list of `None` as the cell's output.
for truth, prediction in zip(y_test.iloc[:3], pred_test[:3]):
  print(f'Ground truth is {truth}, prediction is {prediction}')

Ground truth is 0.0, prediction is 0.0
Ground truth is 0.0, prediction is 0.0
Ground truth is 0.0, prediction is 0.0


[None, None, None]

In [None]:
def pf_xgboost_classifier_categorical(model, df):
    """Predict function for dalex: values of the second `predict_proba` column.

    Object-dtype columns are converted to pandas `category` dtype on a new
    frame before prediction, so the caller's DataFrame is left untouched.

    Args:
        model: fitted classifier exposing `predict_proba`.
        df: pandas DataFrame of features.

    Returns:
        1-D array holding `predict_proba(df)[:, 1]` for every row of `df`.
    """
    object_columns = df.select_dtypes(['object']).columns
    if len(object_columns) > 0:
        # `astype` returns a new frame; assigning through `df.loc[:, mask]`
        # would write into the caller's data and may keep the object dtype.
        df = df.astype({column: 'category' for column in object_columns})
    return model.predict_proba(df)[:, 1]

# Wrap the fitted classifier in a dalex Explainer built on the held-out test
# split; the custom predict function makes the explainer work on the second
# `predict_proba` column.
exp = dx.Explainer(model, X_test, y_test, predict_function=pf_xgboost_classifier_categorical)

Preparation of a new explainer is initiated

  -> data              : 1495 rows 16 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 1495 values
  -> model_class       : xgboost.sklearn.XGBClassifier (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function pf_xgboost_classifier_categorical at 0x7fe1c615d050> will be used
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.000494, mean = 0.0454, max = 0.437
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.437, mean = 0.00682, max = 0.999
  -> model_info        : package xgboost

A new explainer has been created!


In [None]:
# Performance summary of the explained model on the data given to the
# explainer (X_test / y_test).
exp.model_performance()


invalid value encountered in long_scalars



Unnamed: 0,recall,precision,f1,accuracy,auc
XGBClassifier,0.0,,,0.947826,0.832243


In [None]:
# Variable-importance table from dalex `model_parts`; the stored output lists
# one `dropout_loss` value per variable plus the `_full_model_` baseline row.
exp.model_parts().result

Unnamed: 0,variable,dropout_loss,label
0,bmi,0.167501,XGBClassifier
1,smoking_status_smokes,0.168277,XGBClassifier
2,gender,0.168662,XGBClassifier
3,heart_disease,0.168963,XGBClassifier
4,work_type_children,0.168963,XGBClassifier
5,_full_model_,0.168963,XGBClassifier
6,work_type_Govt_job,0.169047,XGBClassifier
7,Residence_type,0.169202,XGBClassifier
8,smoking_status_Unknown,0.169365,XGBClassifier
9,work_type_Self-employed,0.169372,XGBClassifier


In [None]:
# Ceteris-paribus profiles for the first three test rows, plotted for two
# selected variables.
for i in range(3):
  # Double brackets keep the single observation as a one-row DataFrame.
  cp = exp.predict_profile(new_observation=X_test.iloc[[i]])
  cp.plot(variables=["smoking_status_smokes", "age"])

In [None]:
def profile(count):
  """Print a header and plot the 'hypertension' profile of one test row.

  Args:
    count: positional index (``iloc``) of the observation in ``X_test``.

  Relies on the notebook-level ``exp`` explainer and ``X_test`` frame.
  """
  # Double brackets keep the single observation as a one-row DataFrame.
  cp = exp.predict_profile(new_observation=X_test.iloc[[count]])
  # Report the function's own argument, not whatever a global loop variable
  # happens to hold in the kernel.
  print(f'CP profiles for {count}-th trial')
  cp.plot(variables=["hypertension"])
# Print and plot the 'hypertension' profile for the first five test rows.
for i in range(5):
  profile(i)

In [None]:
# 'hypertension' profiles for the test rows at positions 0 and 3 only.
for i in [0,3]:
  cp = exp.predict_profile(new_observation=X_test.iloc[[i]])
  cp.plot(variables=["hypertension"])

In [None]:
# Dataset-level model profiles for the XGBoost explainer. The name `pdp`
# suggests partial-dependence profiles (presumably the dalex default type --
# confirm against the dalex docs).
pdp = exp.model_profile()

Calculating ceteris paribus: 100%|██████████| 16/16 [00:01<00:00, 11.14it/s]


In [None]:
# Display the computed profile data stored in `result`.
pdp.result

In [None]:
# Plot the model profile for 'hypertension' on its own, then 'age' and 'bmi'
# together in one call.
pdp.plot(variables=["hypertension"])
pdp.plot(variables=["age", "bmi"])

Decision Tree Regressor

In [None]:
# Shallow regression tree (depth <= 4, at most 4 candidate features per split)
# fitted on the same training split as the XGBoost model; random_state is
# fixed so the fit is repeatable.
tree_reg = DecisionTreeRegressor(random_state=0, max_depth=4, max_features=4)
tree_reg.fit(X_train, y_train)
# Test-set predictions of the tree (not read by any cell visible below).
pred_test_tree = tree_reg.predict(X_test)

In [None]:
# Explainer for the regression tree. No custom predict function is passed:
# `pf_xgboost_classifier_categorical` calls `predict_proba`, which a
# DecisionTreeRegressor does not provide, so dalex's default (the model's own
# `predict`) is used instead.
exp_2 = dx.Explainer(tree_reg, X_test, y_test)

In [None]:
# Profiles for the decision tree must come from its own explainer (`exp_2`),
# not from the XGBoost explainer `exp`. A separate name keeps the XGBoost
# profiles stored in `pdp` intact.
pdp_2 = exp_2.model_profile()
pdp_2.result

pdp_2.plot(variables=["hypertension"])
pdp_2.plot(variables=["age", "bmi"])