In [232]:
import dalex as dx
import xgboost
import shap

import sklearn

import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")
import os

if not os.path.exists("plots"):
    os.mkdir("plots")

1. Load and preprocess data

In [233]:
df = pd.read_csv("cs-training.csv")
df=df.dropna()
X = df.drop(columns='SeriousDlqin2yrs')
y = df.SeriousDlqin2yrs
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.33, random_state=42)
X_train.head()

Unnamed: 0.1,Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
82524,82525,0.012623,47,0,0.143247,5500.0,7,0,1,0,2.0
112993,112994,0.422823,44,0,0.312979,15000.0,10,0,2,0,2.0
111650,111651,0.355182,36,0,0.226292,9500.0,4,0,1,0,0.0
137171,137172,0.0,54,0,0.265067,3500.0,6,0,0,0,1.0
142887,142888,0.712938,36,0,0.807205,5134.0,8,0,1,0,0.0


2. Model

In [234]:
model = xgboost.XGBClassifier(
    n_estimators=50, 
    max_depth=2, 
    use_label_encoder=False, 
    eval_metric="logloss",
    
    enable_categorical=True,
    tree_method="hist"
)

model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=True,
              eval_metric='logloss', gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=256, max_cat_to_onehot=4, max_delta_step=0, max_depth=2,
              max_leaves=0, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=50, n_jobs=0,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [235]:
explainer = dx.Explainer(model, X_test, y_test)#, predict_function=pf_xgboost_classifier_categorical)

Preparation of a new explainer is initiated

  -> data              : 39689 rows 11 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 39689 values
  -> model_class       : xgboost.sklearn.XGBClassifier (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_proba_default at 0x7f8ea66b9dd0> will be used (default)
  -> predict function  : Accepts pandas.DataFrame and numpy.ndarray.
  -> predicted values  : min = 0.00478, mean = 0.0694, max = 0.967
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.864, mean = 0.000651, max = 0.994
  -> model_info        : package xgboost

A new explainer has been created!


3. Evaluate

In [236]:
explainer.model_performance()

Unnamed: 0,recall,precision,f1,accuracy,auc
XGBClassifier,0.181165,0.574032,0.27541,0.93318,0.852725


In [237]:
protected_variable = X_test.age.apply(lambda x: "pensioner" if x>70 else "adult")
privileged_group = "pensioner"

fobject = explainer.model_fairness(
    protected=protected_variable,
    privileged=privileged_group
)

Bias detection

In [238]:
fobject.fairness_check()

Bias detected in 3 metrics: TPR, FPR, STP

Conclusion: your model is not fair because 2 or more criteria exceeded acceptable limits set by epsilon.

Ratios of metrics, based on 'pensioner'. Parameter 'epsilon' was set to 0.8 and therefore metrics should be within (0.8, 1.25)
            TPR       ACC       PPV  FPR  STP
adult  2.126437  0.953799  1.086957  5.5  6.0


Bias visualisation

In [239]:
fobject.plot(show=False).write_image(f"plots/p1.png")


#![](plots/p1.png)

In [240]:
from sklearn.tree import DecisionTreeClassifier
tree_class = DecisionTreeClassifier(random_state=0, max_depth=4, max_features=4)
tree_class.fit(X_train, y_train)
pred_test_tree = tree_class.predict(X_test)

In [241]:
explainer_dec_tr = dx.Explainer(tree_class, X_test, y_test)
explainer_dec_tr.model_performance()

Preparation of a new explainer is initiated

  -> data              : 39689 rows 11 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 39689 values
  -> model_class       : sklearn.tree._classes.DecisionTreeClassifier (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_proba_default at 0x7f8ea66b9dd0> will be used (default)
  -> predict function  : Accepts pandas.DataFrame and numpy.ndarray.
  -> predicted values  : min = 0.0195, mean = 0.0698, max = 0.833
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.833, mean = 0.000275, max = 0.981
  -> model_info        : package sklearn

A new explainer has been created!


Unnamed: 0,recall,precision,f1,accuracy,auc
DecisionTreeClassifier,0.115744,0.564912,0.192124,0.93177,0.808415


In [242]:
fobject_dec_tr = explainer_dec_tr.model_fairness(
    protected=protected_variable,
    privileged=privileged_group
)


Bias detection

In [243]:
fobject_dec_tr.fairness_check()

Bias detected in 4 metrics: TPR, PPV, FPR, STP

Conclusion: your model is not fair because 2 or more criteria exceeded acceptable limits set by epsilon.

Ratios of metrics, based on 'pensioner'. Parameter 'epsilon' was set to 0.8 and therefore metrics should be within (0.8, 1.25)
            TPR       ACC       PPV       FPR  STP
adult  2.458333  0.952724  1.833333  2.333333  4.0


Bias visualisation

In [244]:
fobject_dec_tr.plot(show=False).write_image(f"plots/p2.png")

Bias mitigation for the first model

In [245]:
X_train_without_prot, X_test_without_prot = X_train.drop("age", axis=1), X_test.drop("age", axis=1)

model_without_prot = xgboost.XGBClassifier(
    n_estimators=50, 
    max_depth=2, 
    use_label_encoder=False, 
    eval_metric="logloss",
    enable_categorical=True,
    tree_method="hist"
)

model_without_prot.fit(X_train_without_prot, y_train)

explainer_without_prot = dx.Explainer(
    model_without_prot, 
    X_test_without_prot, 
    y_test,
    label="XGBClassifier without the protected attribute",
    verbose=False
)

fobject_without_prot = explainer_without_prot.model_fairness(protected_variable, privileged_group)

In [256]:
from dalex.fairness import roc_pivot
from copy import copy

protected_variable_train = X_train.age.apply(lambda x: "pensioner" if x>70 else "adult")

# roc_pivot
explainer_roc_pivot = roc_pivot(
    copy(explainer_without_prot), 
    protected_variable, 
    privileged_group,
    verbose=False
)
explainer_roc_pivot.label = 'XGBClassifier with ROC pivot mitigation'
fobject_roc_pivot = explainer_roc_pivot.model_fairness(
    protected_variable, 
    privileged_group
)


In [257]:
fobject.plot(fobject_roc_pivot, show=False).\
    update_layout(autosize=False, width=800, height=450, legend=dict(yanchor="top", y=0.99, xanchor="right", x=0.99)).write_image(f"plots/p3.png")

In [259]:
pd.concat([
    explainer.model_performance().result,
    explainer_roc_pivot.model_performance().result,
    explainer_dec_tr.model_performance().result,
], axis=0)


Unnamed: 0,recall,precision,f1,accuracy,auc
XGBClassifier,0.181165,0.574032,0.27541,0.93318,0.852725
XGBClassifier with ROC pivot mitigation,0.216032,0.554428,0.310916,0.932878,0.850983
DecisionTreeClassifier,0.115744,0.564912,0.192124,0.93177,0.808415
