In [60]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from dalex import Explainer
from dalex.fairness import roc_pivot
import numpy as np

In [61]:
# Read the training CSV and discard its first column by position, then
# preview the first rows of what is left.
df = pd.read_csv('cs-training.csv').iloc[:, 1:]
df.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [62]:
# Number of leading complete rows used for fitting; every later row is held out.
N_TRAIN = 100000

# Drop every row that contains at least one missing value.
df = df.dropna()
# Fail early if the positional split below would leave nothing to evaluate on.
assert len(df) > N_TRAIN, "not enough complete rows to leave a hold-out set"

# Column 0 is the target (SeriousDlqin2yrs in the preview shown earlier);
# all remaining columns are used as features.
X = df.iloc[:N_TRAIN, 1:]
y = df.iloc[:N_TRAIN, 0]
X_test = df.iloc[N_TRAIN:, 1:]
y_test = df.iloc[N_TRAIN:, 0]

# Fixed seed: an unseeded forest yields different performance and fairness
# numbers on every Restart & Run All, so the recorded outputs could not be
# reproduced.
model = RandomForestClassifier(random_state=0).fit(X, y)

In [63]:
def predict_fn(model, X):
    """Return one score per row of ``X`` taken from ``model.predict_proba``.

    Parameters
    ----------
    model : object exposing ``predict_proba(X)``
    X : data accepted by ``model.predict_proba``

    Returns
    -------
    The column at index 1 of the ``predict_proba`` output, i.e. the
    probability assigned to the second class in the model's class ordering.
    """
    class_probabilities = model.predict_proba(X)
    return class_probabilities[:, 1]

# Wrap the fitted forest together with the held-out rows in a dalex Explainer;
# predict_fn supplies the per-row score used by the explainer.
forest_exp = Explainer(
    model,
    X_test,
    y_test,
    predict_function=predict_fn,
    verbose=False,
)


X does not have valid feature names, but RandomForestClassifier was fitted with feature names



In [64]:
# Performance summary of the random forest on the held-out rows.
forest_performance = forest_exp.model_performance()
forest_performance.result

Unnamed: 0,recall,precision,f1,accuracy,auc
RandomForestClassifier,0.179505,0.524096,0.267418,0.929449,0.828574


In [65]:
# Protected attribute for the fairness audit: every held-out row is labelled
# 'young' when age is below 60 and 'old' otherwise.
is_young = X_test['age'] < 60
protected = np.where(is_young, 'young', 'old')
# Group against which the other group's metrics are compared.
privileged = 'young'

In [66]:
# Fairness audit of the untreated random forest across the two age groups,
# with 'young' as the reference group; prints the check and draws the plot.
forest_fobject = forest_exp.model_fairness(
    protected=protected,
    privileged=privileged,
    label='forest_base',
)
forest_fobject.fairness_check()
forest_fobject.plot()

Bias detected in 3 metrics: TPR, FPR, STP

Conclusion: your model is not fair because 2 or more criteria exceeded acceptable limits set by epsilon.

Ratios of metrics, based on 'young'. Parameter 'epsilon' was set to 0.8 and therefore metrics should be within (0.8, 1.25)
          TPR       ACC       PPV   FPR       STP
old  0.783784  1.056893  1.046065  0.25  0.290323


In [67]:
# Logistic-regression baseline fitted on the same training rows as the forest.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X, y)

# Explainer over the same held-out rows, reusing predict_fn, followed by the
# baseline's performance summary.
exp = Explainer(
    lr_model,
    X_test,
    y_test,
    predict_function=predict_fn,
    verbose=False,
)
exp.model_performance().result


X does not have valid feature names, but LogisticRegression was fitted with feature names



Unnamed: 0,recall,precision,f1,accuracy,auc
LogisticRegression,0.044017,0.496124,0.080859,0.928216,0.687267


In [68]:
# Fairness audit of the logistic-regression baseline, using the same
# protected attribute and reference group as for the forest.
fobject = exp.model_fairness(
    protected=protected,
    privileged=privileged,
)
fobject.fairness_check()
fobject.plot()

Bias detected in 3 metrics: TPR, PPV, STP

Conclusion: your model is not fair because 2 or more criteria exceeded acceptable limits set by epsilon.

Ratios of metrics, based on 'young'. Parameter 'epsilon' was set to 0.8 and therefore metrics should be within (0.8, 1.25)
          TPR       ACC    PPV  FPR       STP
old  0.098039  1.056955  0.666  NaN  0.111111

Take into consideration that NaN's are present, consider checking 'metric_scores' plot to see the difference


In [69]:
# Run dalex's roc_pivot on the forest explainer, audit the returned explainer
# under the label 'forest_roc', and plot it next to the untreated forest.
roc_exp = roc_pivot(
    forest_exp,
    protected=protected,
    privileged=privileged,
)
roc_fobject = roc_exp.model_fairness(
    protected=protected,
    privileged=privileged,
    label='forest_roc',
)
roc_fobject.fairness_check()
roc_fobject.plot([forest_fobject])

protected array is not string type, converting to string 
Bias detected in 3 metrics: TPR, FPR, STP

Conclusion: your model is not fair because 2 or more criteria exceeded acceptable limits set by epsilon.

Ratios of metrics, based on 'young'. Parameter 'epsilon' was set to 0.8 and therefore metrics should be within (0.8, 1.25)
          TPR       ACC       PPV       FPR   STP
old  1.278146  1.055799  0.926829  0.538462  0.56


In [71]:
# Same comparison against the untreated forest, drawn with the
# 'metric_scores' plot type.
roc_fobject.plot(
    [forest_fobject],
    type='metric_scores',
)

In [70]:
# Performance summary of the explainer returned by roc_pivot, for comparison
# with the untreated forest's summary.
roc_performance = roc_exp.model_performance()
roc_performance.result

Unnamed: 0,recall,precision,f1,accuracy,auc
RandomForestClassifier,0.156809,0.525346,0.241525,0.92935,0.828502
