# Evaluating Fairness in Machine Learning: Comparative Analysis and Benchmarking of Fairlearn and AIF360 | German Credit Dataset

In [1]:
import os
import random

import pandas as pd
import numpy as np

import sklearn
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

from src.constants import RANDOM_STATE, DEFAULT_MODEL_CONFIG

In [2]:
# Ask scikit-learn transformers to return pandas DataFrames (keeping column
# names) instead of NumPy arrays.
# NOTE(review): no scikit-learn transformer is used in the cells visible here.
sklearn.set_config(transform_output="pandas")

## Ensure reproducibility

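Seed Python's `random` module and NumPy's global random generator, and set `PYTHONHASHSEED`, so that repeated runs of this notebook produce the same results.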

In [3]:
# Seed every random source that can be controlled from inside the kernel.
random.seed(RANDOM_STATE)      # Python's built-in `random` module
np.random.seed(RANDOM_STATE)   # NumPy's legacy global generator
# NOTE: CPython reads PYTHONHASHSEED once at interpreter start-up, so this
# assignment does not change hashing in the already-running kernel; it is only
# inherited by child processes started afterwards.
os.environ["PYTHONHASHSEED"] = str(RANDOM_STATE)

## Load data

In [4]:
# Column holding the binary label the classifier is trained to predict.
TARGET = "Creditability"
# Column used as the sensitive attribute in the fairness metrics of this notebook.
PROTECTED_ATTRIBUTE = "Sex & Marital Status"

In [5]:
# Load the raw German Credit table (path is relative to the kernel's working directory).
data = pd.read_csv(filepath_or_buffer="../data/german_credit_data.csv")

In [6]:
# Schema overview: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                             Non-Null Count  Dtype
---  ------                             --------------  -----
 0   Creditability                      1000 non-null   int64
 1   Account Balance                    1000 non-null   int64
 2   Duration of Credit (month)         1000 non-null   int64
 3   Payment Status of Previous Credit  1000 non-null   int64
 4   Purpose                            1000 non-null   int64
 5   Credit Amount                      1000 non-null   int64
 6   Value Savings/Stocks               1000 non-null   int64
 7   Length of current employment       1000 non-null   int64
 8   Instalment per cent                1000 non-null   int64
 9   Sex & Marital Status               1000 non-null   int64
 10  Guarantors                         1000 non-null   int64
 11  Duration in Current address        1000 non-null   int64
 12  Most valuable availab

In [7]:
# Class balance of the label column.
data.loc[:, TARGET].value_counts()

Creditability
1    700
0    300
Name: count, dtype: int64

In [8]:
# Group sizes of the sensitive attribute; small groups give noisy fairness metrics.
data.loc[:, PROTECTED_ATTRIBUTE].value_counts()

Sex & Marital Status
3    548
2    310
4     92
1     50
Name: count, dtype: int64

In [9]:
# Separate the feature matrix, the label and the sensitive attribute.
# Only TARGET is dropped, so X still contains PROTECTED_ATTRIBUTE as a model input.
X = data.drop(columns=TARGET)
y = data[TARGET]
z = data[PROTECTED_ATTRIBUTE]

In [10]:
# One stratified 80/20 split applied jointly to features, labels and the
# sensitive attribute, so the three outputs stay row-aligned.
X_train, X_test, y_train, y_test, z_train, z_test = train_test_split(
    X,
    y,
    z,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

In [11]:
# Extend the project-wide defaults with the list of categorical feature columns.
# NOTE(review): every column visible in the data.info() output is int64, so this
# object-dtype selection presumably yields an empty list and the integer-coded
# categories are treated as numeric features — confirm this is intended.
categorical_columns = X.select_dtypes("object").columns.to_list()
MODEL_CONFIG = dict(DEFAULT_MODEL_CONFIG, cat_features=categorical_columns)

In [12]:
# Instantiate the classifier from the merged configuration.
model = CatBoostClassifier(**MODEL_CONFIG)

In [13]:
# Fit on the training split only; the log below covers iterations 0-2999.
model.fit(X_train, y_train)

0:	learn: 0.6899835	total: 150ms	remaining: 7m 30s
250:	learn: 0.4704244	total: 542ms	remaining: 5.93s
500:	learn: 0.4235692	total: 929ms	remaining: 4.63s
750:	learn: 0.3888618	total: 1.36s	remaining: 4.07s
1000:	learn: 0.3559247	total: 1.75s	remaining: 3.5s
1250:	learn: 0.3269348	total: 2.13s	remaining: 2.99s
1500:	learn: 0.3030500	total: 2.71s	remaining: 2.7s
1750:	learn: 0.2826410	total: 3.19s	remaining: 2.27s
2000:	learn: 0.2646425	total: 3.7s	remaining: 1.85s
2250:	learn: 0.2486124	total: 4.1s	remaining: 1.36s
2500:	learn: 0.2337442	total: 4.56s	remaining: 910ms
2750:	learn: 0.2201144	total: 5.01s	remaining: 454ms
2999:	learn: 0.2080692	total: 5.46s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x20a3809e750>

In [44]:
# Class predictions for the held-out rows, indexed like y_test so they stay
# aligned with the true labels and with z_test.
test_predictions = model.predict(X_test)
y_pred = pd.Series(test_predictions, index=y_test.index, name="y_pred")

## Fairness assessment

### Fairlearn

#### Detection

In [15]:
from fairlearn.metrics import demographic_parity_ratio, demographic_parity_difference, equalized_odds_ratio, equalized_odds_difference

In [16]:
# Gap between the highest and the lowest per-group selection rate.
demographic_parity_difference(
    y_true=y_test,
    y_pred=y_pred,
    sensitive_features=z_test,
    method="between_groups",
)

0.30952380952380953

In [17]:
# Largest gap between any group's selection rate and the overall selection rate.
demographic_parity_difference(
    y_true=y_test,
    y_pred=y_pred,
    sensitive_features=z_test,
    method="to_overall",
)

0.235

In [18]:
# Lowest per-group selection rate divided by the highest one (1.0 means parity).
demographic_parity_ratio(
    y_true=y_test,
    y_pred=y_pred,
    sensitive_features=z_test,
)

0.6176470588235294

#### Mitigation

In [19]:
from fairlearn.reductions import DemographicParity

In [45]:
# Demographic-parity constraint with a 0.01 tolerance, loaded with the test split.
dp = DemographicParity(difference_bound=0.01)
# NumPy arrays are passed because pandas inputs did not work here (cause unknown).
# The chained-assignment warning printed below is raised inside fairlearn
# (`self.pos_basis[i]["+", e, g] = 1`), not by this cell's own code.
dp.load_data(
    X_test.to_numpy(),
    y_test.to_numpy(),
    sensitive_features=z_test.to_numpy(),
)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  self.pos_basis[i]["+", e, g] = 1
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series

In [47]:
# Signed constraint violation per group for the already-computed predictions.
# gamma() expects a predictor callable; this one ignores its input and returns
# y_pred (parameter renamed so it no longer shadows the global X_test).
dp.gamma(lambda _features: y_pred.to_numpy())

sign  event  group_id
+     all    1          -0.235000
             2          -0.082222
             3           0.074524
             4          -0.029118
-     all    1           0.235000
             2           0.082222
             3          -0.074524
             4           0.029118
dtype: float64

### AIF360

In [24]:
import aif360
from aif360.datasets import BinaryLabelDataset

In [25]:
def _as_aif360_dataset(labels):
    """Wrap the test features plus `labels` (stored under TARGET) as a BinaryLabelDataset."""
    return BinaryLabelDataset(
        df=X_test.assign(**{TARGET: labels}),
        label_names=[TARGET],
        protected_attribute_names=[PROTECTED_ATTRIBUTE],
    )


# Ground truth and model predictions share the same features and differ only
# in the label column.
dataset_aif360 = _as_aif360_dataset(y_test)
predictions_aif360 = _as_aif360_dataset(y_pred)

#### Detection

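_"Since the main computation of confusion matrices is common for a large set of metrics, we utilize memoization and caching of computations for performance on large-scale datasets."_ — from the AIF360 paper (Bellamy et al., 2018). This is why a single `ClassificationMetric` object is built once below and then queried for several metrics.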

In [26]:
from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric
from aif360.explainers import MetricTextExplainer

pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[inFairness]'


In [34]:
# AIF360 group definitions: each dict matches rows whose protected attribute
# equals the given code; a list of dicts is the union of those groups.
# NOTE(review): presumably codes 1, 3 and 4 are the male categories and 2 the
# female category of "Sex & Marital Status" — confirm against the data dictionary.
priv = [{PROTECTED_ATTRIBUTE: code} for code in (1, 3, 4)]
unpriv = [{PROTECTED_ATTRIBUTE: code} for code in (2,)]

In [38]:
# Compare true labels with predictions for the privileged/unprivileged split above.
cm = ClassificationMetric(
    dataset=dataset_aif360,
    classified_dataset=predictions_aif360,
    unprivileged_groups=unpriv,
    privileged_groups=priv,
)

In [39]:
# Ratio of favourable-prediction rates, unprivileged / privileged (1.0 means parity).
cm.disparate_impact()

0.8355555555555556

In [40]:
# Favourable-prediction rate of the unprivileged group minus that of the
# privileged group (0 means parity; negative values disadvantage the unprivileged group).
cm.statistical_parity_difference()

-0.1284722222222222

In [60]:
# Wrap the metric object so each metric can be reported as a plain-English sentence.
text_expl = MetricTextExplainer(cm)

In [61]:
# Same value as cm.statistical_parity_difference(), returned with its definition spelled out.
text_expl.statistical_parity_difference()

'Statistical parity difference (probability of favorable outcome for unprivileged instances - probability of favorable outcome for privileged instances): -0.1284722222222222'

In [53]:
from aif360.sklearn.metrics import statistical_parity_difference, disparate_impact_ratio

In [120]:
# Number of test rows in group 1: only 6 of the 200 held-out rows, so this
# group's rates are very noisy.
(z_test == 1).sum()

6

In [130]:
# scikit-learn-style AIF360 API. NOTE: priv_group=3 compares group 3 against all
# other groups combined, which is a different split from `priv` (codes 1, 3, 4)
# used with ClassificationMetric, so the value differs from
# cm.statistical_parity_difference().
statistical_parity_difference(
    y_test,
    y_pred,
    prot_attr=z_test,
    priv_group=3,
)

-0.15689223057644108

In [131]:
# Ratio counterpart of the previous cell, again with group 3 as the only
# privileged group (all other codes are pooled as unprivileged).
disparate_impact_ratio(
    y_test,
    y_pred,
    prot_attr=z_test,
    priv_group=3,
)

0.8061919504643963

___

In [135]:
# Per-group positive rate of the true labels and of the predictions.
# Labels are 0/1, so the group mean equals (number of positives) / (group size),
# which is what the previous sum / value_counts division computed by hand.
# The result is indexed by the sorted group codes, exactly as before.
pd.concat([y_test, y_pred], axis=1).groupby(z_test).mean()

Unnamed: 0_level_0,Creditability,y_pred
Sex & Marital Status,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.666667,0.5
2,0.625,0.652778
3,0.761905,0.809524
4,0.647059,0.705882


In [136]:
# Overall positive rate of the true labels versus the predictions.
y_test.to_frame().join(y_pred).mean()

Creditability    0.700
y_pred           0.735
dtype: float64

In [137]:
# Manual check of the aif360.sklearn statistical parity difference with
# priv_group=3: selection rate of every other group minus that of group 3.
is_group_3 = z_test.eq(3)
y_pred[~is_group_3].mean() - y_pred[is_group_3].mean()

-0.15689223057644108

#### Mitigation