In [1]:
# Needed to specify settings for the used heuristic
from functools import partial
# Dataset
from fairdo.utils.dataset import load_data
# Metric to optimize on
from fairdo.metrics import statistical_parity_abs_diff
# Load GA and the wrapper for pre-processing data
from fairdo.optimize.geneticalgorithm import genetic_algorithm
from fairdo.preprocessing import HeuristicWrapper

## 1. Load Dataset

`load_data` takes a dataset identifier as a string and returns a triple: a `pandas.DataFrame`, the name of the label column as a string, and the names of the protected attributes as a list of strings. The label and all protected attributes are columns of the returned `pandas.DataFrame`.

In [2]:
# Load the dataset identified by 'compas'. The result is unpacked into the DataFrame,
# the label column name, and the list of protected attribute column names.
df, label, protected_attributes = load_data('compas')

Data downloaded.
[5 0 2 3 4 1]
0    3696
2    2454
3     637
5     377
1      32
4      18
Name: race, dtype: int64
(7214, 8)


In [3]:
# Discrimination measure to optimize on:
# it sums all absolute differences between the group disparities
disc_measure = statistical_parity_abs_diff
# Bind the genetic algorithm's settings ahead of time with `partial`.
# Different genetic operators could be specified here as well.
heuristic = partial(
    genetic_algorithm,
    num_generations=500,
    pop_size=100,
)
# Set up the HeuristicWrapper with the heuristic, the measure,
# and the dataset's label / protected attribute column names
preprocessor = HeuristicWrapper(
    label=label,
    protected_attribute=protected_attributes,
    disc_measure=disc_measure,
    heuristic=heuristic,
)

After initializing `HeuristicWrapper`, it is possible to use `fit` and `transform` on any dataset without reinitializing.

In [4]:
# Fit the already-initialized pre-processor on the original dataset
preprocessor.fit(df)
# Remove samples to yield a fair dataset.
# transform() is called without arguments here.
# NOTE(review): presumably it operates on the data passed to fit() — confirm against HeuristicWrapper.
df_fair = preprocessor.transform()

Stopping after 215 generations due to lack of improvement.


## 2. Compare Discrimination

In [5]:
# The metric expects 1d arrays, so the protected-attribute columns are flattened.
# NOTE(review): flatten() yields one value per row only when `protected_attributes`
# names a single column; with several columns it would be longer than the labels.

# Original dataset: labels and protected-attribute values
y = df[label]
z = df[protected_attributes].to_numpy().flatten()

# Pre-processed (fair) dataset: same two quantities
y_fair = df_fair[label]
z_fair = df_fair[protected_attributes].to_numpy().flatten()

In [6]:
# Evaluate the same measure on both datasets so the values are directly comparable.
# Baseline: discrimination in the original dataset
discrimination_orig = statistical_parity_abs_diff(
    y=y,
    z=z,
)
# Result: discrimination remaining in the pre-processed dataset
discrimination_fair = statistical_parity_abs_diff(
    y=y_fair,
    z=z_fair,
)

In [8]:
# Report both values side by side, rounded to two decimals; one line per message
print(
    f'The original dataset has a statistical disparity (absolute) value of: {discrimination_orig:.2f}',
    f'The pre-processed fair dataset has a statistical disparity (absolute) value of: {discrimination_fair:.2f}',
    '(Lower is better.)',
    sep='\n',
)

The original dataset has a statistical disparity (absolute) value of: 1.89
The pre-processed fair dataset has a statistical disparity (absolute) value of: 0.16
(Lower is better.)
