In [1]:
# Needed to specify settings for the used heuristic
from functools import partial

# Needed to seed the random number generator for reproducible runs
import numpy as np

# Dataset
from fairdo.utils.dataset import load_data
# Metric to optimize on
from fairdo.metrics import statistical_parity_abs_diff
# Load GA and the wrapper for pre-processing data
from fairdo.optimize.geneticalgorithm import genetic_algorithm
from fairdo.preprocessing import HeuristicWrapper

## 1. Load Dataset

`load_data` takes a dataset identifier as a string and returns the corresponding dataset as a triple: a dataframe, the name of the label column (a string), and a list of protected attribute names. The label and all protected attributes are columns of the returned dataframe.

In [2]:
# Identifier of the dataset to fetch; `load_data` hands back the dataframe,
# the label column name and the list of protected attribute columns.
DATASET_NAME = 'compas'
df, label, protected_attributes = load_data(DATASET_NAME)

Data downloaded.
['Other' 'African-American' 'Caucasian' 'Hispanic' 'Native American'
 'Asian']
African-American    3696
Caucasian           2454
Hispanic             637
Other                377
Asian                 32
Native American       18
Name: race, dtype: int64
(7214, 8)


In [3]:
# Settings for the genetic algorithm, named so they are easy to find and tune.
# It is also possible to use different genetic operators.
SEED = 0
POP_SIZE = 100
NUM_GENERATIONS = 500

# Genetic algorithms are randomized, so fix the seed to make
# "Restart Kernel -> Run All" reproduce the same fair dataset.
# NOTE(review): assumes fairdo draws its randomness from NumPy's global RNG
# (`np.random`) -- confirm against the fairdo operators in use.
np.random.seed(SEED)

# Pre-bind the GA hyperparameters; the wrapper supplies the remaining arguments.
heuristic = partial(genetic_algorithm,
                    pop_size=POP_SIZE,
                    num_generations=NUM_GENERATIONS)
# Discrimination measure the heuristic optimizes on
disc_measure = statistical_parity_abs_diff
# Wrapper that applies the heuristic as a pre-processing step on a dataframe
preproc_heuristics = HeuristicWrapper(heuristic=heuristic,
                                      disc_measure=disc_measure,
                                      protected_attribute=protected_attributes,
                                      label=label)

Once a `HeuristicWrapper` has been initialized, the same instance can be reused: `fit` and `transform` can be called on any dataset without reinitializing the wrapper.

In [4]:
# Fit the wrapper that was configured in the previous cell on the full dataset
preproc_heuristics.fit(df)
# Remove samples to yield a fair dataset
df_fair = preproc_heuristics.transform()

Stopping after 62 generations due to lack of improvement.


## 2. Compare Discrimination

In [5]:
def split_labels_and_groups(frame):
    """Return the label column of `frame` and its protected-attribute values as a flat array."""
    labels = frame[label]
    # The metric expects a 1d array. NOTE(review): flatten() only lines up with
    # the labels when there is a single protected attribute column -- with
    # several columns the values would be interleaved row by row.
    groups = frame[protected_attributes].to_numpy().flatten()
    return labels, groups


# Original dataset first, then the pre-processed one
y, z = split_labels_and_groups(df)
y_fair, z_fair = split_labels_and_groups(df_fair)

In [6]:
# Score both datasets with the same metric: the baseline first, then the pre-processed data
discrimination_orig = statistical_parity_abs_diff(y=y, z=z)
discrimination_fair = statistical_parity_abs_diff(y=y_fair, z=z_fair)

In [7]:
# Collect the report lines, then emit them in one go (one line each)
report_lines = [
    f'The original dataset has a statistical disparity (absolute) value of: {discrimination_orig}',
    f'The pre-processed fair dataset has a statistical disparity (absolute) value of: {discrimination_fair}',
    '(Lower is better.)',
]
print(*report_lines, sep='\n')

The original dataset has a statistical disparity (absolute) value of: 1.8856276310662383
The pre-processed fair dataset has a statistical disparity (absolute) value of: 0.6200204385646837
(Lower is better.)
