In [4]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

from fado.preprocessing import MetricOptimizer

## Fairness Metrics for non-binary Attributes

A discrimination measure $\psi: X \times Y \times \hat{Y} \times Z \rightarrow [0, 1]$ takes features, labels, predictions, and the protected attribute as input and outputs a score between 0 and 1.

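Possible aggregations for a non-binary protected attribute are the sum of the absolute pairwise differences between all groups, or simply the maximum absolute difference between any two groups. Note that the unnormalised sum can exceed 1 as soon as there are more than two groups, so it has to be normalised (e.g. by the number of group pairs) to stay within $[0, 1]$.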

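For equality of opportunity, predictive parity, and equalized odds, it is mandatory to implement additive smoothing or linear interpolation smoothing: these metrics condition on the label or the prediction within each group, so the corresponding rate is undefined (division by zero) whenever a group contains no sample that satisfies the condition.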

In [174]:
def nb_statistical_parity_sum_abs_difference(y: np.array, z: np.array, **kwargs):
    """
    Sum of absolute pairwise differences in statistical parity for a
    non-binary protected attribute.

    For every distinct group ``g`` in ``z`` the positive rate
    ``sum(y[z == g]) / count(z == g)`` is computed. The absolute differences
    of these rates are then summed over all unordered pairs of groups.
    The sum is not normalised, so it can exceed 1 when ``z`` holds more than
    two groups.

    Parameters
    ----------
    y: flattened binary array
        can be the prediction or the truth label
    z: flattened array of shape y
        protected attribute. It holds integer values.
    **kwargs
        Accepted and ignored, so the function can be called with the same
        keyword arguments as other metrics (e.g. ``positive_label``,
        ``privileged_group``).

    Returns
    -------
    float
        Sum of ``|rate_i - rate_j|`` over all group pairs ``i < j``;
        ``0.0`` if ``z`` contains fewer than two distinct groups.
    """

    y = y.astype(int)
    z = z.astype(int)

    # np.unique returns every group exactly once, in sorted order.
    groups = np.unique(z)
    if groups.size < 2:
        # No pair of groups to compare.
        return 0.0

    # Compute each group's positive rate once instead of once per pair.
    rates = np.empty(groups.size)
    for idx, group in enumerate(groups):
        members = z == group
        # Every group returned by np.unique has at least one member,
        # so the denominator is never zero.
        rates[idx] = np.sum(y & members) / np.sum(members)

    # Upper triangle (k=1) holds each unordered pair exactly once.
    pairwise = np.abs(rates[:, None] - rates[None, :])
    return np.triu(pairwise, k=1).sum()


def nb_statistical_parity_max_difference(y: np.array, z: np.array, **kwargs):
    """
    Maximum absolute pairwise difference in statistical parity for a
    non-binary protected attribute.

    For every distinct group ``g`` in ``z`` the positive rate
    ``sum(y[z == g]) / count(z == g)`` is computed. The result is the largest
    absolute difference between the rates of any two groups.

    Parameters
    ----------
    y: flattened binary array
        can be the prediction or the truth label
    z: flattened array of shape y
        protected attribute. It holds integer values.
    **kwargs
        Accepted and ignored, so the function can be called with the same
        keyword arguments as other metrics (e.g. ``positive_label``,
        ``privileged_group``).

    Returns
    -------
    float
        ``max_{i, j} |rate_i - rate_j|``; ``0.0`` if ``z`` contains fewer
        than two distinct groups.
    """

    y = y.astype(int)
    z = z.astype(int)

    # np.unique returns every group exactly once, in sorted order.
    groups = np.unique(z)
    if groups.size < 2:
        # No pair of groups to compare.
        return 0.0

    # Compute each group's positive rate once instead of once per pair.
    rates = np.empty(groups.size)
    for idx, group in enumerate(groups):
        members = z == group
        # Every group returned by np.unique has at least one member,
        # so the denominator is never zero.
        rates[idx] = np.sum(y & members) / np.sum(members)

    # The largest pairwise gap is always between the highest and the lowest
    # rate, so no loop over all pairs is needed.
    return rates.max() - rates.min()

In [208]:
# Synthetic protected attribute: 500 samples spread over the 5 groups 0..4.
# A seeded generator keeps the data identical across kernel restarts.
z = np.random.default_rng(seed=0).integers(0, 5, size=(500,))

In [209]:
# Synthetic binary labels (0 or 1) for 500 samples.
# A seeded generator keeps the data identical across kernel restarts.
y = np.random.default_rng(seed=1).integers(0, 2, size=(500,))

In [177]:
# Sum of the absolute statistical-parity gaps over all pairs of groups in z.
nb_statistical_parity_sum_abs_difference(y=y, z=z)

0.6251618466373035

In [178]:
# Largest absolute statistical-parity gap between any two groups in z.
nb_statistical_parity_max_difference(y=y, z=z)

0.11767383634915213

In [188]:
# Joint counts of (z, y). A scalar `bins` uses 4 equal-width bins on BOTH axes,
# so the binary y axis gets two middle columns that always stay empty, and z is
# cut into 4 ranges rather than one row per group value.
# NOTE(review): z is generated with 5 group values in this notebook, so 4 bins
# cannot give one row per group - confirm which z this cell was meant for.
np.histogram2d(z, y, bins=4)[0]

array([[60.,  0.,  0., 58.],
       [69.,  0.,  0., 73.],
       [75.,  0.,  0., 51.],
       [55.,  0.,  0., 59.]])

In [192]:
# Joint counts of (z, y) with 3 equal-width bins on z and 2 on y.
# With more than 3 distinct values in z, several groups share a row, so this
# is not a per-group contingency table.
np.histogram2d(z, y, bins=(3, 2))[0]

array([[ 60.,  58.],
       [ 69.,  73.],
       [130., 110.]])

In [195]:
# Joint counts of (z, y) with 4 equal-width bins on z and 2 on y.
# NOTE(review): with the 5 group values generated for z in this notebook, two
# groups end up in the same row - only valid if z holds exactly 4 groups.
np.histogram2d(z, y, bins=(4, 2))[0]

array([[57., 48.],
       [41., 47.],
       [59., 57.],
       [96., 95.]])

In [200]:
# Joint counts of (z, y) with 6 equal-width bins on z and 2 on y.
# There are more bins than integer group values in z, so at least one row
# contains no samples (the all-zero row in the output).
np.histogram2d(z, y, bins=(6, 2))[0]

array([[57., 48.],
       [41., 47.],
       [ 0.,  0.],
       [59., 57.],
       [43., 47.],
       [53., 48.]])

In [211]:
# Joint counts of (z, y) with 5 bins on z and 2 on y.
# Assuming z takes exactly the integer values 0..4 and y the values 0/1 (as
# generated in this notebook), each row is one group and the columns are the
# counts of y == 0 and y == 1, i.e. a group-by-label contingency table.
np.histogram2d(z, y, bins=(5, 2))[0]

array([[57., 54.],
       [55., 50.],
       [52., 49.],
       [54., 39.],
       [39., 51.]])

In [212]:
# Number of distinct groups present in the protected attribute.
len(np.unique(z))

5