In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
data = fetch_openml(data_id=1590, as_frame=True)
X = pd.get_dummies(data.data)
y_true = (data.target == '>50K') * 1
sex = data.data[['sex', 'race']]
sex.value_counts()

sex     race              
Male    White                 28735
Female  White                 13027
Male    Black                  2377
Female  Black                  2308
Male    Asian-Pac-Islander     1002
Female  Asian-Pac-Islander      517
Male    Amer-Indian-Eskimo      285
        Other                   251
Female  Amer-Indian-Eskimo      185
        Other                   155
dtype: int64

In [2]:
from fairlearn.metrics import group_summary
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

classifier = DecisionTreeClassifier(min_samples_leaf=10, max_depth=4)
classifier.fit(X, y_true)

y_pred = classifier.predict(X)
#group_summary(accuracy_score, y_true, y_pred, sensitive_features=sex)

In [3]:
from fairlearn.metrics import selection_rate_group_summary
#selection_rate_group_summary(y_true, y_pred, sensitive_features=sex)

In [4]:
from fairlearn.widget import FairlearnDashboard
FairlearnDashboard(sensitive_features=sex,
                   sensitive_feature_names=['sex', 'race'],
                   y_true=y_true,
                   y_pred={"initial model": y_pred})

FairlearnWidget(value={'true_y': [0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1…

<fairlearn.widget._fairlearn_dashboard.FairlearnDashboard at 0x7f70e02ec820>

Can we find intersectional discrimination with Fairlearn?

In [5]:
import numpy as np

In [6]:
X = pd.DataFrame(np.random.randint(0, high=2, size=(100, 3), dtype='l'), columns=['sex', 'race', 'Y'])

In [7]:
X['cnt'] = 1

In [8]:
counts = X.groupby(['sex', 'race']).Y.count()

In [9]:

f = lambda x: [np.random.choice([0,1], 17, p=[0.65, 0.35])[0] for _ in range(x)]

In [10]:
X.at[(X['sex'] == 1) & (X['race'] == 1),'result'] = f(counts.loc[1,1])

In [11]:
X.groupby(['sex', 'race']).agg({'result':'sum', 'Y':['sum', 'count']})

Unnamed: 0_level_0,Unnamed: 1_level_0,result,Y,Y
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,sum,count
sex,race,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,0,0.0,10,25
0,1,0.0,19,31
1,0,0.0,8,24
1,1,8.0,9,20


In [12]:
# now let's create a biased scoring function

Idea: first sample from the biased distribution p_bias, then calculate the expectancy value of the unbiased distribution p_0 and caluculate how much you need to bias p_0 to get the exectancy of value of the unbiased distribution p_0 -> p_correction

In [13]:
X[(X[['sex', 'race']] == 1).all(1)].shape

(20, 5)

In [14]:
X.groupby(['sex', 'race']).agg({'result':'sum', 'Y':['sum', 'count']}).loc[[1]*len()]

TypeError: len() takes exactly one argument (0 given)

In [15]:
a = tuple([1 for _ in range(len(counts.index.levels))])

In [16]:
a

(1, 1)

In [17]:
counts.loc[a]

20

In [47]:
def biased_score(df, sensitive_cols, biased_prob):
    #todo make this agnostic of specific columns
    counts = df.groupby(sensitive_cols).agg({sensitive_cols[0]:'sum'})
    indexer = tuple([1 for _ in range(len(counts.index.levels))])
    df[(df[sensitive_cols] == 1).all(axis=1)]['result'] = np.random.choice([0,1], counts.loc[indexer].values, p=[biased_prob, 1-biased_prob])
    return df

In [35]:
type(counts)

pandas.core.series.Series

In [48]:
biased_score(X, ['sex', 'race'], 0.3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[(df[sensitive_cols] == 1).all(axis=1)]['result'] = np.random.choice([0,1], counts.loc[indexer].values, p=[biased_prob, 1-biased_prob])


Unnamed: 0,sex,race,Y,cnt,result
0,0,1,1,1,
1,1,1,0,1,1.0
2,0,1,1,1,
3,1,0,1,1,
4,0,0,0,1,
...,...,...,...,...,...
95,0,1,1,1,
96,0,1,0,1,
97,1,0,0,1,
98,1,1,1,1,1.0


In [20]:
def shift_prop(counts, expected_distribution):
    expected_values = counts.sum() * expected_distribution
    

In [25]:
counts.sum()

100

In [21]:
counts

sex  race
0    0       25
     1       31
1    0       24
     1       20
Name: Y, dtype: int64

In [22]:
counts.loc[1,:].sum()

44

In [59]:
i = 1000003054

In [60]:
i

1000003054

In [53]:
i = i + 1

In [55]:
a = i * 3

In [68]:
i = 2
i

2

In [69]:
i == 2

True

In [70]:
type(i)

int

In [76]:
type("adfaserer")

str

In [77]:
"1" == 1

False