In [7]:
import numpy as np
import pandas as pd
np.random.seed(0)

from aif360.datasets import StandardDataset
from aif360.metrics import BinaryLabelDatasetMetric
from aif360.metrics import ClassificationMetric
from aif360.algorithms.preprocessing import Reweighing
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

from IPython.display import Markdown, display
from common_utils import compute_metrics

In [8]:
# Read the COMPAS scores CSV from the working directory and preview the
# first five rows to sanity-check the columns that were loaded.
compas_csv_path = 'compas-scores-original.csv'
df = pd.read_csv(compas_csv_path)
df.head(n=5)

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,vr_offense_date,vr_charge_desc,v_type_of_assessment,v_decile_score,v_score_text,v_screening_date,type_of_assessment,decile_score,score_text,screening_date
0,1,miguel hernandez,miguel,hernandez,8/14/2013,0,4/18/1947,69,Greater than 45,0,...,,,Risk of Violence,1,Low,8/14/2013,Risk of Recidivism,1,Low,8/14/2013
1,2,michael ryan,michael,ryan,12/31/2014,0,2/6/1985,31,25 - 45,0,...,,,Risk of Violence,2,Low,12/31/2014,Risk of Recidivism,5,Medium,12/31/2014
2,3,kevon dixon,kevon,dixon,1/27/2013,0,1/22/1982,34,25 - 45,1,...,7/5/2013,Felony Battery (Dom Strang),Risk of Violence,1,Low,1/27/2013,Risk of Recidivism,3,Low,1/27/2013
3,4,ed philo,ed,philo,4/14/2013,0,5/14/1991,24,Less than 25,1,...,,,Risk of Violence,3,Low,4/14/2013,Risk of Recidivism,4,Low,4/14/2013
4,5,marcu brown,marcu,brown,1/13/2013,0,1/21/1993,23,Less than 25,1,...,,,Risk of Violence,6,Medium,1/13/2013,Risk of Recidivism,8,High,1/13/2013


In [9]:
# Group definitions reused by the fairness metrics and the reweighing step:
# rows with race == 0 form the privileged group, race == 1 the unprivileged one.
privileged_groups = [{'race': 0}]
unprivileged_groups = [{'race': 1}]

# Wrap the raw frame in an AIF360 StandardDataset.
#   * label: 'decile_score', where deciles 1-4 count as the favorable outcome
#   * protected attribute: 'race', with value 0 marked as privileged
#   * 'sex' is the only ordinary feature retained
dataset_orig = StandardDataset(
    df,
    label_name='decile_score',
    favorable_classes=[1, 2, 3, 4],
    protected_attribute_names=['race'],
    privileged_classes=[[0]],
    features_to_keep=['sex'],
)
print(dataset_orig)

               instance weights features                     labels
                                         protected attribute       
                                     sex                race       
instance names                                                     
0                           1.0      0.0                 0.0    1.0
1                           1.0      0.0                 0.0    0.0
2                           1.0      0.0                 1.0    1.0
3                           1.0      0.0                 1.0    1.0
4                           1.0      0.0                 1.0    0.0
...                         ...      ...                 ...    ...
11752                       1.0      0.0                 0.0    1.0
11753                       1.0      0.0                 0.0    0.0
11754                       1.0      0.0                 0.0    1.0
11755                       1.0      0.0                 0.0    1.0
11756                       1.0      0.0        

In [10]:
# Baseline fairness of the labels in the untouched dataset, before any
# mitigation is applied.
metric_orig_train = BinaryLabelDatasetMetric(dataset_orig,
                                             unprivileged_groups=unprivileged_groups,
                                             privileged_groups=privileged_groups)
display(Markdown("#### Original dataset"))
# Disparate impact: ratio of favorable-outcome rates (unprivileged / privileged);
# 1.0 means parity.
print("Disparate Impact of unprivileged and privileged groups = %f" % metric_orig_train.disparate_impact())
# Statistical parity difference (SPD): difference of the same two rates
# (unprivileged - privileged); 0.0 means parity.
print("SPD of unprivileged and privileged groups = %f" % metric_orig_train.statistical_parity_difference())

#### Original dataset

Disparate Imapct of unprivileged and privileged groups = 0.614066
SPD of unprivileged and privileged groups = -0.268024


In [11]:
# Reweighing is a pre-processing mitigation: it is fitted on the original
# dataset and returns a transformed dataset, without touching the metric
# objects computed elsewhere.
RW = Reweighing(
    privileged_groups=privileged_groups,
    unprivileged_groups=unprivileged_groups,
)
dataset_transf = RW.fit_transform(dataset_orig)

In [12]:
# Re-compute the same fairness metrics on the reweighed dataset so they can be
# compared against the baseline. The result is stored under its own name so the
# baseline metric object is not silently overwritten when this cell runs.
metric_transf_train = BinaryLabelDatasetMetric(dataset_transf,
                                               unprivileged_groups=unprivileged_groups,
                                               privileged_groups=privileged_groups)
# The heading names the dataset actually being measured (the transformed one).
display(Markdown("#### Transformed dataset"))
# Disparate impact: ratio of favorable-outcome rates (unprivileged / privileged);
# 1.0 means parity.
print("Disparate Impact of unprivileged and privileged groups = %f" % metric_transf_train.disparate_impact())
# Statistical parity difference (SPD): difference of the same two rates
# (unprivileged - privileged); 0.0 means parity.
print("SPD of unprivileged and privileged groups = %f" % metric_transf_train.statistical_parity_difference())

#### Original dataset

Disparate Imapct of unprivileged and privileged groups = 1.000000
SPD of unprivileged and privileged groups = -0.000000
