In [16]:
import sys
import numpy as np
import pandas as pd
from tqdm import tqdm
import aif360
from aif360.datasets import BinaryLabelDataset
from aif360.datasets import AdultDataset, GermanDataset, CompasDataset,BankDataset

from aif360.metrics import BinaryLabelDatasetMetric
from aif360.metrics import ClassificationMetric
from aif360.algorithms.preprocessing.reweighing import Reweighing
from aif360.algorithms.preprocessing import DisparateImpactRemover,LFR,OptimPreproc
from aif360.algorithms.preprocessing.optim_preproc_helpers.data_preproc_functions\
        import load_preproc_data_adult, load_preproc_data_german, load_preproc_data_compas
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from IPython.display import Markdown, display
import matplotlib.pyplot as plt


from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# from aif360.examples.common_utils import compute_metrics


all_metrics =  ["Statistical parity difference",
                   "Average odds difference",
                   "Equal opportunity difference"]

In [None]:
def clean_csv(df, threshold=600):
    drop_cols = [num for num in df.columns if np.sum(df[num] == "?") > threshold]
    df_drop = df.drop(axis=1, columns=drop_cols)
    data = df_drop.replace("?", np.nan)
    print(f'Missing values: \n{data.isna().sum()}')
    return data

In [None]:
df = clean_csv(df)

In [None]:
def fill_nan(df):
    for col in df.columns:
        if df[col].dtypes == object:
            df[col] = df[col].fillna(df[col].mode()[0])
        else:
            df[col] = df[col].fillna(df[col].mean())

    return df

In [None]:
df = fill_nan(df) 
df[30] = df[30].astype(float)

In [None]:
cat_cols = [col for col in df.columns if df[col].dtypes == object]

In [None]:
dataset = StandardDataset(df, 
                          label_name=label, 
                          favorable_classes=[favorable_classes], 
                          protected_attribute_names=[protected_attribute_names], 
                          privileged_classes=[[privileged_classes]])

In [None]:
aif360.datasets.StandardDataset(df, label_name, favorable_classes, protected_attribute_names, privileged_classes, instance_weights_name='', scores_name='', categorical_features=[], features_to_keep=[], features_to_drop=[], na_values=[], custom_preprocessing=None, metadata=None)




## Funções

In [2]:
from collections import OrderedDict
from aif360.metrics import ClassificationMetric

def compute_metrics(dataset_true, dataset_pred, 
                    unprivileged_groups, privileged_groups,
                    disp = True):
    """ Compute the key metrics """
    classified_metric_pred = ClassificationMetric(dataset_true,
                                                 dataset_pred, 
                                                 unprivileged_groups=unprivileged_groups,
                                                 privileged_groups=privileged_groups)
    metrics = OrderedDict()
    metrics["Balanced accuracy"] = 0.5*(classified_metric_pred.true_positive_rate()+
                                             classified_metric_pred.true_negative_rate())
    metrics["Statistical parity difference"] = classified_metric_pred.statistical_parity_difference()
    metrics["Disparate impact"] = classified_metric_pred.disparate_impact()
    metrics["Average odds difference"] = classified_metric_pred.average_odds_difference()
    metrics["Equal opportunity difference"] = classified_metric_pred.equal_opportunity_difference()
    metrics["Theil index"] = classified_metric_pred.theil_index()
    
    if disp:
        for k in metrics:
            print("%s = %.4f" % (k, metrics[k]))
    
    return metrics


def descript_dataset(dataset_orig_train):
    # print out some labels, names, etc.
    display(Markdown("#### Training Dataset shape"))
    print(dataset_orig_train.features.shape)
    display(Markdown("#### Favorable and unfavorable labels"))
    print(dataset_orig_train.favorable_label, dataset_orig_train.unfavorable_label)
    display(Markdown("#### Protected attribute names"))
    print(dataset_orig_train.protected_attribute_names)
    display(Markdown("#### Privileged and unprivileged protected attribute values"))
    print(dataset_orig_train.privileged_protected_attributes, 
          dataset_orig_train.unprivileged_protected_attributes)
    display(Markdown("#### Dataset feature names"))
    print(dataset_orig_train.feature_names)
    
def classify(dataset_orig, model_function, per_train = 0.7):
    # Get the dataset and split into train and test
    dataset_orig_train, dataset_orig_vt = dataset_orig.split([per_train], shuffle=True)
    
    # Logistic regression classifier and predictions
    scale_orig = StandardScaler()
    X_train = scale_orig.fit_transform(dataset_orig_train.features)
    y_train = dataset_orig_train.labels.ravel()
#     w_train = dataset_orig_train.instance_weights.ravel()
    
    model = model_function
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)

    # positive class index
    pos_ind = np.where(model.classes_ == dataset_orig_train.favorable_label)[0][0]

    dataset_orig_train_pred = dataset_orig_train.copy()
    dataset_orig_train_pred.labels = y_train_pred
    
    return model,dataset_orig_train_pred,dataset_orig_vt,scale_orig,pos_ind

def valid_test_classifier(dataset_orig_vt,lmod,scale_orig,pos_ind,per_valid = 0.5):
    
    dataset_orig_valid, dataset_orig_test = dataset_orig_vt.split([per_valid], shuffle=True)

    dataset_orig_valid_pred = dataset_orig_valid.copy(deepcopy=True)

    X_valid = scale_orig.transform(dataset_orig_valid_pred.features)
    y_valid = dataset_orig_valid_pred.labels
    dataset_orig_valid_pred.scores = lmod.predict_proba(X_valid)[:,pos_ind].reshape(-1,1)

    dataset_orig_test_pred = dataset_orig_test.copy(deepcopy=True)
    X_test = scale_orig.transform(dataset_orig_test_pred.features)
    y_test = dataset_orig_test_pred.labels
    dataset_orig_test_pred.scores = lmod.predict_proba(X_test)[:,pos_ind].reshape(-1,1)
    
    return dataset_orig_valid_pred,dataset_orig_test_pred,dataset_orig_test
    
def get_metrics(dataset_orig_test,dataset_orig_test_pred):
    fav_inds = dataset_orig_test_pred.scores > 0.5
    dataset_orig_test_pred.labels[fav_inds] = dataset_orig_test_pred.favorable_label
    dataset_orig_test_pred.labels[~fav_inds] = dataset_orig_test_pred.unfavorable_label


    metric_test_bef = compute_metrics(dataset_orig_test, dataset_orig_test_pred, 
                                      unprivileged_groups, privileged_groups,
                                      disp = True)

    



In [4]:
def load_german_dataset():
    
    dataset_orig = GermanDataset(
        protected_attribute_names=['age'],         
        privileged_classes=[lambda x: x >= 25],      # age >=25 is considered privileged
        features_to_drop=['personal_status', 'sex'],
        categorical_features=['status', 'credit_history', 'purpose',
                         'savings', 'employment', 'other_debtors', 'property',
                         'installment_plans', 'housing', 'skill_level', 'telephone',
                         'foreign_worker']# ignore sex-related attributes
        )


    
    return dataset_orig

def load_bank_dataset():
    
    dataset_orig = BankDataset(
        label_name='y', favorable_classes=['yes'],
                     protected_attribute_names=['age'],
                     privileged_classes=[lambda x: x >= 25],
                     instance_weights_name=None,
                     categorical_features=['job', 'marital', 'education', 'default',
                         'housing', 'loan', 'contact', 'month', 'day_of_week',
                         'poutcome'],
                     features_to_keep=[], features_to_drop=[],
                     na_values=["unknown"], custom_preprocessing=None,
                     metadata=None)

    
    return dataset_orig

def load_adult_dataset():

    
    default_mappings = {
        'label_maps': [{1.0: '>50K', 0.0: '<=50K'}],
        'protected_attribute_maps': [{1.0: 'White', 0.0: 'Non-white'},
                                     {1.0: 'Male', 0.0: 'Female'}]
        }
    dataset_orig = AdultDataset(label_name='income-per-year',
                 favorable_classes=['>50K', '>50K.'],
                 protected_attribute_names=['race', 'sex'],
                 privileged_classes=[['White'], ['Male']],
                 instance_weights_name=None,
                 categorical_features=['workclass', 'education',
                     'marital-status', 'occupation', 'relationship',
                     'native-country'],
                 features_to_keep=[], features_to_drop=['fnlwgt'],
                 na_values=['?'], custom_preprocessing=None,
                 metadata=default_mappings)

    
    return dataset_orig

def load_compas_dataset():
    

    default_mappings = {
        'label_maps': [{1.0: 'Did recid.', 0.0: 'No recid.'}],
        'protected_attribute_maps': [{0.0: 'Male', 1.0: 'Female'},
                                     {1.0: 'Caucasian', 0.0: 'Not Caucasian'}]
    }
    
    dataset_orig = CompasDataset(label_name='two_year_recid', favorable_classes=[0],
                 protected_attribute_names=['sex', 'race'],
                 privileged_classes=[['Female'], ['Caucasian']],
                 instance_weights_name=None,
                 categorical_features=['age_cat', 'c_charge_degree',
                     'c_charge_desc'],
                 features_to_keep=['sex', 'age', 'age_cat', 'race',
                     'juv_fel_count', 'juv_misd_count', 'juv_other_count',
                     'priors_count', 'c_charge_degree', 'c_charge_desc',
                     'two_year_recid'],
                 features_to_drop=[], na_values=[],
                 custom_preprocessing=default_preprocessing,
                 metadata=default_mappings)

    return dataset_orig
    

def default_preprocessing(df):
    """Perform the same preprocessing as the original analysis:
    https://github.com/propublica/compas-analysis/blob/master/Compas%20Analysis.ipynb
    """
    return df[(df.days_b_screening_arrest <= 30)
            & (df.days_b_screening_arrest >= -30)
            & (df.is_recid != -1)
            & (df.c_charge_degree != 'O')
            & (df.score_text != 'N/A')]



In [17]:
datasets = {'german':load_german_dataset(),
            'bank':load_bank_dataset(),
            'adult':load_adult_dataset(),
            'compas':load_compas_dataset()}

for name, loader in datasets.items():
    
    dataset_orig = loader
    print(name)
    
    dataset_orig_train, dataset_orig_test = dataset_orig.split([0.7], shuffle=True)

    X_train = dataset_orig_train.features
    y_train = dataset_orig_train.labels.ravel()

    X_test = dataset_orig_test.features
    y_test = dataset_orig_test.labels.ravel()

    models = [SVC(),DecisionTreeClassifier(),LogisticRegression()]

    for m in models:

        print('------------------ Sem pré-processamento -----------------')


        pipe = Pipeline([('scaler', StandardScaler()), ('model', m)])
        pipe.fit(X_train, y_train)
        y_test_pred = pipe.predict(X_test)

        dataset_orig_test_pred = dataset_orig_test.copy()
        dataset_orig_test_pred.labels = y_test_pred


        for attr in dataset_orig.protected_attribute_names:

            print(attr)

            idx = dataset_orig.protected_attribute_names.index(attr)
            privileged_groups =  [{attr:dataset_orig.privileged_protected_attributes[idx][0]}] 
            unprivileged_groups = [{attr:dataset_orig.unprivileged_protected_attributes[idx][0]}] 

            metrics = compute_metrics(dataset_orig_test, dataset_orig_test_pred, 
                                                  unprivileged_groups, privileged_groups,
                                                  disp = True)
            
#     break
        print('------------------ Com pré-processamento -----------------')
        
        attr = dataset_orig.protected_attribute_names[0]
        idx = dataset_orig.protected_attribute_names.index(attr)
        privileged_groups =  [{attr:dataset_orig.privileged_protected_attributes[idx][0]}] 
        unprivileged_groups = [{attr:dataset_orig.unprivileged_protected_attributes[idx][0]}] 
        

        pipe = Pipeline([('scaler', StandardScaler()),('rew',OptimPreproc()),('model', m)])

        pipe.fit(X_train, y_train)
        y_test_pred = pipe.predict(X_test)

        dataset_orig_test_pred = dataset_orig_test.copy()
        dataset_orig_test_pred.labels = y_test_pred

        metrics = compute_metrics(dataset_orig_test, dataset_orig_test_pred, 
                                              unprivileged_groups, privileged_groups,
                                              disp = True)
    
    break

    



german
------------------ Sem pré-processamento -----------------
age
Balanced accuracy = 0.6076
Statistical parity difference = -0.1131
Disparate impact = 0.8705
Average odds difference = -0.0731
Equal opportunity difference = -0.1203
Theil index = 0.1094
------------------ Com pré-processamento -----------------


TypeError: __init__() missing 2 required positional arguments: 'optimizer' and 'optim_options'

tentei com Reweighing(unprivileged_groups=unprivileged_groups,privileged_groups=privileged_groups), DisparateImpactRemover()

In [5]:
dataset_orig = load_german_dataset()

In [7]:
attr = dataset_orig.protected_attribute_names[0]
idx = dataset_orig.protected_attribute_names.index(attr)
privileged_groups =  [{attr:dataset_orig.privileged_protected_attributes[idx][0]}] 
unprivileged_groups = [{attr:dataset_orig.unprivileged_protected_attributes[idx][0]}] 

In [9]:
RW = Reweighing(unprivileged_groups=unprivileged_groups,
               privileged_groups=privileged_groups)
RW.fit(dataset_orig)
dataset_transf_train = RW.transform(dataset_orig)