### Import Libraries

In [0]:
### Dataiku-Specific Start ###
import dataiku
from dataiku import pandasutils as pdu
from scan import *
from prep import *
from subset import *
### Dataiku-Specific End ###

import pandas as pd, numpy as np

In [0]:
### Colab-Specific Start ###
# from google.colab import drive
# drive.mount('/content/drive')
# from scan import *
# from prep import *
### Colab-Specific End ###

### Prepare Base Dataset for Mitigation

Read in the prepared COMPAS dataset as a dataframe.

In [0]:
# Load the prepared COMPAS dataset into the `compas_df` dataframe.
# Exactly one of the two platform-specific sections below should be active.

### Dataiku-Specific Start ###
# Read the Dataiku dataset named "compas_prep" and materialise it as a dataframe.
compas_prep = dataiku.Dataset("compas_prep")
compas_df = compas_prep.get_dataframe()
### Dataiku-Specific End ###

### Colab-Specific Start ###
# Alternative for Colab: read the same data from a CSV on the mounted Drive.
# dataset_path = "drive/MyDrive/IJDI/compas_prep.csv"
# compas_df = pd.read_csv(dataset_path)
### Colab-Specific End ###

In [0]:
# Show the loaded dataframe as the cell output for a quick visual check.
compas_df

### Mitigation Thresholds

a) 0.45 threshold for everyone (base case)
b) 0.5 threshold for Black individuals, 0.45 for everyone else (increased threshold for Black defendants)
c) 0.45 threshold for Black individuals, 0.4 for everyone else (decreased threshold for non-Black defendants)
d) 0.5 threshold for Black individuals, 0.4 for everyone else (increased threshold for Black defendants and decreased threshold for non-Black defendants)

In [0]:
# Mitigation settings keyed by label; each value is the pair
# (threshold for African-American rows, threshold for all other rows).
threshold_labels = ['a', 'b', 'c', 'd']
threshold_settings = {
    'a': (0.45, 0.45),
    'b': (0.5, 0.45),
    'c': (0.45, 0.4),
    'd': (0.5, 0.4),
}

# Add one 'threshold_<label>' column per setting, choosing the value row by
# row from the 'race' column.
is_african_american = compas_df['race'] == 'African-American'
for label in threshold_labels:
    setting = threshold_settings[label]
    aa_threshold, other_threshold = setting
    compas_df['threshold_' + label] = np.where(is_african_american,
                                               aa_threshold, other_threshold)
compas_df

In [0]:
# Column names used by the metric-generation cells below.
OUTCOMES_COL = 'outcomes'
PROBA_COL = 'proba_compas'
FEATURES = [
    'sex',
    'race',
    'under_25',
    'charge_degree',
    'prior_offenses',
]

# Silence pandas' chained-assignment warnings.
pd.set_option('mode.chained_assignment', None)

# Split the dataset by the value of the outcomes column (0 vs. 1).
negatives_df = compas_df.loc[compas_df[OUTCOMES_COL].eq(0)]
positives_df = compas_df.loc[compas_df[OUTCOMES_COL].eq(1)]

In [0]:
# Report FPR and TPR for African-American vs. all other defendants under each
# threshold setting.
#
# Rates are computed as sum('positives') / number of rows, on the
# negative-outcome rows (FPR) and the positive-outcome rows (TPR).
# NOTE(review): this assumes generate_metrics returns one row per individual
# with a 0/1 'positives' flag and a 'race' column - confirm against its definition.


def _positive_rate(metrics_df):
    """Return the sum of the 'positives' column divided by the row count.

    Returns NaN for an empty frame rather than dividing by zero.
    """
    n_rows = len(metrics_df)
    if n_rows == 0:
        return float('nan')
    return metrics_df['positives'].sum() / n_rows


for label in threshold_labels:
    threshold_col = 'threshold_' + label
    aa_threshold, other_threshold = threshold_settings[label]

    # Identify which setting the four rates below belong to; without this the
    # output is sixteen indistinguishable lines.
    print("Setting {}: thresholds African-Americans {}, All Other Races {}".format(
        label, aa_threshold, other_threshold))

    negative_metrics_df = generate_metrics(negatives_df, PROBA_COL, OUTCOMES_COL,
                                           threshold_col, constant_threshold=False)
    positive_metrics_df = generate_metrics(positives_df, PROBA_COL, OUTCOMES_COL,
                                           threshold_col, constant_threshold=False)

    # Split each metrics frame into African-American rows and all other rows.
    negative_is_aa = negative_metrics_df['race'] == 'African-American'
    positive_is_aa = positive_metrics_df['race'] == 'African-American'
    negative_metrics_aa_df = negative_metrics_df.loc[negative_is_aa]
    negative_metrics_other_df = negative_metrics_df.loc[~negative_is_aa]
    positive_metrics_aa_df = positive_metrics_df.loc[positive_is_aa]
    positive_metrics_other_df = positive_metrics_df.loc[~positive_is_aa]

    print("FPR of African-Americans", _positive_rate(negative_metrics_aa_df))
    print("FPR of non African-Americans", _positive_rate(negative_metrics_other_df))
    print("TPR of African-Americans", _positive_rate(positive_metrics_aa_df))
    print("TPR of non African-Americans", _positive_rate(positive_metrics_other_df))

### Mitigation Results

Set the lambda values to test for each case.

In [0]:
# Seed NumPy's global random generator so repeated runs start from the same state.
np.random.seed(100)

# Silence pandas' chained-assignment warnings.
pd.set_option('mode.chained_assignment', None)

# Scan settings: lambda values to try, the subset to score, and the number of
# repetitions per threshold setting.
n_iters = 1
lambda_vals = [0, 0.3, 1, 3, 10]
race_subset = {'race': ['African-American']}

# Column names and feature list passed to the metric generation and IJDI scan.
outcomes_col = 'outcomes'
proba_ijdi_col = 'proba_actual'
proba_confusion_col = 'proba_compas'
features = [
    'sex',
    'race',
    'under_25',
    'charge_degree',
    'prior_offenses',
]

In [0]:
# Score the African-American subset with the negative and positive IJDI scans
# for every threshold setting and every lambda value, collecting one result
# row per (setting, lambda, scan type) in `sim_data`.
sim_data = []

# The outcome split depends on neither the threshold setting, the iteration
# nor lambda, so it is computed once here instead of inside the loops. It uses
# the same `outcomes_col` that is passed to the scan below.
negatives_df = compas_df.loc[compas_df[outcomes_col] == 0]
positives_df = compas_df.loc[compas_df[outcomes_col] == 1]

for label in threshold_labels:

    setting = threshold_settings[label]  # (African-American threshold, other threshold)
    threshold = 'threshold_' + label     # name of the per-row threshold column
    print("Thresholds: African-Americans {}, All Other Races {}".format(setting[0], setting[1]))

    for i in range(n_iters):  # repeat the scans n_iters times for each threshold setting

        print("Iteration", i+1, "of", n_iters)

        for lambda_param in lambda_vals:  # run IJDI scan for various lambda values

            print("Lambda =", lambda_param)

            # Run Negative and Positive IJDI Scan. Pass deep copies because the
            # scoring function may modify the frame it is given.
            negative_score = score_current_subset_ijdi(negatives_df.copy(deep=True), features, proba_confusion_col, proba_ijdi_col, outcomes_col,
                                                       threshold, lambda_param, race_subset, constant_threshold=False, verbose=True)

            print("Score for Negative IJDI-Scan:", negative_score)

            positive_score = score_current_subset_ijdi(positives_df.copy(deep=True), features, proba_confusion_col, proba_ijdi_col, outcomes_col,
                                                       threshold, lambda_param, race_subset, constant_threshold=False, verbose=True)

            print("Score Positive IJDI-Scan:", positive_score)

            # Record one row per scan type:
            # [threshold_aa, threshold_other, lambda, score, scan_type]
            negative_row = [setting[0], setting[1], lambda_param, negative_score, 'negative']
            positive_row = [setting[0], setting[1], lambda_param, positive_score, 'positive']
            sim_data.append(negative_row)
            sim_data.append(positive_row)

            print(negative_row)
            print(positive_row)

            print("\n----------------------------------------------------\n")

In [0]:
# Tabulate the collected scan rows; the column order matches the row layout
# built in the scan loop.
columns = [
    'threshold_aa',
    'threshold_other',
    'lambda',
    'score',
    'scan_type',
]
mit_result_df = pd.DataFrame(data=sim_data, columns=columns)
mit_result_df

In [0]:
# Persist the mitigation results table (`mit_result_df`).
# Exactly one of the two platform-specific sections below should be active.

### Dataiku-Specific Start ###
# Write the results to the Dataiku dataset "compas_mit_1", setting its schema
# from the dataframe.
compas_mit_1 = dataiku.Dataset("compas_mit_1")
compas_mit_1.write_with_schema(mit_result_df)
### Dataiku-Specific End ###

### Colab-Specific Start ###
# NOTE(review): the previous commented-out version wrote an undefined
# `compas_prep_df` to "drive/MyDrive/IJDI/compas_prep.csv", i.e. the input
# file read at the top of this notebook. The lines below write the results
# frame to a separate file instead; the output filename mirrors the Dataiku
# dataset name and is an assumption - confirm before enabling.
# dataset_path = "drive/MyDrive/IJDI/compas_mit_1.csv"
# mit_result_df.to_csv(dataset_path)
### Colab-Specific End ###