In [1]:
import os

os.chdir('../scripts') # cd into scripts dir

from scan import *
from prep import *
import pandas as pd, numpy as np

os.chdir('../') # cd back to parent dir
home_dir = "compas/" # set home dir for reading and storing data

# Read recipe inputs
compas_mit_df = pd.read_csv(home_dir + "datasets/compas_prep.csv")
compas_mit_df

Unnamed: 0.1,Unnamed: 0,sex,race,prior_offenses,under_25,charge_degree,outcomes,compas_risk_score,proba_compas,proba_lr
0,0,Male,Other,,False,F,0,1,0.213889,0.232134
1,1,Male,African-American,,False,F,1,3,0.376171,0.289774
2,2,Male,African-American,1 to 5,True,F,1,4,0.434330,0.668642
3,3,Male,African-American,1 to 5,True,F,0,8,0.683594,0.668642
4,4,Male,Other,1 to 5,False,F,0,1,0.213889,0.373653
...,...,...,...,...,...,...,...,...,...,...
7209,7209,Male,African-American,,True,F,0,7,0.591216,0.505581
7210,7210,Male,African-American,,True,F,0,3,0.376171,0.505581
7211,7211,Male,Other,,False,F,0,1,0.213889,0.232134
7212,7212,Female,African-American,1 to 5,False,M,0,2,0.311371,0.335035


Test running positive IJDI-Scan once.

In [2]:
PROBA_CONFUSION_COL = 'proba_compas'
PROBA_IJDI_COL = 'proba_lr'
OUTCOMES_COL = 'outcomes'
FEATURES = ['sex', 'race', 'under_25', 'charge_degree', 'prior_offenses']
THRESHOLD = 0.5
LAMBDA_PARAM = 1
pd.options.mode.chained_assignment = None  # suppress warnings on chained assignment

In [3]:
# filter for only positive outcomes
positives_df = compas_mit_df.loc[compas_mit_df[OUTCOMES_COL] == 1]

# scan with positive direction
current_subset, current_score = run_ijdi_scan(positives_df, FEATURES, PROBA_CONFUSION_COL, PROBA_IJDI_COL, OUTCOMES_COL,
                                              THRESHOLD, LAMBDA_PARAM)
# summarize_scan(positives_df, FEATURES, PROBA_COL, OUTCOMES_COL, current_subset, include='all')\
current_subset, current_score

Subset found on iteration 1 of 10 with score 23.79444568654959 :
{'race': ['African-American', 'Native American'], 'prior_offenses': ['Over 5']}
Best score is now 23.79444568654959
Subset found on iteration 2 of 10 with score 23.79444568654959 :
{'race': ['African-American', 'Native American'], 'prior_offenses': ['Over 5']}
Current score of 23.79444568654959 does not beat best score of 23.79444568654959
Subset found on iteration 3 of 10 with score 1.9769395249199533 :
{'under_25': [True], 'charge_degree': ['M'], 'prior_offenses': ['Over 5']}
Current score of 1.9769395249199533 does not beat best score of 23.79444568654959
Subset found on iteration 4 of 10 with score 7.708451583477554 :
{'race': ['African-American', 'Caucasian', 'Native American'], 'charge_degree': ['F'], 'sex': ['Female']}
Current score of 7.708451583477554 does not beat best score of 23.79444568654959
Subset found on iteration 5 of 10 with score 23.79444568654959 :
{'race': ['African-American', 'Native American'], 'pr

({'race': ['African-American', 'Native American'],
  'prior_offenses': ['Over 5']},
 23.79444568654959)

### Implement Mitigation Approach 2

Set lambda vales to test.

In [4]:
lambda_vals = [0, 0.3, 1, 3, 10]

# specify parameters for generating metrics and IJDI scan
proba_confusion_col = 'proba_compas'
proba_ijdi_col = 'proba_lr'
outcomes_col = 'outcomes'
features = ['sex', 'race', 'under_25', 'charge_degree', 'prior_offenses']
threshold = 'threshold'

# define stopping criteria
stop_limit = 0
max_iters = 3

pd.options.mode.chained_assignment = None  # suppress warnings on chained assignment

# set random seed
np.random.seed(100)

In [5]:
sim_data = []

Implement approach for Positives

In [6]:
# filter for negative or positive outcomes
positives_df = compas_mit_df.loc[compas_mit_df[outcomes_col] == 1]

for lambda_param in lambda_vals: # run IJDI scan for various lambda values

    print("Lambda =", lambda_param)

    i = 0
    ijdi_present = True
    # set initial threshold
    positives_df['threshold'] = 0.5

    while ijdi_present and i <= max_iters:

        i += 1
        print("Iteration", i)

        # Run Negative or Positive IJDI Scan. Make sure to pass in copy because data may be modified by the function!
        current_subset, current_score = run_ijdi_scan(positives_df.copy(deep=True), features, proba_confusion_col, proba_ijdi_col, outcomes_col,
                                                      threshold, lambda_param, constant_threshold=False, verbose=True)

        print("Score for Positives IJDI-Scan:", current_score)

        # append data
        row = [lambda_param, i, current_score, 'positive']
        sim_data.append(row)

        print(row)

        if current_score > stop_limit:

            # correct IJDI by setting new threshold for subset
            positives_df['threshold'] = correct_ijdi_subset(positives_df.copy(deep=True), features,
                                                            proba_confusion_col, proba_ijdi_col, outcomes_col,
                                                            threshold, lambda_param, current_subset)
            print(positives_df['threshold'])

        else:
            ijdi_present = False

        print("\n----------------------------------------------------\n")

Lambda = 0
Iteration 1
Subset found on iteration 1 of 10 with score 146.34697806274693 :
{'prior_offenses': ['Over 5'], 'race': ['African-American', 'Native American']}
Best score is now 146.34697806274693
Subset found on iteration 2 of 10 with score 146.34697806274693 :
{'race': ['African-American', 'Native American'], 'prior_offenses': ['Over 5']}
Current score of 146.34697806274693 does not beat best score of 146.34697806274693
Subset found on iteration 3 of 10 with score 146.34697806274693 :
{'prior_offenses': ['Over 5'], 'race': ['African-American', 'Native American']}
Current score of 146.34697806274693 does not beat best score of 146.34697806274693
Subset found on iteration 4 of 10 with score 146.34697806274693 :
{'prior_offenses': ['Over 5'], 'race': ['African-American', 'Native American']}
Current score of 146.34697806274693 does not beat best score of 146.34697806274693
Subset found on iteration 5 of 10 with score 146.34697806274693 :
{'prior_offenses': ['Over 5'], 'race': ['

In [7]:
columns = ['lambda', 'iteration', 'score', 'scan_type']
mit_result_df = pd.DataFrame(sim_data, columns=columns)
mit_result_df

Unnamed: 0,lambda,iteration,score,scan_type
0,0.0,1,146.346978,positive
1,0.0,2,72.243374,positive
2,0.0,3,42.166561,positive
3,0.0,4,22.575105,positive
4,0.3,1,99.86068,positive
5,0.3,2,49.666885,positive
6,0.3,3,29.740165,positive
7,0.3,4,18.863771,positive
8,1.0,1,23.794446,positive
9,1.0,2,9.018073,positive


In [8]:
# Write recipe outputs
mit_result_df.to_csv(home_dir + "datasets/compas_mit_2_pos.csv")