In [1]:
import os

# Switch the working directory before importing: presumably scan.py and prep.py
# live in ../scripts and are only importable from there (TODO confirm).
os.chdir('../scripts')

# NOTE(review): the star imports presumably supply the helpers that later cells call
# unqualified (generate_proba, generate_outcomes, run_ijdi_scan); which module
# defines each one is not visible from this notebook.
from scan import *
from prep import *
import pandas as pd, numpy as np

# Move to ../compas so the relative "datasets/..." path below resolves against it.
os.chdir('../compas')

# Read recipe inputs
compas_prep_df = pd.read_csv("datasets/compas_prep.csv")
compas_prep_df

Unnamed: 0.1,Unnamed: 0,sex,race,prior_offenses,under_25,charge_degree,outcomes,compas_risk_score,proba_compas,proba_lr
0,0,Male,Other,,False,F,0,1,0.213889,0.232134
1,1,Male,African-American,,False,F,1,3,0.376171,0.289774
2,2,Male,African-American,1 to 5,True,F,1,4,0.434330,0.668642
3,3,Male,African-American,1 to 5,True,F,0,8,0.683594,0.668642
4,4,Male,Other,1 to 5,False,F,0,1,0.213889,0.373653
...,...,...,...,...,...,...,...,...,...,...
7209,7209,Male,African-American,,True,F,0,7,0.591216,0.505581
7210,7210,Male,African-American,,True,F,0,3,0.376171,0.505581
7211,7211,Male,Other,,False,F,0,1,0.213889,0.232134
7212,7212,Female,African-American,1 to 5,False,M,0,2,0.311371,0.335035


### Prepare Base Dataset for Simulations

Select only covariates to prepare for simulating probabilities and outcomes.

In [2]:
# Exclude decile score from covariates
FEATURES = ['sex', 'race', 'under_25', 'charge_degree', 'prior_offenses']

# Take an explicit copy rather than a column slice of compas_prep_df: later cells
# add columns (in_subgroup, proba, outcomes) to compas_sim_df, and assigning into
# a slice is what makes pandas raise its chained-assignment warning.
compas_sim_df = compas_prep_df[FEATURES].copy()
compas_sim_df

Unnamed: 0,sex,race,under_25,charge_degree,prior_offenses
0,Male,Other,False,F,
1,Male,African-American,False,F,
2,Male,African-American,True,F,1 to 5
3,Male,African-American,True,F,1 to 5
4,Male,Other,False,F,1 to 5
...,...,...,...,...,...
7209,Male,African-American,True,F,
7210,Male,African-American,True,F,
7211,Male,Other,False,F,
7212,Female,African-American,False,M,1 to 5


Define subgroup for which to simulate altered probabilities and outcomes.

In [3]:
# Silence pandas' chained-assignment warning for the column assignments below.
pd.options.mode.chained_assignment = None

# Attribute values that define the subgroup whose probabilities and outcomes are altered.
subgroup = {'race':['African-American'], 'sex':['Male'], 'under_25':[False]}

# A row belongs to the subgroup only if every listed attribute takes one of its allowed values.
subgroup_membership = compas_sim_df[list(subgroup)].isin(subgroup)
compas_sim_df['in_subgroup'] = subgroup_membership.all(axis=1)

### Example of one Iteration for Base Case of the Simulation

Define the mean for the altered subgroup and for the remaining population, along with the parameter `k` that is passed to `generate_proba`.

In [4]:
# Means for the altered subgroup and for everyone else, in that order.
subgroup_mu, non_subgroup_mu = 0.51, 0.49

# Passed as the last argument to generate_proba in the next cell; its meaning is defined there.
k = 1

In [5]:
# Draw a probability for every row: generate_proba receives the row's subgroup flag
# together with the two means and k.
compas_sim_df['proba'] = compas_sim_df['in_subgroup'].apply(
    lambda is_member: generate_proba(is_member, subgroup_mu, non_subgroup_mu, k)
)

# Turn each drawn probability into an outcome via generate_outcomes.
compas_sim_df['outcomes'] = compas_sim_df['proba'].apply(
    lambda p: generate_outcomes(p)
)
compas_sim_df

Unnamed: 0,sex,race,under_25,charge_degree,prior_offenses,in_subgroup,proba,outcomes
0,Male,Other,False,F,,False,0.498077,1
1,Male,African-American,False,F,,True,0.504185,1
2,Male,African-American,True,F,1 to 5,False,0.491621,1
3,Male,African-American,True,F,1 to 5,False,0.490337,1
4,Male,Other,False,F,1 to 5,False,0.494247,1
...,...,...,...,...,...,...,...,...
7209,Male,African-American,True,F,,False,0.486596,1
7210,Male,African-American,True,F,,False,0.487274,0
7211,Male,Other,False,F,,False,0.490246,1
7212,Female,African-American,False,M,1 to 5,False,0.488437,0


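Generate variant metrics for only "truly negative-outcome" people (those who did NOT reoffend). Additionally, generate metrics required for IJDI scan (i.e. $\hat{p}$ calculation). Note: the metrics cell below is currently commented out, and the example scan that follows it is run on positive outcomes (`outcomes == 1`); the negative-outcome scan is performed in the "Simulations for Negatives" section.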

In [6]:
# Silence pandas' chained-assignment warning.
pd.options.mode.chained_assignment = None

# Columns holding the simulated probabilities and outcomes.
PROBA_COL, OUTCOMES_COL = 'proba', 'outcomes'

# Attributes handed to the example scan (note: 'prior_offenses' is not listed here).
FEATURES = ['sex', 'race', 'under_25', 'charge_degree']

# Threshold and lambda values passed to run_ijdi_scan in the example below.
THRESHOLD = 0.5
LAMBDA_PARAM = 49

In [7]:
# NOTE(review): this whole cell is commented out. As written it could not run at
# this point in the notebook: negatives_df is first assigned inside the simulation
# loop further down.
# metrics_df = generate_metrics(negatives_df, PROBA_COL, OUTCOMES_COL, THRESHOLD)

# print("Number of people in subgroup:", len(metrics_df[metrics_df['in_subgroup'] == True]))
# print("Number of people not in subgroup:", len(metrics_df[metrics_df['in_subgroup'] == False]))
# print("Total population:", len(metrics_df))

# # display preview of treatments, probabilities, and outcomes
# metrics_df.head(10)

In [8]:
# Keep only the rows whose simulated outcome equals 1 (positive outcomes).
positives_df = compas_sim_df.loc[compas_sim_df[OUTCOMES_COL].eq(1)]

# Scan with positive direction. PROBA_COL fills two consecutive argument positions,
# matching the call made in the simulation loop below.
# NOTE(review): unlike that loop, the frame is passed without .copy(), so the scan
# may modify positives_df.
current_subset, current_score = run_ijdi_scan(
    positives_df,
    FEATURES,
    PROBA_COL,
    PROBA_COL,
    OUTCOMES_COL,
    THRESHOLD,
    LAMBDA_PARAM,
)
# summarize_scan(positives_df, FEATURES, PROBA_COL, OUTCOMES_COL, current_subset, include='all')
(current_subset, current_score)

Subset found on iteration 1 of 10 with score 166.1437853686486 :
{'under_25': [False], 'race': ['African-American'], 'sex': ['Male']}
Best score is now 166.1437853686486
Subset found on iteration 2 of 10 with score 166.1437853686486 :
{'under_25': [False], 'sex': ['Male'], 'race': ['African-American']}
Current score of 166.1437853686486 does not beat best score of 166.1437853686486
Subset found on iteration 3 of 10 with score 166.1437853686486 :
{'sex': ['Male'], 'under_25': [False], 'race': ['African-American']}
Current score of 166.1437853686486 does not beat best score of 166.1437853686486
Subset found on iteration 4 of 10 with score 166.1437853686486 :
{'under_25': [False], 'sex': ['Male'], 'race': ['African-American']}
Current score of 166.1437853686486 does not beat best score of 166.1437853686486
Subset found on iteration 5 of 10 with score 166.1437853686486 :
{'sex': ['Male'], 'race': ['African-American'], 'under_25': [False]}
Current score of 166.1437853686486 does not beat be

({}, 0.0)

### Simulations for Negatives

Set mean values as well as ranges of standard deviation and lambda to iterate over for each simulation. Also define threshold and number of iterations to run for each set of parameters.

In [9]:
# set random seed
np.random.seed(100)

# suppress warnings on chained assignment
pd.options.mode.chained_assignment = None

# means for the altered subgroup and for the remaining population
subgroup_mu, non_subgroup_mu = 0.51, 0.49

# values of k and lambda to sweep, and how many simulations to run per k
k_vals = [0, 1, 3, 10]
lambda_vals = [0, 4, 8, 12, 16, 20, 42, 46, 50, 54, 80]
n_iters = 1

# specify parameters for generating metrics and IJDI scan
proba_col, outcomes_col = 'proba', 'outcomes'
features = ['sex', 'race', 'under_25', 'charge_degree', 'prior_offenses']
threshold = 0.5

In [None]:
# Accumulates one [k, lambda, iou, score] row per scan; turned into a DataFrame in the next cell.
sim_data = []

for k in k_vals:

    print("Parameter k =", k)

    for i in range(n_iters): # run n iterations for each k value

        print("Simulation", i+1, "of", n_iters)

        # generate probabilities from normal distribution (with edge boundary of 0 and 1) and outcomes from bernoulli random variable
        compas_sim_df[proba_col] = compas_sim_df['in_subgroup'].apply(lambda x : generate_proba(x, subgroup_mu, non_subgroup_mu, k))
        compas_sim_df[outcomes_col] = compas_sim_df[proba_col].apply(lambda x : generate_outcomes(x))

        # filter for only negative outcomes
        negatives_df = compas_sim_df.loc[compas_sim_df[outcomes_col] == 0]

        # Membership in the expected (injected) subgroup depends only on negatives_df,
        # not on lambda, so compute it once per simulation instead of once per scan.
        in_subgroup = negatives_df[list(subgroup.keys())].isin(subgroup).all(axis=1)

        for lambda_param in lambda_vals: # run IJDI scan for various lambda values

            print("Lambda =", lambda_param)

            # Run IJDI Scan. Make sure to pass in copy because data may be modified by the function!
            current_subset, current_score = run_ijdi_scan(negatives_df.copy(deep=True), features, proba_col, proba_col, outcomes_col,
                                                          threshold, lambda_param, verbose=True)

            # save iou and score
            if current_subset:
                in_current_subset = negatives_df[list(current_subset.keys())].isin(current_subset).all(axis=1)
                union_size = (in_subgroup | in_current_subset).sum()
                # Guard the division: an empty union would otherwise yield NaN rather than a usable IoU.
                iou = (in_subgroup & in_current_subset).sum() / union_size if union_size else 0.0
            else:
                # An empty detected subset overlaps nothing.
                iou = 0.0

            print("Detected subset intersection over union with expected subset:", iou)
            print("Detected subset score:", current_score)

            # append data
            sim_row = [k, lambda_param, iou, current_score]
            sim_data.append(sim_row)

            print(sim_row)
            print("\n----------------------------------------------------\n")

Parameter k = 0
Simulation 1 of 1
Lambda = 42
Subset found on iteration 1 of 10 with score 131.7906982402048 :
{'sex': ['Male'], 'race': ['African-American'], 'under_25': [False]}
Best score is now 131.7906982402048
Subset found on iteration 2 of 10 with score 131.7906982402048 :
{'sex': ['Male'], 'under_25': [False], 'race': ['African-American']}
Current score of 131.7906982402048 does not beat best score of 131.7906982402048
Subset found on iteration 3 of 10 with score 131.7906982402048 :
{'sex': ['Male'], 'race': ['African-American'], 'under_25': [False]}
Current score of 131.7906982402048 does not beat best score of 131.7906982402048
Subset found on iteration 4 of 10 with score 131.7906982402048 :
{'sex': ['Male'], 'under_25': [False], 'race': ['African-American']}
Current score of 131.7906982402048 does not beat best score of 131.7906982402048
Subset found on iteration 5 of 10 with score 131.7906982402048 :
{'under_25': [False], 'sex': ['Male'], 'race': ['African-American']}
Curre

In [None]:
# Assemble the collected rows into a table; the column order mirrors each
# sim_row built in the loop ([k, lambda_param, iou, current_score]).
columns = ['k', 'lambda', 'iou', 'score']
sim_result_df = pd.DataFrame(data=sim_data, columns=columns)
sim_result_df

In [None]:
# Write recipe outputs
# Save the simulation results table built in the previous cell. The path is relative
# to the working directory (../compas, set in the first cell), mirroring how the
# input CSV was read. The earlier version referenced mit_result_df and home_dir,
# neither of which is defined in this notebook.
sim_result_df.to_csv("datasets/compas_sim_1_neg.csv")