### Import Libraries

In [None]:
### Dataiku-Specific Start ###
import dataiku
from dataiku import pandasutils as pdu
from scan import *
from prep import *
### Dataiku-Specific End ###

import pandas as pd, numpy as np

In [None]:
### Colab-Specific Start ###
# from google.colab import drive
# drive.mount('/content/drive')
# from scan import *
# from prep import *
### Colab-Specific End ###

### Prepare Base Dataset for Mitigation

Read in the prepared German Credit dataset as a dataframe.

In [None]:
### Dataiku-Specific Start ###
# Load the "german_credit_prep" Dataiku dataset and materialize it as a dataframe.
german_credit_prep = dataiku.Dataset("german_credit_prep")
german_prep_df = german_credit_prep.get_dataframe()
### Dataiku-Specific End ###

### Colab-Specific Start ###
# NOTE(review): this disabled Colab variant still reads the COMPAS CSV into
# `compas_df`, while the rest of this notebook works on `german_prep_df`.
# Update the path and the variable name before re-enabling it.
# dataset_path = "drive/MyDrive/IJDI/compas_prep.csv"
# compas_df = pd.read_csv(dataset_path)
### Colab-Specific End ###

Use the full prepared dataframe as the base for mitigation. No column selection is applied here; the covariates used by the scan are listed in the feature lists below.

In [None]:
# Base dataframe for the mitigation experiments. This is a plain alias of
# german_prep_df (no copy, no column filtering): both names refer to the
# same object.
german_mit_df = german_prep_df
german_mit_df  # bare expression: shown as the cell output

In [None]:
# specify probability and outcomes columns
# NOTE(review): in the visible part of this notebook these upper-case constants
# are referenced only by the commented-out scan that follows; the mitigation
# cells further down use lower-case variables holding the same values.
PROBA_CONFUSION_COL = 'proba_lr'  # presumably logistic-regression probabilities -- confirm
PROBA_IJDI_COL = 'proba_rf'  # presumably random-forest probabilities -- confirm
OUTCOMES_COL = 'outcomes'  # column compared against 0 / 1 when filtering rows
# columns handed to the scan as the features to search over
FEATURES = ['under_25', 'sex', 'job', 'housing', 'savings', 'checking',
            'credit_amt', 'duration', 'purpose']
THRESHOLD = 0.5  # single constant threshold value
LAMBDA_PARAM = 1  # lambda value passed to the scan
pd.options.mode.chained_assignment = None  # suppress warnings on chained assignment

In [None]:
# # filter for only negative outcomes
# negatives_df = german_mit_df.loc[german_mit_df[OUTCOMES_COL] == 0]

# # scan with positive direction
# current_subset, current_score = run_ijdi_scan(negatives_df, FEATURES, PROBA_CONFUSION_COL, PROBA_IJDI_COL, OUTCOMES_COL,
#                                               THRESHOLD, LAMBDA_PARAM)
# # summarize_scan(negatives_df, FEATURES, PROBA_COL, OUTCOMES_COL, current_subset, include='all')\
# current_subset, current_score

### Implement Mitigation Approach 2

Set lambda values to test.

In [None]:
# lambda values to sweep; each one gets its own scan/correct loop below
lambda_vals = [0, 0.3, 1, 3, 10]
# n_iters = 1

# specify parameters for generating metrics and IJDI scan
proba_confusion_col = 'proba_lr'
proba_ijdi_col = 'proba_rf'
outcomes_col = 'outcomes'
features = ['under_25', 'sex', 'job', 'housing', 'savings', 'checking',
            'credit_amt', 'duration', 'purpose']
# Name of a dataframe column (not a numeric value): the mitigation cells create
# a 'threshold' column and pass this name to the scan with constant_threshold=False.
threshold = 'threshold'

# define stopping criteria
stop_limit = 0  # a scan score at or below this value ends the loop for the current lambda
max_iters = 5  # iteration bound checked by the mitigation loop(s) below


pd.options.mode.chained_assignment = None  # suppress warnings on chained assignment

# set random seed
# (seeds NumPy's global random number generator so runs are repeatable)
np.random.seed(100)

In [None]:
# Accumulator for scan results: the mitigation cells append one
# [lambda, iteration, score, scan_type] row per scan iteration, and the list
# is converted into a dataframe at the end of the notebook.
sim_data = []

Implement approach for negatives (rows whose outcome is 0)

In [None]:
# Keep only the rows with a negative outcome (outcome == 0). Take an explicit
# copy so that adding the threshold column below writes to an independent
# dataframe instead of relying on the globally suppressed chained-assignment
# warning.
negatives_df = german_mit_df.loc[german_mit_df[outcomes_col] == 0].copy()

for lambda_param in lambda_vals:  # run IJDI scan for various lambda values

    print("Lambda =", lambda_param)

    i = 0
    ijdi_present = True
    # reset every row to the initial threshold for this lambda value
    negatives_df[threshold] = 0.5

    # Alternate scan -> correct until the scan score is no longer above
    # stop_limit, for at most max_iters rounds. The comparison is strict
    # because i is incremented at the top of the body; `i <= max_iters` would
    # run max_iters + 1 rounds.
    while ijdi_present and i < max_iters:

        i += 1
        print("Iteration", i)

        # Run the negative IJDI scan. Pass in a copy because data may be modified by the function!
        current_subset, current_score = run_ijdi_scan(negatives_df.copy(deep=True), features, proba_confusion_col, proba_ijdi_col, outcomes_col,
                                                      threshold, lambda_param, constant_threshold=False, verbose=True)

        print("Score for Negative IJDI-Scan:", current_score)

        # record [lambda, iteration, score, scan type] for the results table
        row = [lambda_param, i, current_score, 'negative']
        sim_data.append(row)

        print(row)

        if current_score > stop_limit:

            # correct IJDI by setting new threshold for subset; the returned
            # values replace the whole threshold column
            negatives_df[threshold] = correct_ijdi_subset(negatives_df.copy(deep=True), features,
                                                          proba_confusion_col, proba_ijdi_col, outcomes_col,
                                                          threshold, lambda_param, current_subset)
            print(negatives_df[threshold])

        else:
            # score at or below stop_limit: stop iterating for this lambda
            ijdi_present = False

        print("\n----------------------------------------------------\n")

Implement approach for positives (rows whose outcome is 1)

In [None]:
# Keep only the rows with a positive outcome (outcome == 1). Take an explicit
# copy so that adding the threshold column below writes to an independent
# dataframe instead of relying on the globally suppressed chained-assignment
# warning.
positives_df = german_mit_df.loc[german_mit_df[outcomes_col] == 1].copy()

for lambda_param in lambda_vals:  # run IJDI scan for various lambda values

    print("Lambda =", lambda_param)

    i = 0
    ijdi_present = True
    # reset every row to the initial threshold for this lambda value
    positives_df[threshold] = 0.5

    # Alternate scan -> correct until the scan score is no longer above
    # stop_limit, for at most max_iters rounds. This uses the configured
    # max_iters rather than a hard-coded literal so the stopping criteria are
    # defined in one place; the comparison is strict because i is incremented
    # at the top of the body.
    # NOTE(review): this cell previously used a literal bound of 20 -- raise
    # max_iters if the positives scan needs more rounds to converge.
    while ijdi_present and i < max_iters:

        i += 1
        print("Iteration", i)

        # Run the positive IJDI scan. Pass in a copy because data may be modified by the function!
        current_subset, current_score = run_ijdi_scan(positives_df.copy(deep=True), features, proba_confusion_col, proba_ijdi_col, outcomes_col,
                                                      threshold, lambda_param, constant_threshold=False, verbose=True)

        print("Score for Positives IJDI-Scan:", current_score)

        # record [lambda, iteration, score, scan type] for the results table
        row = [lambda_param, i, current_score, 'positive']
        sim_data.append(row)

        print(row)

        if current_score > stop_limit:

            # correct IJDI by setting new threshold for subset; the returned
            # values replace the whole threshold column
            positives_df[threshold] = correct_ijdi_subset(positives_df.copy(deep=True), features,
                                                          proba_confusion_col, proba_ijdi_col, outcomes_col,
                                                          threshold, lambda_param, current_subset)
            print(positives_df[threshold])

        else:
            # score at or below stop_limit: stop iterating for this lambda
            ijdi_present = False

        print("\n----------------------------------------------------\n")

In [None]:
# Turn the per-iteration records collected in sim_data into a dataframe;
# the column order matches the order in which each row was built.
columns = ['lambda', 'iteration', 'score', 'scan_type']
mit_result_df = pd.DataFrame(sim_data, columns=columns)
mit_result_df  # bare expression: shown as the cell output

In [None]:
# Write recipe outputs
# Persist the mitigation results table to the "german_credit_mit_2" Dataiku
# dataset; write_with_schema presumably also sets the dataset schema from the
# dataframe -- confirm against the Dataiku API docs.
german_credit_mit_2 = dataiku.Dataset("german_credit_mit_2")
german_credit_mit_2.write_with_schema(mit_result_df)