In [1]:
import sys
sys.path.insert(1, './scripts')

from scan import *
from prep import *
import pandas as pd, numpy as np

# Read recipe inputs
german_prep_df = pd.read_csv("datasets/german_credit_prep.csv")
german_prep_df

Unnamed: 0.1,Unnamed: 0,under_25,sex,job,housing,savings,checking,credit_amt,duration,purpose,outcomes,proba_lr,proba_rf
0,0,False,Male,Multiple Jobs,Own,,Little,Moderate,Short,Radio/TV,0,0.114175,0.046250
1,1,True,Female,Multiple Jobs,Own,Little,Moderate,High,Very Long,Radio/TV,1,0.612611,0.803333
2,2,False,Male,1 Job,Own,Little,,Moderate,Moderate,Education,0,0.154985,0.092917
3,3,False,Male,Multiple Jobs,Free,Little,Little,High,Long,Furniture/Equipment,0,0.545431,0.141350
4,4,False,Male,Multiple Jobs,Free,Little,Little,High,Long,Car,1,0.574045,0.213719
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,False,Female,1 Job,Own,Little,,Moderate,Moderate,Furniture/Equipment,0,0.136923,0.122083
996,996,False,Male,Multiple Jobs,Own,Little,Little,High,Long,Car,0,0.517829,0.687929
997,997,False,Male,Multiple Jobs,Own,Little,,Low,Moderate,Radio/TV,0,0.135555,0.010000
998,998,True,Male,Multiple Jobs,Free,Little,Little,Moderate,Long,Radio/TV,1,0.550995,0.829405


Select only covariates to prepare for simulating probabilities and outcomes.

In [2]:
german_mit_df = german_prep_df
german_mit_df

Unnamed: 0.1,Unnamed: 0,under_25,sex,job,housing,savings,checking,credit_amt,duration,purpose,outcomes,proba_lr,proba_rf
0,0,False,Male,Multiple Jobs,Own,,Little,Moderate,Short,Radio/TV,0,0.114175,0.046250
1,1,True,Female,Multiple Jobs,Own,Little,Moderate,High,Very Long,Radio/TV,1,0.612611,0.803333
2,2,False,Male,1 Job,Own,Little,,Moderate,Moderate,Education,0,0.154985,0.092917
3,3,False,Male,Multiple Jobs,Free,Little,Little,High,Long,Furniture/Equipment,0,0.545431,0.141350
4,4,False,Male,Multiple Jobs,Free,Little,Little,High,Long,Car,1,0.574045,0.213719
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,False,Female,1 Job,Own,Little,,Moderate,Moderate,Furniture/Equipment,0,0.136923,0.122083
996,996,False,Male,Multiple Jobs,Own,Little,Little,High,Long,Car,0,0.517829,0.687929
997,997,False,Male,Multiple Jobs,Own,Little,,Low,Moderate,Radio/TV,0,0.135555,0.010000
998,998,True,Male,Multiple Jobs,Free,Little,Little,Moderate,Long,Radio/TV,1,0.550995,0.829405


In [3]:
# specify probability and outcomes columns
PROBA_CONFUSION_COL = 'proba_lr'
PROBA_IJDI_COL = 'proba_rf'
OUTCOMES_COL = 'outcomes'
FEATURES = ['under_25', 'sex', 'job', 'housing', 'savings', 'checking',
            'credit_amt', 'duration', 'purpose']
THRESHOLD = 0.5
LAMBDA_PARAM = 1
pd.options.mode.chained_assignment = None  # suppress warnings on chained assignment

In [4]:
# # filter for only negative outcomes
# negatives_df = german_mit_df.loc[german_mit_df[OUTCOMES_COL] == 0]

# # scan with positive direction
# current_subset, current_score = run_ijdi_scan(negatives_df, FEATURES, PROBA_CONFUSION_COL, PROBA_IJDI_COL, OUTCOMES_COL,
#                                               THRESHOLD, LAMBDA_PARAM)
# # summarize_scan(negatives_df, FEATURES, PROBA_COL, OUTCOMES_COL, current_subset, include='all')\
# current_subset, current_score

### Implement Mitigation Approach 2

Set lambda vales to test.

In [5]:
lambda_vals = [0, 0.3, 1, 3, 10]
# n_iters = 1

# specify parameters for generating metrics and IJDI scan
proba_confusion_col = 'proba_lr'
proba_ijdi_col = 'proba_rf'
outcomes_col = 'outcomes'
features = ['under_25', 'sex', 'job', 'housing', 'savings', 'checking',
            'credit_amt', 'duration', 'purpose']
threshold = 'threshold'

# define stopping criteria
stop_limit = 0
max_iters = 20

pd.options.mode.chained_assignment = None  # suppress warnings on chained assignment

# set random seed
np.random.seed(100)

In [6]:
sim_data = []

Implement approach for negatives

In [7]:
# filter for negative or positive outcomes
negatives_df = german_mit_df.loc[german_mit_df[outcomes_col] == 0]

for lambda_param in lambda_vals: # run IJDI scan for various lambda values

    print("Lambda =", lambda_param)

    i = 0
    ijdi_present = True
    # set initial threshold
    negatives_df['threshold'] = 0.5

    while ijdi_present and i <= max_iters:

        i += 1
        print("Iteration", i)

        # Run Negative or Positive IJDI Scan. Make sure to pass in copy because data may be modified by the function!
        current_subset, current_score = run_ijdi_scan(negatives_df.copy(deep=True), features, proba_confusion_col, proba_ijdi_col, outcomes_col,
                                                      threshold, lambda_param, constant_threshold=False, verbose=True)

        print("Score for Negative IJDI-Scan:", current_score)

        # append data
        row = [lambda_param, i, current_score, 'negative']
        sim_data.append(row)

        print(row)

        if current_score > stop_limit:

            # correct IJDI by setting new threshold for subset
            negatives_df['threshold'] = correct_ijdi_subset(negatives_df.copy(deep=True), features,
                                                            proba_confusion_col, proba_ijdi_col, outcomes_col,
                                                            threshold, lambda_param, current_subset)
            print(negatives_df['threshold'])

        else:
            ijdi_present = False

        print("\n----------------------------------------------------\n")

Lambda = 0
Iteration 1
Subset found on iteration 1 of 10 with score 47.03018853370144 :
{'purpose': ['Business', 'Car', 'Education', 'Furniture/Equipment', 'Radio/TV'], 'duration': ['Long', 'Moderate', 'Very Long'], 'job': ['1 Job', 'Multiple Jobs'], 'checking': ['Little', 'Moderate'], 'savings': ['Little', 'Moderate'], 'housing': ['Free', 'Rent']}
Best score is now 47.03018853370144
Subset found on iteration 2 of 10 with score 53.50569210408128 :
{'checking': ['Little'], 'savings': ['Little', 'Moderate'], 'duration': ['Long', 'Moderate', 'Very Long']}
Best score is now 53.50569210408128
Subset found on iteration 3 of 10 with score 46.86065186046361 :
{'duration': ['Long', 'Very Long'], 'checking': ['Little', 'Moderate'], 'savings': ['Little', 'Moderate']}
Current score of 46.86065186046361 does not beat best score of 53.50569210408128
Subset found on iteration 4 of 10 with score 47.03018853370144 :
{'purpose': ['Business', 'Car', 'Education', 'Furniture/Equipment', 'Radio/TV'], 'job':

In [8]:
columns = ['lambda', 'iteration', 'score', 'scan_type']
mit_result_df = pd.DataFrame(sim_data, columns=columns)
mit_result_df

Unnamed: 0,lambda,iteration,score,scan_type
0,0.0,1,53.505692,negative
1,0.0,2,27.776997,negative
2,0.0,3,21.257761,negative
3,0.0,4,14.793836,negative
4,0.0,5,11.415168,negative
...,...,...,...,...
60,10.0,2,14.453913,negative
61,10.0,3,5.773746,negative
62,10.0,4,3.364434,negative
63,10.0,5,0.201712,negative


In [9]:
# Write recipe outputs
german_df.to_csv("datasets/german_credit_mit_2_neg.csv")

NameError: name 'german_df' is not defined