In [1]:
import os

# Resolve the project root (parent of the start-up working directory) only once.
# Plain relative chdir calls would walk one directory further up every time this
# cell is re-executed; remembering the absolute path keeps the cell re-runnable.
if 'PROJECT_ROOT' not in globals():
    PROJECT_ROOT = os.path.abspath(os.pardir)

os.chdir(os.path.join(PROJECT_ROOT, 'scripts')) # cd into scripts dir so the local modules are importable
try:
    from scan import *
    from prep import *
finally:
    # cd back to parent dir even when an import fails, so the kernel is never
    # left stranded inside scripts/
    os.chdir(PROJECT_ROOT)

import pandas as pd, numpy as np

home_dir = "german_credit/" # set home dir for reading and storing data (relative to the project root)

# Read recipe inputs
german_mit_df = pd.read_csv(home_dir + "datasets/german_credit_prep.csv")
german_mit_df

Unnamed: 0.1,Unnamed: 0,under_25,sex,job,housing,savings,checking,credit_amt,duration,purpose,outcomes,proba_lr,proba_rf
0,0,False,Male,Multiple Jobs,Own,,Little,Moderate,Short,Radio/TV,0,0.114175,0.046250
1,1,True,Female,Multiple Jobs,Own,Little,Moderate,High,Very Long,Radio/TV,1,0.612611,0.803333
2,2,False,Male,1 Job,Own,Little,,Moderate,Moderate,Education,0,0.154985,0.092917
3,3,False,Male,Multiple Jobs,Free,Little,Little,High,Long,Furniture/Equipment,0,0.545431,0.141350
4,4,False,Male,Multiple Jobs,Free,Little,Little,High,Long,Car,1,0.574045,0.213719
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,False,Female,1 Job,Own,Little,,Moderate,Moderate,Furniture/Equipment,0,0.136923,0.122083
996,996,False,Male,Multiple Jobs,Own,Little,Little,High,Long,Car,0,0.517829,0.687929
997,997,False,Male,Multiple Jobs,Own,Little,,Low,Moderate,Radio/TV,0,0.135555,0.010000
998,998,True,Male,Multiple Jobs,Free,Little,Little,Moderate,Long,Radio/TV,1,0.550995,0.829405


Test running positive IJDI-Scan once.

In [3]:
# Column names handed to run_ijdi_scan for the one-off test scan
PROBA_CONFUSION_COL, PROBA_IJDI_COL = 'proba_lr', 'proba_rf'
OUTCOMES_COL = 'outcomes'

# Feature columns passed to the scan
FEATURES = [
    'under_25', 'sex', 'job', 'housing', 'savings',
    'checking', 'credit_amt', 'duration', 'purpose',
]

# Scan parameters: constant threshold value and lambda
THRESHOLD, LAMBDA_PARAM = 0.5, 1

# suppress warnings on chained assignment
pd.set_option('mode.chained_assignment', None)

In [4]:
# filter for only positive outcomes
positives_df = german_mit_df.loc[german_mit_df[OUTCOMES_COL] == 1]

# Scan with positive direction, using the constant THRESHOLD value.
# run_ijdi_scan may modify the frame it receives (TODO confirm against scan.py),
# so hand it a deep copy and keep positives_df untouched.
current_subset, current_score = run_ijdi_scan(positives_df.copy(deep=True), FEATURES,
                                              PROBA_CONFUSION_COL, PROBA_IJDI_COL, OUTCOMES_COL,
                                              THRESHOLD, LAMBDA_PARAM)

# display the returned subset (feature -> list of values) and its score
current_subset, current_score

Subset found on iteration 1 of 10 with score 39.15179810704687 :
{'savings': ['Little'], 'housing': ['Free', 'Rent'], 'checking': ['Little', 'Moderate'], 'duration': ['Long', 'Very Long']}
Best score is now 39.15179810704687
Subset found on iteration 2 of 10 with score 9.719249060808034 :
{'credit_amt': ['High', 'Low', 'Very High'], 'housing': ['Free', 'Own'], 'purpose': ['Domestic appliances', 'Education', 'Repairs', 'Vacation/Others'], 'checking': ['Little', 'Moderate']}
Current score of 9.719249060808034 does not beat best score of 39.15179810704687
Subset found on iteration 3 of 10 with score 4.207090592953986 :
{'duration': ['Long'], 'sex': ['Male'], 'credit_amt': ['Very High']}
Current score of 4.207090592953986 does not beat best score of 39.15179810704687
Subset found on iteration 4 of 10 with score 6.139320292657476 :
{'duration': ['Long', 'Moderate', 'Very Long'], 'housing': ['Free'], 'credit_amt': ['High', 'Very High'], 'checking': ['Moderate']}
Current score of 6.1393202926

({'purpose': ['Business', 'Car', 'Domestic appliances', 'Education'],
  'savings': ['Little'],
  'checking': ['Little'],
  'job': ['Multiple Jobs', 'None'],
  'duration': ['Long', 'Very Long']},
 44.27684623229658)

### Implement Mitigation Approach 2

Set lambda values to test.

In [5]:
# lambda values to sweep over in the mitigation loop
lambda_vals = [0, 0.3, 1, 3, 10]

# column names handed to the metric / IJDI-scan helpers
proba_confusion_col, proba_ijdi_col = 'proba_lr', 'proba_rf'
outcomes_col = 'outcomes'
features = [
    'under_25', 'sex', 'job', 'housing', 'savings',
    'checking', 'credit_amt', 'duration', 'purpose',
]
# name of the threshold column; the mitigation loop stores per-row thresholds there
threshold = 'threshold'

# stopping criteria: score limit and iteration bound used by the mitigation loop
stop_limit, max_iters = 0, 3

# suppress warnings on chained assignment
pd.set_option('mode.chained_assignment', None)

# fix the NumPy global random seed
np.random.seed(100)

In [6]:
# accumulator for result rows; the mitigation loop appends one
# [lambda, iteration, score, scan_type] list per scan
sim_data = list()

Implement approach for Positives

In [7]:
# Keep only the rows with a positive outcome. Take an explicit copy so that the
# threshold column added below is written to an independent frame rather than to
# a slice of german_mit_df (which is what triggers chained-assignment warnings).
positives_df = german_mit_df.loc[german_mit_df[outcomes_col] == 1].copy()

# Drop rows recorded by an earlier execution of this cell so that re-running it
# does not duplicate results in sim_data.
sim_data = [row for row in sim_data if row[3] != 'positive']

for lambda_param in lambda_vals: # run IJDI scan for various lambda values

    print("Lambda =", lambda_param)

    i = 0
    ijdi_present = True
    # reset to the initial threshold for every lambda value
    positives_df[threshold] = 0.5

    # NOTE: with `i <= max_iters` the loop performs up to max_iters + 1 scans per
    # lambda value (i is incremented inside the body).
    while ijdi_present and i <= max_iters:

        i += 1
        print("Iteration", i)

        # Run Positive IJDI Scan. Make sure to pass in copy because data may be modified by the function!
        current_subset, current_score = run_ijdi_scan(positives_df.copy(deep=True), features,
                                                      proba_confusion_col, proba_ijdi_col, outcomes_col,
                                                      threshold, lambda_param,
                                                      constant_threshold=False, verbose=True)

        print("Score for Positives IJDI-Scan:", current_score)

        # record [lambda, iteration, score, scan_type] for this scan
        row = [lambda_param, i, current_score, 'positive']
        sim_data.append(row)

        print(row)

        if current_score > stop_limit:

            # correct IJDI by setting new threshold for subset; the helper's
            # return value replaces the whole threshold column
            positives_df[threshold] = correct_ijdi_subset(positives_df.copy(deep=True), features,
                                                          proba_confusion_col, proba_ijdi_col, outcomes_col,
                                                          threshold, lambda_param, current_subset)
            print(positives_df[threshold])

        else:
            # score no longer exceeds stop_limit: stop iterating for this lambda
            ijdi_present = False

        print("\n----------------------------------------------------\n")

Lambda = 0
Iteration 1
Subset found on iteration 1 of 10 with score 31.531333329465813 :
{'purpose': ['Business', 'Car', 'Domestic appliances', 'Education', 'Repairs', 'Vacation/Others'], 'savings': ['Little', 'Moderate'], 'checking': ['Little', 'Moderate'], 'duration': ['Long', 'Very Long']}
Best score is now 31.531333329465813
Subset found on iteration 2 of 10 with score 37.90588600604126 :
{'savings': ['Little'], 'checking': ['Little', 'Moderate'], 'duration': ['Long', 'Very Long'], 'housing': ['Free', 'Rent']}
Best score is now 37.90588600604126
Subset found on iteration 3 of 10 with score 37.90588600604126 :
{'duration': ['Long', 'Very Long'], 'savings': ['Little'], 'checking': ['Little', 'Moderate'], 'housing': ['Free', 'Rent']}
Current score of 37.90588600604126 does not beat best score of 37.90588600604126
Subset found on iteration 4 of 10 with score 37.90588600604126 :
{'housing': ['Free', 'Rent'], 'duration': ['Long', 'Very Long'], 'checking': ['Little', 'Moderate'], 'savings

In [8]:
# Column labels, in the same order as the values of each row appended to sim_data
columns = ['lambda', 'iteration', 'score', 'scan_type']

# Collect all recorded scan rows into a single frame and display it
mit_result_df = pd.DataFrame(data=sim_data, columns=columns)
mit_result_df

Unnamed: 0,lambda,iteration,score,scan_type
0,0.0,1,37.905886,positive
1,0.0,2,23.094611,positive
2,0.0,3,20.511482,positive
3,0.0,4,19.777526,positive
4,0.0,5,16.176283,positive
...,...,...,...,...
82,10.0,5,0.492858,positive
83,10.0,6,13.415511,positive
84,10.0,7,0.873958,positive
85,10.0,8,1.851641,positive


In [9]:
# Write recipe outputs: persist the mitigation results as CSV under home_dir.
# NOTE(review): to_csv is called without index=False, so the row index is written
# as an extra unnamed column — presumably the origin of the "Unnamed: 0" columns
# seen in the input file; confirm downstream readers before changing this.
output_path = home_dir + "datasets/german_credit_mit_2_pos.csv"
mit_result_df.to_csv(output_path)