In [1]:
import numpy as np
import utils
import pandas as pd

In [2]:
synth_regular = utils.load_synthetic()
compas = utils.load_compas()
RESPONSE = "score_text"

In [3]:
synth_regular["train"].keys()

Index(['score_text', 'length_of_stay', 'two_year_recid', 'is_Caucasian',
       'c_charge_degree_M', 'age_greater_than_45', 'age_less_than_25',
       'sex_Male'],
      dtype='object')

In [4]:
compas["standard"]["train"].keys()

Index(['two_year_recid', 'length_of_stay', 'is_Caucasian', 'c_charge_degree_M',
       'age_cat_Greater than 45', 'age_cat_Less than 25', 'sex_Male',
       'score_text'],
      dtype='object')

In [5]:
#Checking that the classes don't become too much more imbalanced when data is removed.

#Synthetic data
for key, value in synth_regular.items():
    #print(value.columns)
    print("Key: ", key)
    print("Caucasian = 1: ",len(value[value["is_Caucasian"]==1])/len(value) ,
    "\nCaucasian = 0: ", len(value[value["is_Caucasian"]==0])/len(value))
    print("Male = 1: ",len(value[value["sex_Male"]==1])/len(value) ,
    "\nMale = 0: ", len(value[value["sex_Male"]==0])/len(value))
    print("After 30% missing: ")
    temp = value.copy()
    temp = utils.impute(utils.data_remover_cat(temp, "is_Caucasian", 30), "is_Caucasian")
    print("Caucasian = 1: ",len(temp[temp["is_Caucasian"]==1])/len(temp) ,
        "\nCaucasian = 0: ", len(temp[temp["is_Caucasian"]==0])/len(temp))
    print("Male = 1: ",len(temp[temp["sex_Male"]==1])/len(temp) ,
        "\nMale = 0: ", len(temp[temp["sex_Male"]==0])/len(temp))

Key:  test
Caucasian = 1:  0.3408408408408408 
Caucasian = 0:  0.6591591591591591
Male = 1:  0.8203203203203203 
Male = 0:  0.17967967967967968
After 30% missing: 
Caucasian = 1:  0.3333333333333333 
Caucasian = 0:  0.6666666666666666
Male = 1:  0.8454935622317596 
Male = 0:  0.15450643776824036
Key:  train
Caucasian = 1:  0.34182908545727136 
Caucasian = 0:  0.6581709145427287
Male = 1:  0.8213393303348325 
Male = 0:  0.17866066966516742
After 30% missing: 
Caucasian = 1:  0.33785714285714286 
Caucasian = 0:  0.6621428571428571
Male = 1:  0.8203571428571429 
Male = 0:  0.17964285714285713


In [6]:
#Compas data
for ver in ["standard", "violent"]:
    print("Compas dataset type: ", ver)
    for key, value in compas[ver].items():
        #print(value.columns)
        print("Key: ", key)
        print("Caucasian = 1: ",len(value[value["is_Caucasian"]==1])/len(value) ,
        "\nCaucasian = 0: ", len(value[value["is_Caucasian"]==0])/len(value))
        print("Male = 1: ",len(value[value["sex_Male"]==1])/len(value) ,
        "\nMale = 0: ", len(value[value["sex_Male"]==0])/len(value))
        print("Score text: ", len(value[value["score_text"]==1])/len(value))
        print("After 30% missing: ")
        temp = value.copy()
        temp = utils.impute(utils.data_remover_cat(temp, "is_Caucasian", 30), "is_Caucasian")
        print("Caucasian = 1: ",len(temp[temp["is_Caucasian"]==1])/len(temp) ,
            "\nCaucasian = 0: ", len(temp[temp["is_Caucasian"]==0])/len(temp))
        print("Male = 1: ",len(temp[temp["sex_Male"]==1])/len(temp) ,
            "\nMale = 0: ", len(temp[temp["sex_Male"]==0])/len(temp))
        print("Score text: ", len(value[value["score_text"]==1])/len(value))


Compas dataset type:  standard
Key:  train
Caucasian = 1:  0.3388149939540508 
Caucasian = 0:  0.6611850060459492
Male = 1:  0.8099153567110037 
Male = 0:  0.19008464328899638
Score text:  0.8169286577992745
After 30% missing: 
Caucasian = 1:  0.32894736842105265 
Caucasian = 0:  0.6710526315789473
Male = 1:  0.7932825484764543 
Male = 0:  0.2067174515235457
Score text:  0.8169286577992745
Key:  test
Caucasian = 1:  0.3446244477172312 
Caucasian = 0:  0.6553755522827688
Male = 1:  0.8090328915071183 
Male = 0:  0.19096710849288168
Score text:  0.8100147275405007
After 30% missing: 
Caucasian = 1:  0.3368421052631579 
Caucasian = 0:  0.6631578947368421
Male = 1:  0.7922807017543859 
Male = 0:  0.20771929824561403
Score text:  0.8100147275405007
Compas dataset type:  violent
Key:  train
Caucasian = 1:  0.36390642406238394 
Caucasian = 0:  0.6360935759376161
Male = 1:  0.7875974749350168 
Male = 0:  0.21240252506498328
Score text:  0.9324173783884144
After 30% missing: 
Caucasian = 1:  0.

In [7]:
percentiles = [p for p in range(0,20,2)]+[p for p in range(20,50, 10)]
missing=["two_year_recid"]#, "is_Caucasian", "sex_Male",	"c_charge_degree_M"]
all_results = {}
for miss in missing:
    for sensitive in ["is_Caucasian"]:#, "sex_Male"]:
        synth_results = utils.test_bench(train = synth_regular["train"],test = synth_regular["test"], pred = RESPONSE, missing = miss, sensitive=sensitive,
                        percentiles = percentiles)
        recid_results = utils.test_bench(train = compas["standard"]["train"],test = compas["standard"]["test"], pred = RESPONSE, missing = miss, sensitive=sensitive,
                        percentiles = percentiles)
        v_recid_results = utils.test_bench(train = compas["standard"]["train"],test = compas["standard"]["test"], pred = RESPONSE, missing = miss, sensitive=sensitive,
                        percentiles = percentiles)
        all_results[miss+"_"+sensitive+"_"+"synth"] = synth_results 
        all_results[miss+"_"+sensitive+"_"+"recid"] = recid_results 
        all_results[miss+"_"+sensitive+"_"+"v_recid"] = v_recid_results 
        

100%|██████████| 4/4 [01:30<00:00, 22.50s/it]
100%|██████████| 4/4 [01:56<00:00, 29.08s/it]
100%|██████████| 4/4 [02:04<00:00, 31.22s/it]


In [8]:
import json
from pathlib import Path
with open(Path("raw_data/first.json"), 'w') as f:
            json.dump(all_results, f)

In [9]:
for key, res in all_results.items():
    utils.plotting_cf(["log_reg", "rf_cat", "svm", "knn"],["cca", "mice_def", "mean"], res, key+"/")
    utils.plotting_others(res, key+"/")

FileExistsError: [WinError 183] Cannot create a file when that file already exists: 'experiments'