In [1]:
import numpy as np
import utils
import pandas as pd
from pathlib import Path
import os
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import matplotlib.pyplot as plt
font = {'family': 'normal',
        'weight': 'bold',
        'size': 25}

plt.rc('font', **font)
IMPUTATIONS = ["cca", "coldel", "mode", "mice"]

In [2]:
np.random.seed(1337)
synth_regular = utils.load_synthetic()
compas = utils.load_compas_alt()
RESPONSE = "two_year_recid"

In [3]:
def eo_sum(pred, prot, true):
    """
    Equation: |P(Y_pred = y_pred | Y_true = y_true, Z = 1) - P(Y_pred = y_pred | Y_true = y_true, Z = 0)|
    Assumes prot is 0/1 binary"""
    z1_y0 = [y_hat for y_hat, z, y in zip(
        pred, prot, true) if z == 1 and y == 0]
    z0_y0 = [y_hat for y_hat, z, y in zip(
        pred, prot, true) if z == 0 and y == 0]
    z1_y1 = [y_hat for y_hat, z, y in zip(
        pred, prot, true) if z == 1 and y == 1]
    z0_y1 = [y_hat for y_hat, z, y in zip(
        pred, prot, true) if z == 0 and y == 1]
    return abs(sum(z1_y1)/len(z1_y1)-sum(z0_y1)/len(z0_y1)) + abs(sum(z1_y0)/len(z1_y0)-sum(z0_y0)/len(z0_y0))

In [4]:
def sigmoid(x, alpha):
    z = np.exp(-x+alpha)
    sig = 1 / (1 + z)
    return sig

In [5]:
def confusion_matrix(true, pred):
    # Assumes numpy arrays(
    try:
        tpr = sum([1 if t == p and p == 1 else 0 for t,
                  p in zip(true, pred)])/(sum(true))
    except:
        tpr = 0
        #print("true", sum(true))
        #print("pred", sum(pred))

    try:
        tnr = sum([1 if t == p and p == 0 else 0 for t,
                  p in zip(true, pred)])/(len(true)-sum(true))
    except:
        tnr = 0
        #print("true", sum(true))
        #print("pred", sum(pred))
    fpr = 1-tnr
    fnr = 1-tpr
    #Old return structure. Converted to vanilla dict for json compatibility
    #return pd.DataFrame({"Predicted true": [tpr, fpr],
    #                     "Predicted false": [fnr, tnr]}, index=["Is true", "Is false"])
    return {"Predicted true": [tpr, fpr],
            "Predicted false": [fnr, tnr]}

In [31]:
def run(n_runs=200, dataset = "compas", missing="priors_count", sensitive = "gender_factor", response = "two_year_recid"):
    #TODO keep track of std as well for error bars
    results = {imp: {"eosum":[], "acc":[], "cm": [], "tprd":[], "tnrd": [], "tpr":[], "tnr": []} for imp in IMPUTATIONS}
    for run in tqdm(range(n_runs)):
        np.random.seed(run*13)
        if dataset == "compas":
            data = utils.load_compas_alt()
            miss_amounts = [1,2,3]
        elif dataset == "simple":
            data = utils.load_synthetic("simple")
            miss_amounts = [2,3,4]
        elif dataset == "adult":
            data = utils.load_adult()
            miss_amounts = [4,5,6,7]
        elif dataset== "synth":
            data = utils.load_synthetic("recid_alt")
            miss_amounts = [3,4]
        else:
            raise 
        for i in IMPUTATIONS:
            if i =="coldel" and (missing!=sensitive):
                train,test = data["train"].copy(), data["test"].copy()
                test[missing] = test[missing].apply(lambda x: 1 if x>0 else 0)
                train[missing] = train[missing].apply(lambda x: 1 if x>0 else 0)
                x_test = test.drop([response, missing],axis = 1)
                clf = LogisticRegression()
                clf.fit(train.drop([response, missing], axis = 1), train[response])
                    
                pred = clf.predict(x_test)
                y_test = test[response]
                cm = confusion_matrix(y_test, pred)
                results[i]["tpr"].append(cm["Predicted true"][0])
                results[i]["tnr"].append(cm["Predicted false"][1])
                results[i]["eosum"].append(eo_sum(pred, x_test[sensitive], y_test))
                results[i]["acc"].append(accuracy_score(y_test, pred))
                
                for s in [0,1]:
                    y_test = test[test[sensitive]==s]
                    y_test = y_test[response]
                    pred = clf.predict(x_test[x_test[sensitive]==s])
                    if s==0:
                        cm0 = confusion_matrix(y_test, pred)
                    elif s==1:      
                        cm1 = confusion_matrix(y_test, pred)
                    else:
                        print("ERROR CM")
                results[i]["tprd"].append(abs(cm1["Predicted true"][0]-cm0["Predicted true"][0]))
                results[i]["tnrd"].append(abs(cm1["Predicted false"][1]-cm0["Predicted false"][1]))
                continue
            
            tempeo = []
            tempacc = []
            tempcm = []
            temptprd = []
            temptnrd = []
            temptpr = []
            temptnr = []
            for miss in miss_amounts:
                #print(miss)
                train,test = data["train"].copy(), data["test"].copy()
                test[missing] = test[missing].apply(lambda x: 1 if x>0 else 0)
                train[missing] = train[missing].apply(lambda x: 1 if x>0 else 0)
                
                train["miss"] =np.around(sigmoid(train.drop(missing, axis = 1).sum(axis = 1), miss)).astype(int)
                #print("MISSING", train["miss"].sum())
                if i =="cca":
                    train = train[train["miss"]==0]
                elif i =="mode":
                    miss_index = train[train["miss"]==1].index
                    train[missing] = train[missing].mask(train.index.isin(miss_index),
                                                                            other=np.nan)
                    train.fillna(train[missing].mode(dropna=True)[0], inplace=True)
                    #train.drop("miss", axis = 1, inplace = True)
                elif i =="mice":
                    miss_index = train[train["miss"]==1].index
                    train[missing] = train[missing].mask(train.index.isin(miss_index),
                                                                            other=np.nan)
                    
                    imputer = IterativeImputer(random_state=0)
                    imputer.fit(train)
                    train = pd.DataFrame(imputer.transform(train), columns=train.columns)
                    train[missing] = train[missing].round()
                else:
                    print("ERROR")
                train.drop("miss", axis = 1, inplace = True)
                x_test = test.drop(response,axis = 1)
                clf = LogisticRegression()
                clf.fit(train.drop(response, axis = 1), train[response])
                    
                pred = clf.predict(x_test)
                y_test = test[response]
                cm = confusion_matrix(y_test, pred)
                temptpr.append(cm["Predicted true"][0])
                temptnr.append(cm["Predicted false"][1])
                tempacc.append(accuracy_score(y_test, pred))
                tempeo.append(eo_sum(pred, x_test[sensitive], y_test))
                for s in [0,1]:
                    y_test = test[test[sensitive]==s]
                    y_test = y_test[response]
                    pred = clf.predict(x_test[x_test[sensitive]==s])
                    if s==0:
                        cm0 = confusion_matrix(y_test, pred)
                    elif s==1:      
                        cm1 = confusion_matrix(y_test, pred)
                    else:
                        print("ERROR CM")
                temptprd.append(abs(cm1["Predicted true"][0]-cm0["Predicted true"][0]))
                temptnrd.append(abs(cm1["Predicted false"][1]-cm0["Predicted false"][1]))
            results[i]["tprd"].append(temptprd)
            results[i]["tnrd"].append(temptnrd)
            results[i]["tpr"].append(temptpr)
            results[i]["tnr"].append(temptnr)
            results[i]["eosum"].append(tempeo)
            results[i]["acc"].append(tempacc)
            results[i]["tpr"].append(temptpr)
            results[i]["tnr"].append(temptnr)
    for imp in IMPUTATIONS:
        for metr in ["eosum", "acc", "tpr", "tnr", "tnrd", "tprd"]:
            results[imp][metr] = np.mean(results[imp][metr],axis = 0)
    return results

In [15]:
def basic_plot(results, dataset = "compas",missing="priors_count", sensitive = "gender_factor" ):
    savepath = "temp/temp_final/"+dataset+"/"+missing +"_"+sensitive+"/"
    if not os.path.isdir(Path(savepath)):
        os.mkdir(Path(savepath))
    for metr in ["eosum", "acc", "tpr", "tnr", "tprd", "tnrd"]:
        fig = plt.gcf()
        fig.set_size_inches(22.5, 12.5)
        for imp in IMPUTATIONS:
            #print(imp)
            if imp =="coldel":
                plt.plot([a for a in range(len(results["cca"][metr]))], [results[imp][metr]]*len(results["cca"][metr]), label = imp)
            else:
                plt.plot([a for a in range(len(results["cca"][metr]))], results[imp][metr][::-1], label = imp)
        plt.legend()
        plt.title(metr)
        plt.savefig(Path(savepath+metr+".png"))
        plt.clf()

In [8]:
basic = run(n_runs = 200, dataset="compas", missing = "priors_count", sensitive = "is_Caucasian")
basic_plot(basic, dataset = "compas", missing = "priors_count", sensitive = "is_Caucasian")

100%|██████████| 200/200 [01:16<00:00,  2.61it/s]
findfont: Font family ['normal'] not found. Falling back to DejaVu Sans.
findfont: Font family ['normal'] not found. Falling back to DejaVu Sans.


<Figure size 1620x900 with 0 Axes>

In [9]:
basic = run(n_runs = 200, missing = "crime_factor", sensitive = "is_Caucasian")
basic_plot(basic, dataset = "compas", missing = "crime_factor", sensitive = "is_Caucasian")

100%|██████████| 200/200 [01:30<00:00,  2.20it/s]


<Figure size 1620x900 with 0 Axes>

In [10]:
basic = run(n_runs = 200, missing = "priors_count", sensitive = "gender_factor")
basic_plot(basic, dataset = "compas", missing = "priors_count", sensitive = "gender_factor")

100%|██████████| 200/200 [01:21<00:00,  2.45it/s]


<Figure size 1620x900 with 0 Axes>

In [11]:
basic = run(n_runs = 200, missing = "crime_factor", sensitive = "gender_factor")
basic_plot(basic, dataset = "compas", missing = "crime_factor", sensitive = "gender_factor")

100%|██████████| 200/200 [01:35<00:00,  2.09it/s]


<Figure size 1620x900 with 0 Axes>

In [12]:
"""basic = run(n_runs = 200, missing = "gender_factor", sensitive = "gender_factor")
basic_plot(basic, dataset = "compas", missing = "gender_factor", sensitive = "gender_factor")"""

'basic = run(n_runs = 200, missing = "gender_factor", sensitive = "gender_factor")\nbasic_plot(basic, dataset = "compas", missing = "gender_factor", sensitive = "gender_factor")'

In [21]:
basic = run(n_runs = 200,dataset = "simple",  missing = "x_2", sensitive = "x_1", response = "y")
basic_plot(basic, dataset = "simple",  missing = "x_2", sensitive = "x_1")

  3%|▎         | 6/200 [00:04<02:37,  1.23it/s]


KeyboardInterrupt: 

In [16]:
#basic_plot(basic, dataset = "simple",  missing = "x_2", sensitive = "x_1")

<Figure size 1620x900 with 0 Axes>

In [24]:
basic = run(n_runs = 200,dataset = "simple",  missing = "x_5", sensitive = "x_1", response = "y")
basic_plot(basic, dataset = "simple",  missing = "x_5", sensitive = "x_1")

100%|██████████| 200/200 [01:48<00:00,  1.85it/s]


<Figure size 1620x900 with 0 Axes>

In [32]:
basic = run(n_runs = 200, dataset="synth", missing = "priors_count", sensitive = "is_Caucasian")
basic_plot(basic, dataset = "synth", missing = "priors_count", sensitive = "is_Caucasian")

100%|██████████| 200/200 [00:54<00:00,  3.69it/s]


<Figure size 1620x900 with 0 Axes>

In [33]:
basic = run(n_runs = 200, dataset="synth", missing = "priors_count", sensitive = "gender_factor")
basic_plot(basic, dataset = "synth", missing = "priors_count", sensitive = "gender_factor")

100%|██████████| 200/200 [00:57<00:00,  3.46it/s]


<Figure size 1620x900 with 0 Axes>