In [2]:
from fair_logistic_reg import FairLogisticRegression
import numpy as np
import utils
import pandas as pd
from pathlib import Path
import os
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import matplotlib.pyplot as plt
# Global matplotlib text style: large, bold text on every figure.
# 'sans-serif' is a valid generic family. The previous value 'normal' is a
# font *style*, not a family, so matplotlib could not resolve it and fell back
# to its default font while emitting "font family not found" warnings.
font = {'family': 'sans-serif',
        'weight': 'bold',
        'size': 25}

plt.rc('font', **font)

# Identifiers of the imputation strategies under comparison (presumably
# complete-case analysis, column deletion, mode imputation and MICE - confirm).
# Not referenced by the cells visible in this part of the notebook.
IMPUTATIONS = ["cca", "coldel", "mode", "mice"]

In [3]:
# Seed NumPy's global RNG before loading the data.
np.random.seed(1337)

# `utils.load_compas_alt()` returns a mapping holding the "train" and "test"
# splits; keep a handle on each.
compas = utils.load_compas_alt()
train = compas["train"]
test = compas["test"]

# Column used as the prediction target by the cells below.
RESPONSE = "two_year_recid"

In [4]:
def eo_sum(pred, prot, true):
    """
    Summed equalized-odds gap between the two protected groups.

    For each true label y in {0, 1} the gap is

        |mean(pred | true = y, prot = 1) - mean(pred | true = y, prot = 0)|

    and the function returns gap(y = 1) + gap(y = 0). With 0/1 predictions
    the means are the conditional rates P(Y_pred = 1 | Y_true = y, Z = z).

    Parameters
    ----------
    pred : iterable of numbers
        Predictions (0/1 labels or scores), aligned with `prot` and `true`.
    prot : iterable
        Protected attribute, assumed 0/1 binary. Entries that are neither
        0 nor 1 are ignored.
    true : iterable
        True labels, assumed 0/1 binary. Entries that are neither 0 nor 1
        are ignored.

    Returns
    -------
    number
        Sum of the two absolute gaps; 0 means identical rates in both groups.

    Raises
    ------
    ZeroDivisionError
        If one of the four (prot, true) combinations has no samples, because
        the corresponding rate is undefined. The message names the empty
        combination.
    """
    # sums[z][y] / counts[z][y] accumulate the predictions of each
    # (protected value, true label) cell. A single pass keeps the function
    # usable with one-shot iterators as well as with lists/arrays/Series.
    sums = [[0, 0], [0, 0]]
    counts = [[0, 0], [0, 0]]
    for y_hat, z, y in zip(pred, prot, true):
        if z == 1:
            z_idx = 1
        elif z == 0:
            z_idx = 0
        else:
            continue
        if y == 1:
            y_idx = 1
        elif y == 0:
            y_idx = 0
        else:
            continue
        sums[z_idx][y_idx] += y_hat
        counts[z_idx][y_idx] += 1

    # Fail with an explanatory message instead of a bare "division by zero".
    for z_idx in (0, 1):
        for y_idx in (0, 1):
            if counts[z_idx][y_idx] == 0:
                raise ZeroDivisionError(
                    f"eo_sum: no samples with prot == {z_idx} and "
                    f"true == {y_idx}; the equalized-odds gap is undefined")

    def rate(z_idx, y_idx):
        return sums[z_idx][y_idx] / counts[z_idx][y_idx]

    return abs(rate(1, 1) - rate(0, 1)) + abs(rate(1, 0) - rate(0, 0))

In [5]:
def sigmoid(x, alpha):
    """Logistic function shifted by ``alpha``: ``1 / (1 + exp(-x + alpha))``.

    The curve crosses 0.5 at ``x == alpha``. ``x`` may be a scalar or anything
    supporting unary minus and NumPy broadcasting; the result has the
    corresponding shape.
    """
    return 1 / (1 + np.exp(-x + alpha))

In [6]:
def confusion_matrix(true, pred):
    """Return the row-normalised confusion matrix of binary predictions.

    Parameters
    ----------
    true : sized iterable of 0/1
        True labels (list, NumPy array or pandas Series).
    pred : iterable of 0/1
        Predicted labels, aligned with `true`.

    Returns
    -------
    dict
        ``{"Predicted true": [tpr, fpr], "Predicted false": [fnr, tnr]}``
        with plain Python floats, so the result can be serialised to JSON.
        A rate whose denominator is zero (no positives / no negatives in
        `true`) is reported as 0, and its complement therefore as 1.
    """
    true_pos = 0
    true_neg = 0
    for t, p in zip(true, pred):
        if t == p:
            if p == 1:
                true_pos += 1
            elif p == 0:
                true_neg += 1

    n_pos = sum(true)
    n_neg = len(true) - n_pos

    # Guard the denominators explicitly. The former bare `except:` hid every
    # kind of error and did not even trigger for NumPy integers, where a
    # division by zero yields nan/inf plus a warning instead of raising.
    tpr = float(true_pos / n_pos) if n_pos != 0 else 0.0
    tnr = float(true_neg / n_neg) if n_neg != 0 else 0.0
    fpr = 1 - tnr
    fnr = 1 - tpr

    # A vanilla dict (rather than a DataFrame) keeps the result JSON-friendly.
    return {"Predicted true": [tpr, fpr],
            "Predicted false": [fnr, tnr]}

In [7]:
# Rebuild the working frames from scratch: this cell re-seeds NumPy and
# reloads the data, so it supersedes the `train`/`test` created earlier in
# the notebook. Copies are taken because the frames are modified below.
np.random.seed(0)
compas = utils.load_compas_alt()
train, test = compas["train"].copy(), compas["test"].copy()

# Binarise the priors count: 1 if strictly positive, else 0. The comparison
# is vectorised; NaN compares False and therefore maps to 0, exactly as the
# former row-wise `lambda x: 1 if x>0 else 0` did.
test["priors_count"] = (test["priors_count"] > 0).astype(int)
train["priors_count"] = (train["priors_count"] > 0).astype(int)

# Missingness indicator: sum the listed columns (including the response),
# squash with the sigmoid shifted by 1 and round to 0/1. Cells below keep the
# rows with miss == 0 as the subset used to fit the imputation model.
miss_score = (
    train["crime_factor"]
    + train["gender_factor"]
    + train["is_Caucasian"]
    + train["age_factor_Greater than 45"]
    + train["age_factor_Less than 25"]
    + train["two_year_recid"]
)
train["miss"] = np.around(sigmoid(miss_score, 1)).astype(int)
#x_miss = train[train["priors_count"].isnull()].drop("priors_count",axis = 1)
#log_reg = LogisticRegression()
# Only needed by the disabled experiment kept in the string literal below.
from custom_log_reg import CustomLogisticRegression
# NOTE(review): the string literal is the cell's last expression, so the
# notebook echoes it as the cell output; nothing inside it is executed.
"""log_reg = CustomLogisticRegression()
log_reg.fit(train.drop(RESPONSE, axis = 1), train[RESPONSE])

flr_orig = FairLogisticRegression(lam = 0.0)
flr_orig.pre_fit(train.drop(RESPONSE, axis = 1), train[RESPONSE])
flr_mod = FairLogisticRegression(model = log_reg, fairness_metric="eo_sum")
flr_mod.fit_predicitve(train.drop(RESPONSE, axis = 1), train[RESPONSE])
flr_lam1 = FairLogisticRegression(model = log_reg, fairness_metric="eo_sum", lam=1.0)
flr_lam1.fit_predicitve(train.drop(RESPONSE, axis = 1), train[RESPONSE])"""


'log_reg = CustomLogisticRegression()\nlog_reg.fit(train.drop(RESPONSE, axis = 1), train[RESPONSE])\n\nflr_orig = FairLogisticRegression(lam = 0.0)\nflr_orig.pre_fit(train.drop(RESPONSE, axis = 1), train[RESPONSE])\nflr_mod = FairLogisticRegression(model = log_reg, fairness_metric="eo_sum")\nflr_mod.fit_predicitve(train.drop(RESPONSE, axis = 1), train[RESPONSE])\nflr_lam1 = FairLogisticRegression(model = log_reg, fairness_metric="eo_sum", lam=1.0)\nflr_lam1.fit_predicitve(train.drop(RESPONSE, axis = 1), train[RESPONSE])'

In [8]:
# Rows not flagged by the missingness indicator, without the helper column.
temp = train.loc[train["miss"] == 0].drop(columns="miss")

In [9]:
"aaa".split("_")

['aaa']

In [10]:
# Preview the miss == 0 subset: the "miss" column is gone, while the response
# column (two_year_recid) is still present next to the features.
temp.head()

Unnamed: 0,priors_count,crime_factor,gender_factor,is_Caucasian,age_factor_Greater than 45,age_factor_Less than 25,two_year_recid
3362,1,0,1,0,0,0,0
3993,0,0,0,0,0,0,1
4060,1,0,0,0,0,0,0
2549,0,0,0,0,0,0,0
4808,1,0,1,0,0,0,0


In [12]:
# Sweep the `lam` setting of FairLogisticRegression (presumably the weight of
# the fairness term - confirm in fair_logistic_reg) and report, for each
# value, the model weights and the fraction of positive predictions on `temp`
# before and after the fairness-aware `fit`.
# NOTE(review): in the recorded output the STARTING WEIGHTS are identical for
# every `lam` shown, while the ENDING WEIGHTS differ.
for l in [0.5, 0.8, 0.95, 0.99, 1.0]:
    flr_orig = FairLogisticRegression(lam = l)
    # Pre-fit on the miss == 0 rows with "priors_count" as the target. The
    # feature frame still contains the RESPONSE column.
    flr_orig.pre_fit(temp.drop("priors_count", axis = 1), temp["priors_count"], epochs = 300)
    # Fit the predictive part on the full training frame (which still carries
    # the "miss" column) with RESPONSE as the target. `fit_predicitve` is the
    # method name as spelled by the project API.
    flr_orig.fit_predicitve(train.drop(RESPONSE, axis = 1), train[RESPONSE], epochs = 100)
    print("STARTING WEIGHTS", flr_orig.weights)
    print("STARTING FRACTION OF TRUE PRED", np.sum(flr_orig.predict(temp.drop("priors_count", axis = 1)))/len(temp))
    # Positional arguments: features, imputation target, response, protected
    # attribute (gender_factor) and 200 (presumably the epoch count - confirm).
    flr_orig.fit(temp.drop("priors_count", axis = 1), temp["priors_count"], temp[RESPONSE], temp["gender_factor"], 200, 
                        data = temp.drop([RESPONSE, "priors_count"], axis = 1), missing = "priors_count")
    print("ENDING WEIGHTS", flr_orig.weights)
    print("ENDING FRACTION OF TRUE PRED", np.sum(flr_orig.predict(temp.drop("priors_count", axis = 1)))/len(temp))

STARTING WEIGHTS [-2.13635375e+02  2.19318332e+03 -1.99782588e+00  3.49126486e+00
 -8.65807904e+01 -4.20432332e+02]
STARTING FRACTION OF TRUE PRED 0.8073047858942065
ENDING WEIGHTS [-298.37977811 2654.67481997  178.27420487  176.59404177  177.49426371
 -615.78949471]
ENDING FRACTION OF TRUE PRED 0.8400503778337531
STARTING WEIGHTS [-2.13635375e+02  2.19318332e+03 -1.99782588e+00  3.49126486e+00
 -8.65807904e+01 -4.20432332e+02]
STARTING FRACTION OF TRUE PRED 0.8073047858942065
ENDING WEIGHTS [ -61.64290455 2369.34183066  232.91815812  234.21257079  230.88655035
 -500.30348047]
ENDING FRACTION OF TRUE PRED 0.8400503778337531
STARTING WEIGHTS [-2.13635375e+02  2.19318332e+03 -1.99782588e+00  3.49126486e+00
 -8.65807904e+01 -4.20432332e+02]
STARTING FRACTION OF TRUE PRED 0.8073047858942065
ENDING WEIGHTS [ 104.57621174 2234.20508266  215.20404395  214.28417999  213.36729563
 -441.01823608]
ENDING FRACTION OF TRUE PRED 0.871536523929471
STARTING WEIGHTS [-2.13635375e+02  2.19318332e+03 -1.

In [None]:
#log_reg.weights

In [None]:
#flr_orig.weights

In [None]:
#print(log_reg.predict(train.drop(RESPONSE, axis = 1)))

In [None]:
# Rebuild the miss == 0 subset and report which share of the training rows it
# keeps.
temp = train[train["miss"] == 0].copy()
# DataFrame.drop returns a new frame unless inplace=True; the result used to
# be discarded, which silently left the "miss" column in `temp`.
temp = temp.drop("miss", axis = 1)
print(len(temp)/len(train))

0.19201934703748488


In [None]:
#flr_orig.fit(temp.drop("priors_count", axis = 1), temp["priors_count"], temp["gender_factor"], 50)

In [None]:
#flr_lam1._sigmoid(temp.drop("priors_count", axis = 1))

In [None]:
#np.sum(flr_lam1.predict(temp.drop("priors_count", axis = 1)))/len(temp)

In [None]:
"""x_miss = data[data[missing_col].isnull()].drop(missing_col,axis = 1)
y_hat = flr.predict(x_miss)
data.loc[data[missing_col].isnull(),missing_col] = y_hat """

'x_miss = data[data[missing_col].isnull()].drop(missing_col,axis = 1)\ny_hat = flr.predict(x_miss)\ndata.loc[data[missing_col].isnull(),missing_col] = y_hat '

In [None]:
def custom_fair(pred, prot):
    """Sum of squared prediction differences across the two protected groups.

    Every prediction with ``prot == 1`` is compared with every prediction
    with ``prot == 0``; the squared differences of all such pairs are added
    up. The ``prot == 1`` predictions and the resulting value are printed for
    debugging before the value is returned.
    """
    group_one = []
    group_zero = []
    for y_hat, z in zip(pred, prot):
        if z == 1:
            group_one.append(y_hat)
        elif z == 0:
            group_zero.append(y_hat)
    group_zero = np.array(group_zero)

    print(group_one)
    # One inner sum per prot == 1 prediction, broadcast against all prot == 0
    # predictions; the outer sum adds these per-prediction totals.
    total = np.sum([np.sum((score - group_zero)**2) for score in group_one])
    print("CUSTOM FAIR", total)
    return total

# Worked example: the prot == 1 scores 0.2 and 0.7 are each compared with the
# three prot == 0 scores of 0.5, giving 3 * 0.3**2 + 3 * 0.2**2 = 0.39 (up to
# floating-point rounding, as the recorded output shows).
pred = [0.2,0.5,0.7, 0.5,0.5]
prot = [1,0,1,0,0]
custom_fair(pred,prot)

[0.2, 0.7]
CUSTOM FAIR 0.38999999999999996


0.38999999999999996

In [None]:
#flr_lam1.weights

In [None]:
# NOTE(review): `flr_lam1` and `flr_mod` are not created by any executed cell
# visible here - their construction only survives inside a disabled string
# literal - so this cell fails with the recorded NameError.
# NOTE(review): these calls pass four positional arguments, whereas the sweep
# loop earlier in the notebook calls `fit` with five (it also passes
# temp[RESPONSE]); confirm which signature FairLogisticRegression.fit expects.
flr_lam1.fit(temp.drop("priors_count", axis = 1), temp["priors_count"], temp["gender_factor"], 1000, 
                       data = temp.drop([RESPONSE, "priors_count"], axis = 1), missing = "priors_count")

flr_mod.fit(temp.drop("priors_count", axis = 1), temp["priors_count"], temp["gender_factor"], 100, 
                       data = temp.drop([RESPONSE, "priors_count"], axis = 1), missing = "priors_count")


NameError: name 'flr_lam1' is not defined

In [None]:
def _fit_and_report(imputed):
    """Fit a LogisticRegression on `imputed` and print per-group test metrics.

    `imputed` is a training frame that contains the RESPONSE column. For each
    value of "gender_factor" (0 and 1) in the global `test` frame this prints
    the predicted-positive count minus the group size, the row-normalised
    confusion matrix and the accuracy. Returns the fitted classifier.
    """
    features_test = test.drop(RESPONSE, axis=1)
    model = LogisticRegression()
    model.fit(imputed.drop(RESPONSE, axis=1), imputed[RESPONSE])
    for s in [0, 1]:
        in_group = test["gender_factor"] == s
        y_true = test.loc[in_group, RESPONSE].astype(int)
        y_pred = model.predict(features_test[in_group])
        print(y_pred.sum() - len(y_pred))
        # Label with the actual group; this used to print "z=1" for both.
        print(f"z={s}", confusion_matrix(y_true, y_pred))
        print("accuracy", accuracy_score(y_true, y_pred))
    return model


# NOTE(review): `x_miss` and `flr_mod` are not defined by any executed cell
# visible here (the recorded run fails with NameError on `x_miss`); they must
# be created before this cell can run.
y_hat_orig = flr_orig.predict(x_miss)
y_hat_mod = flr_mod.predict(x_miss)

# Fill the missing "priors_count" entries with each model's predictions and
# drop the helper "miss" column before training the downstream classifier.
mod = train.copy()
mod.loc[mod["priors_count"].isnull(), "priors_count"] = y_hat_mod
mod = mod.drop("miss", axis=1)

orig = train.copy()
orig.loc[orig["priors_count"].isnull(), "priors_count"] = y_hat_orig
orig = orig.drop("miss", axis=1)

x_test = test.drop(RESPONSE, axis=1)

# Same evaluation for both imputations, fairness-aware model first.
clf = _fit_and_report(mod)
clf = _fit_and_report(orig)



NameError: name 'x_miss' is not defined