In [131]:
"""
Created on: Fri. 14 Oct. 2022
Updated on: Wed. 2 Nov. 2022
Author: Mélina Verger

Manually compute a distance between the distributions of the same feature.

>> Compute the intersection area (and the IoU ratio) and the normalized Euclidean distance between the 2 kde estimates curves.
"""

# To exit script
from sys import exit

# To load the trained models
import pickle

# For data manipulation
import pandas as pd
import numpy as np

# To print with tabular format
from tabulate import tabulate

## Loading

In [132]:
DATA = pickle.load(open("../data/DATA", "rb"))
SPLIT = pickle.load(open("../data/SPLIT", "rb"))
SFEATURES = pickle.load(open("../data/SFEATURES", "rb"))

print(DATA)
print(SPLIT)
print(SFEATURES)

stInfo
7030
['gender', 'imd_band', 'disability']


In [133]:
# Load test sets

X_test = pd.read_csv("../data/X_test" + "_" + DATA + "_" + SPLIT + ".csv")
y_test = pd.read_csv("../data/y_test" + "_" + DATA + "_" + SPLIT + ".csv")

In [134]:
# Load trained models

clf_lr = pickle.load(open("../models/" + DATA + "_" + SPLIT + "_clf_lr", "rb"))
# clf_svc = pickle.load(open("../models/" + DATA + "_" + SPLIT + "_clf_svc", "rb"))
clf_knearest = pickle.load(open("../models/" + DATA + "_" + SPLIT + "_clf_knearest", "rb"))
clf_dt = pickle.load(open("../models/" + DATA + "_" + SPLIT + "_clf_dt", "rb"))
clf_rf = pickle.load(open("../models/" + DATA + "_" + SPLIT + "_clf_rf", "rb"))

In [135]:
models = [clf_lr, clf_knearest, clf_dt, clf_rf]  # except svc

## Separate data sets by (un-)protected groups

/!\ The following only works for binary sensitive features.

In [136]:
dict_subsets_test = dict()

for sensfeat in SFEATURES:
    # X_test_{sensitive feature and group 1/0}
    dict_subsets_test["X"+ "_test_" + sensfeat + "_"+ "1"] = X_test[X_test[sensfeat] == 1]
    dict_subsets_test["X"+ "_test_" + sensfeat + "_"+ "0"] = X_test[X_test[sensfeat] == 0]
    # y_test_{sensitive feature and group 1/0}
    dict_subsets_test["y"+ "_test_" + sensfeat + "_"+ "1"] = y_test.loc[dict_subsets_test["X" + "_test_" + sensfeat + "_" + "1"].index]
    dict_subsets_test["y"+ "_test_" + sensfeat + "_"+ "0"] = y_test.loc[dict_subsets_test["X" + "_test_" + sensfeat + "_" + "0"].index]

## Prediction **probabilities**

In [137]:
dict_subsets_PP = dict()

for model in models:  # except svc model because no probability outputs 

    if model == clf_lr:
        modl = "lr"
    elif model == clf_knearest:
        modl = "kn"
    elif model == clf_dt:
        modl = "dt" 
    elif model == clf_rf:
        modl = "rf"
    else:
        print("Invalid model.")
        exit()

    # y_PP for X_test_{sensitive feature and group 1/0}
    for sensfeat in SFEATURES:
        dict_subsets_PP["y" + "_PP_" + modl + "_" + sensfeat + "_" + "1"] = model.predict_proba(dict_subsets_test["X"+ "_test_" + sensfeat + "_"+ "1"])[:, 1]  # [:, 1] because propa of being in the class 1
        dict_subsets_PP["y" + "_PP_" + modl + "_" + sensfeat + "_" + "0"] = model.predict_proba(dict_subsets_test["X"+ "_test_" + sensfeat + "_"+ "0"])[:, 0]  # [:, 1] because propa of being in the class 0

## UT

In [138]:
results = list()

for sensfeat in SFEATURES:

    for model in models:

        if model == clf_lr:
            modl = "lr"
        elif model == clf_knearest:
            modl = "kn"  # model that generates FutureWarning
        elif model == clf_dt:
            modl = "dt" 
        elif model == clf_rf:
            modl = "rf"
        else:
            print("Invalid model.")
            exit()
        
        subres = list()
        moy1 = np.mean(dict_subsets_PP["y" + "_PP_" + modl + "_" + sensfeat + "_" + "1"])
        moy0 = np.mean(dict_subsets_PP["y" + "_PP_" + modl + "_" + sensfeat + "_" + "0"])
        subres.append(sensfeat)
        subres.append(modl)
        subres.append(round(moy1, 2))
        subres.append(round(moy0, 2))
        subres.append(round(abs(moy1 - moy0), 2))
        results.append(subres)

print(tabulate(results, headers=["Sensitive feature", "Model", "UT group 1", "UT group 0", "Delta"]))

Sensitive feature    Model      UT group 1    UT group 0    Delta
-------------------  -------  ------------  ------------  -------
gender               lr               0.62          0.36     0.26
gender               kn               0.63          0.37     0.26
gender               dt               0.65          0.36     0.28
gender               rf               0.63          0.36     0.27
imd_band             lr               0.58          0.28     0.3
imd_band             kn               0.54          0.23     0.31
imd_band             dt               0.58          0.27     0.31
imd_band             rf               0.58          0.27     0.31
disability           lr               0.59          0.36     0.23
disability           kn               0.67          0.38     0.29
disability           dt               0.59          0.36     0.23
disability           rf               0.6           0.36     0.24


## Mean density error ($\overline{d}$)

/!\ The vectors does not have the same length but it is handled by the density vector.

In [139]:
def normalized_density_vector(pred_proba_array):

    PP_rounded = np.around(pred_proba_array, decimals=2)

    density_vector = np.zeros(101)  # empty
    proba_values = np.linspace(0, 1, 101)  # 101 elems

    for i in range(len(proba_values)):
        compar = proba_values[i]
        count = 0
        for x in PP_rounded:
            if x == compar:
                count = count + 1
        density_vector[i] = count
    
    normalized_density_vec = density_vector / np.sum(density_vector)

    return normalized_density_vec

In [140]:
def mean_density_error(norm_densvect_1, norm_densvect_0):
    return np.absolute(norm_densvect_1 - norm_densvect_0).sum()

In [141]:
d_res = list()

for sensfeat in SFEATURES:

    for model in models:

        if model == clf_lr:
            modl = "lr"
        elif model == clf_knearest:
            modl = "kn"  # model that generates FutureWarning
        elif model == clf_dt:
            modl = "dt" 
        elif model == clf_rf:
            modl = "rf"
        else:
            print("Invalid model.")
            exit()
    
        d_subres = list()
        d_subres.append(sensfeat)
        d_subres.append(modl)
        norm_densvect1 = normalized_density_vector(dict_subsets_PP["y" + "_PP_" + modl + "_" + sensfeat + "_" + "1"])
        norm_densvect0 = normalized_density_vector(dict_subsets_PP["y" + "_PP_" + modl + "_" + sensfeat + "_" + "0"])
        d = round(mean_density_error(norm_densvect1, norm_densvect0), 2)
        d_subres.append(d)
        d_res.append(d_subres)

print(tabulate(d_res, headers=["Sensitive feature", "Model", "d"]))

Sensitive feature    Model       d
-------------------  -------  ----
gender               lr       1.73
gender               kn       0.92
gender               dt       1.58
gender               rf       1.72
imd_band             lr       1.88
imd_band             kn       1.11
imd_band             dt       1.62
imd_band             rf       1.73
disability           lr       1.76
disability           kn       0.94
disability           dt       1.52
disability           rf       1.56


## Manually compute the "distance vector"

In [142]:
def dense_values(vector):
    c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10 = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

    for x in vector:
        if x == 0.0:
            c0 = c0 + 1
        elif x == 0.1:
            c1 = c1 + 1
        elif x == 0.2:
            c2 = c2 + 1
        elif x == 0.3:
            c3 = c3 + 1
        elif x == 0.4:
            c4 = c4 + 1
        elif x == 0.5:
            c5 = c5 + 1
        elif x == 0.6:
            c6 = c6 + 1
        elif x == 0.7:
            c7 = c7 + 1
        elif x == 0.8:
            c8 = c8 + 1
        elif x == 0.9:
            c9 = c9 + 1
        else:
            c10 = c10 + 1
    
    v = np.array([c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10])
    v_normalized = v / np.sum(v)

    return v_normalized

In [143]:
X_test_gender_1 = X_test[X_test["gender"] == 1]
X_test_gender_0 = X_test[X_test["gender"] == 0]

y_test_gender_1 = y_test.loc[X_test_gender_1.index]
y_test_gender_0 = y_test.loc[X_test_gender_0.index]

In [144]:
X_test_imdband_1 = X_test[X_test["imd_band"] == 1]
X_test_imdband_0 = X_test[X_test["imd_band"] == 0]

y_test_imdband_1 = y_test.loc[X_test_imdband_1.index]
y_test_imdband_0 = y_test.loc[X_test_imdband_0.index]

In [145]:
X_test_disability_1 = X_test[X_test["disability"] == 1]
X_test_disability_0 = X_test[X_test["disability"] == 0]

y_test_disability_1 = y_test.loc[X_test_disability_1.index]
y_test_disability_0 = y_test.loc[X_test_disability_0.index]

In [146]:
# Make predictions with each model


In [147]:
lr_y_pred_proba_gender_1 = clf_lr.predict_proba(X_test_gender_1)[:, 1]
kn_y_pred_proba_gender_1 = clf_knearest.predict_proba(X_test_gender_1)[:, 1]
dt_y_pred_proba_gender_1 = clf_dt.predict_proba(X_test_gender_1)[:, 1]
rf_y_pred_proba_gender_1 = clf_rf.predict_proba(X_test_gender_1)[:, 1]

lr_y_pred_proba_gender_0 = clf_lr.predict_proba(X_test_gender_0)[:, 0]
kn_y_pred_proba_gender_0 = clf_knearest.predict_proba(X_test_gender_0)[:, 0]
dt_y_pred_proba_gender_0 = clf_dt.predict_proba(X_test_gender_0)[:, 0]
rf_y_pred_proba_gender_0 = clf_rf.predict_proba(X_test_gender_0)[:, 0]

In [148]:
lr_y_pred_proba_imdband_1 = clf_lr.predict_proba(X_test_imdband_1)[:, 1]
kn_y_pred_proba_imdband_1 = clf_knearest.predict_proba(X_test_imdband_1)[:, 1]
dt_y_pred_proba_imdband_1 = clf_dt.predict_proba(X_test_imdband_1)[:, 1]
rf_y_pred_proba_imdband_1 = clf_rf.predict_proba(X_test_imdband_1)[:, 1]

lr_y_pred_proba_imdband_0 = clf_lr.predict_proba(X_test_imdband_0)[:, 0]
kn_y_pred_proba_imdband_0 = clf_knearest.predict_proba(X_test_imdband_0)[:, 0]
dt_y_pred_proba_imdband_0 = clf_dt.predict_proba(X_test_imdband_0)[:, 0]
rf_y_pred_proba_imdband_0 = clf_rf.predict_proba(X_test_imdband_0)[:, 0]

In [149]:
lr_y_pred_proba_disability_1 = clf_lr.predict_proba(X_test_disability_1)[:, 1]
kn_y_pred_proba_disability_1 = clf_knearest.predict_proba(X_test_disability_1)[:, 1]
dt_y_pred_proba_disability_1 = clf_dt.predict_proba(X_test_disability_1)[:, 1]
rf_y_pred_proba_disability_1 = clf_rf.predict_proba(X_test_disability_1)[:, 1]

lr_y_pred_proba_disability_0 = clf_lr.predict_proba(X_test_disability_0)[:, 0]
kn_y_pred_proba_disability_0 = clf_knearest.predict_proba(X_test_disability_0)[:, 0]
dt_y_pred_proba_disability_0 = clf_dt.predict_proba(X_test_disability_0)[:, 0]
rf_y_pred_proba_disability_0 = clf_rf.predict_proba(X_test_disability_0)[:, 0]

In [150]:
# Round the values of probabilities
# Step 0.1 => 11 values from 0.0 to 1.0, round at 0.5

# Gender
lr_y_pred_proba_gender_1 = np.around(lr_y_pred_proba_gender_1, decimals=1)
kn_y_pred_proba_gender_1 = np.around(kn_y_pred_proba_gender_1, decimals=1)
dt_y_pred_proba_gender_1 = np.around(dt_y_pred_proba_gender_1, decimals=1)
rf_y_pred_proba_gender_1 = np.around(rf_y_pred_proba_gender_1, decimals=1)

lr_y_pred_proba_gender_0 = np.around(lr_y_pred_proba_gender_0, decimals=1)
kn_y_pred_proba_gender_0 = np.around(kn_y_pred_proba_gender_0, decimals=1)
dt_y_pred_proba_gender_0 = np.around(dt_y_pred_proba_gender_0, decimals=1)
rf_y_pred_proba_gender_0 = np.around(rf_y_pred_proba_gender_0, decimals=1)

# Imd band
lr_y_pred_proba_imdband_1 = np.around(lr_y_pred_proba_imdband_1, decimals=1)
kn_y_pred_proba_imdband_1 = np.around(kn_y_pred_proba_imdband_1, decimals=1)
dt_y_pred_proba_imdband_1 = np.around(dt_y_pred_proba_imdband_1, decimals=1)
rf_y_pred_proba_imdband_1 = np.around(rf_y_pred_proba_imdband_1, decimals=1)

lr_y_pred_proba_imdband_0 = np.around(lr_y_pred_proba_imdband_0, decimals=1)
kn_y_pred_proba_imdband_0 = np.around(kn_y_pred_proba_imdband_0, decimals=1)
dt_y_pred_proba_imdband_0 = np.around(dt_y_pred_proba_imdband_0, decimals=1)
rf_y_pred_proba_imdband_0 = np.around(rf_y_pred_proba_imdband_0, decimals=1)

# Disability
lr_y_pred_proba_disability_1 = np.around(lr_y_pred_proba_disability_1, decimals=1)
kn_y_pred_proba_disability_1 = np.around(kn_y_pred_proba_disability_1, decimals=1)
dt_y_pred_proba_disability_1 = np.around(dt_y_pred_proba_disability_1, decimals=1)
rf_y_pred_proba_disability_1 = np.around(rf_y_pred_proba_disability_1, decimals=1)

lr_y_pred_proba_disability_0 = np.around(lr_y_pred_proba_disability_0, decimals=1)
kn_y_pred_proba_disability_0 = np.around(kn_y_pred_proba_disability_0, decimals=1)
dt_y_pred_proba_disability_0 = np.around(dt_y_pred_proba_disability_0, decimals=1)
rf_y_pred_proba_disability_0 = np.around(rf_y_pred_proba_disability_0, decimals=1)

In [151]:
# Compute density vectors

# Gender
dens_lr_y_pred_proba_gender_1 = dense_values(lr_y_pred_proba_gender_1)
dens_kn_y_pred_proba_gender_1 = dense_values(kn_y_pred_proba_gender_1)
dens_dt_y_pred_proba_gender_1 = dense_values(dt_y_pred_proba_gender_1)
dens_rf_y_pred_proba_gender_1 = dense_values(rf_y_pred_proba_gender_1)

dens_lr_y_pred_proba_gender_0 = dense_values(lr_y_pred_proba_gender_0)
dens_kn_y_pred_proba_gender_0 = dense_values(kn_y_pred_proba_gender_0)
dens_dt_y_pred_proba_gender_0 = dense_values(dt_y_pred_proba_gender_0)
dens_rf_y_pred_proba_gender_0 = dense_values(rf_y_pred_proba_gender_0)

# Imd band
dens_lr_y_pred_proba_imdband_1 = dense_values(lr_y_pred_proba_imdband_1)
dens_kn_y_pred_proba_imdband_1 = dense_values(kn_y_pred_proba_imdband_1)
dens_dt_y_pred_proba_imdband_1 = dense_values(dt_y_pred_proba_imdband_1)
dens_rf_y_pred_proba_imdband_1 = dense_values(rf_y_pred_proba_imdband_1)

dens_lr_y_pred_proba_imdband_0 = dense_values(lr_y_pred_proba_imdband_0)
dens_kn_y_pred_proba_imdband_0 = dense_values(kn_y_pred_proba_imdband_0)
dens_dt_y_pred_proba_imdband_0 = dense_values(dt_y_pred_proba_imdband_0)
dens_rf_y_pred_proba_imdband_0 = dense_values(rf_y_pred_proba_imdband_0)

# Disability
dens_lr_y_pred_proba_disability_1 = dense_values(lr_y_pred_proba_disability_1)
dens_kn_y_pred_proba_disability_1 = dense_values(kn_y_pred_proba_disability_1)
dens_dt_y_pred_proba_disability_1 = dense_values(dt_y_pred_proba_disability_1)
dens_rf_y_pred_proba_disability_1 = dense_values(rf_y_pred_proba_disability_1)

dens_lr_y_pred_proba_disability_0 = dense_values(lr_y_pred_proba_disability_0)
dens_kn_y_pred_proba_disability_0 = dense_values(kn_y_pred_proba_disability_0)
dens_dt_y_pred_proba_disability_0 = dense_values(dt_y_pred_proba_disability_0)
dens_rf_y_pred_proba_disability_0 = dense_values(rf_y_pred_proba_disability_0)

In [152]:
# Difference

# Gender
d_lr_gender = np.absolute(dens_lr_y_pred_proba_gender_1 - dens_lr_y_pred_proba_gender_0).sum()
d_kn_gender = np.absolute(dens_kn_y_pred_proba_gender_1 - dens_kn_y_pred_proba_gender_0).sum()
d_dt_gender = np.absolute(dens_dt_y_pred_proba_gender_1 - dens_dt_y_pred_proba_gender_0).sum()
d_rf_gender = np.absolute(dens_rf_y_pred_proba_gender_1 - dens_rf_y_pred_proba_gender_0).sum()

# Imd band
d_lr_imdband = np.absolute(dens_lr_y_pred_proba_imdband_1 - dens_lr_y_pred_proba_imdband_0).sum()
d_kn_imdband = np.absolute(dens_kn_y_pred_proba_imdband_1 - dens_kn_y_pred_proba_imdband_0).sum()
d_dt_imdband = np.absolute(dens_dt_y_pred_proba_imdband_1 - dens_dt_y_pred_proba_imdband_0).sum()
d_rf_imdband = np.absolute(dens_rf_y_pred_proba_imdband_1 - dens_rf_y_pred_proba_imdband_0).sum()

# Disability
d_lr_disability = np.absolute(dens_lr_y_pred_proba_disability_1 - dens_lr_y_pred_proba_disability_0).sum()
d_kn_disability = np.absolute(dens_kn_y_pred_proba_disability_1 - dens_kn_y_pred_proba_disability_0).sum()
d_dt_disability = np.absolute(dens_dt_y_pred_proba_disability_1 - dens_dt_y_pred_proba_disability_0).sum()
d_rf_disability = np.absolute(dens_rf_y_pred_proba_disability_1 - dens_rf_y_pred_proba_disability_0).sum()

In [153]:
print(round(d_lr_gender, 2))
print(round(d_kn_gender, 2))
print(round(d_dt_gender, 2))
print(round(d_rf_gender, 2))

1.41
0.92
1.09
1.13


In [154]:
print(round(d_lr_imdband, 2))
print(round(d_kn_imdband, 2))
print(round(d_dt_imdband, 2))
print(round(d_rf_imdband, 2))

1.78
1.11
1.45
1.49


In [155]:
print(round(d_lr_disability, 2))
print(round(d_kn_disability, 2))
print(round(d_dt_disability, 2))
print(round(d_rf_disability, 2))

1.3
0.94
1.15
1.05
