In [11]:
"""
Created on: Fri. 14 Oct. 2022
Updated on: Wed. 2 Nov. 2022
Updated on: Thu. 3 Nov. 2022
Author: Mélina Verger

Compute mean density error metric.
"""

# To exit script
from sys import exit

# To load the trained models
import pickle

# For data manipulation
import pandas as pd
import numpy as np
from scipy.signal import find_peaks

# To print with tabular format
from tabulate import tabulate

# Plotting module
import matplotlib.pyplot as plt

## Loading

In [12]:
DATA = pickle.load(open("../data/DATA", "rb"))
SPLIT = pickle.load(open("../data/SPLIT", "rb"))
SFEATURES = pickle.load(open("../data/SFEATURES", "rb"))

print(DATA)
print(SPLIT)
print(SFEATURES)

stInfo
7030
['gender', 'imd_band', 'disability']


In [13]:
# Load test sets

X_test = pd.read_csv("../data/X_test" + "_" + DATA + "_" + SPLIT + ".csv")
y_test = pd.read_csv("../data/y_test" + "_" + DATA + "_" + SPLIT + ".csv")

In [14]:
# Load trained models

MODELS = pickle.load(open("../data/MODELS", "rb"))  # dict with names and trained models
models = MODELS

In [15]:
if "clf_svc" in models:
    del models["clf_svc"]  # except svc model because no probability outputs 

## Separate data sets by (un-)protected groups

/!\ The following only works for binary sensitive features.

In [16]:
dict_subsets_test = dict()

for sensfeat in SFEATURES:
    # X_test_{sensitive feature and group 1/0}
    dict_subsets_test["X"+ "_test_" + sensfeat + "_"+ "1"] = X_test[X_test[sensfeat] == 1]
    dict_subsets_test["X"+ "_test_" + sensfeat + "_"+ "0"] = X_test[X_test[sensfeat] == 0]
    # y_test_{sensitive feature and group 1/0}
    dict_subsets_test["y"+ "_test_" + sensfeat + "_"+ "1"] = y_test.loc[dict_subsets_test["X" + "_test_" + sensfeat + "_" + "1"].index]
    dict_subsets_test["y"+ "_test_" + sensfeat + "_"+ "0"] = y_test.loc[dict_subsets_test["X" + "_test_" + sensfeat + "_" + "0"].index]

## Prediction **probabilities**

In [17]:
dict_subsets_PP = dict()

for mod_names in models:

    if mod_names == "clf_lr":
        modl = "lr"
    elif mod_names == "clf_kn":
        modl = "kn"
    elif mod_names == "clf_dt":
        modl = "dt" 
    elif mod_names == "clf_rf":
        modl = "rf"
    else:
        print("Invalid model.")
        exit()

    # y_PP for X_test_{sensitive feature and group 1/0}
    for sensfeat in SFEATURES:
        dict_subsets_PP["y" + "_PP_" + modl + "_" + sensfeat + "_" + "1"] = models[mod_names].predict_proba(dict_subsets_test["X"+ "_test_" + sensfeat + "_"+ "1"])[:, 1]  # [:, 1] because propa of being in the class 1
        dict_subsets_PP["y" + "_PP_" + modl + "_" + sensfeat + "_" + "0"] = models[mod_names].predict_proba(dict_subsets_test["X"+ "_test_" + sensfeat + "_"+ "0"])[:, 0]  # [:, 1] because propa of being in the class 0

## Mean density error ($\overline{d}$)

/!\ The vectors does not have the same length but it is handled by the density vector.

In [18]:
def normalized_density_vector(pred_proba_array):

    PP_rounded = np.around(pred_proba_array, decimals=2)

    density_vector = np.zeros(101)  # empty
    proba_values = np.linspace(0, 1, 101)  # 101 elems

    for i in range(len(proba_values)):
        compar = proba_values[i]
        count = 0
        for x in PP_rounded:
            if x == compar:
                count = count + 1
        density_vector[i] = count
    
    normalized_density_vec = density_vector / np.sum(density_vector)

    return normalized_density_vec

In [19]:
def mean_density_error(norm_densvect_1, norm_densvect_0):
    return np.absolute(norm_densvect_1 - norm_densvect_0).sum()

In [20]:
d_res = list()

for sensfeat in SFEATURES:

    for mod_name in models:

        if mod_name == "clf_lr":
            modl = "lr"
        elif mod_name == "clf_kn":
            modl = "kn"  # model that generates FutureWarning
        elif mod_name == "clf_dt":
            modl = "dt" 
        elif mod_name == "clf_rf":
            modl = "rf"
        else:
            print("Invalid model.")
            exit()
    
        d_subres = list()
        d_subres.append(sensfeat)
        d_subres.append(modl)
        norm_densvect1 = normalized_density_vector(dict_subsets_PP["y" + "_PP_" + modl + "_" + sensfeat + "_" + "1"])
        norm_densvect0 = normalized_density_vector(dict_subsets_PP["y" + "_PP_" + modl + "_" + sensfeat + "_" + "0"])
        d = round(mean_density_error(norm_densvect1, norm_densvect0), 2)
        d_subres.append(d)
        d_res.append(d_subres)

print(tabulate(d_res, headers=["Sensitive feature", "Model", "  d     "]))

Sensitive feature    Model        d
-------------------  -------  ----------
gender               lr             1.73
gender               kn             0.92
gender               dt             1.58
gender               rf             1.72
imd_band             lr             1.88
imd_band             kn             1.11
imd_band             dt             1.62
imd_band             rf             1.73
disability           lr             1.76
disability           kn             0.94
disability           dt             1.52
disability           rf             1.56
