In [1]:
"""
Created on: Fri. 14 Oct. 2022
Author: Mélina Verger

Manually compute a distance between the distributions of the same feature.
"""

# To load the trained models
import pickle

# For data manipulation
import pandas as pd
import numpy as np

## Loading

In [2]:
DATA = pickle.load(open("../data/DATA", "rb"))
SPLIT = pickle.load(open("../data/SPLIT", "rb"))

print(DATA)
print(SPLIT)

stAll
7030


In [3]:
# Load test sets

X_test = pd.read_csv("../data/X_test" + "_" + DATA + "_" + SPLIT + ".csv")
y_test = pd.read_csv("../data/y_test" + "_" + DATA + "_" + SPLIT + ".csv")

In [4]:
# Load trained models

clf_lr = pickle.load(open("../models/" + DATA + "_" + SPLIT + "_clf_lr", "rb"))
clf_svc = pickle.load(open("../models/" + DATA + "_" + SPLIT + "_clf_svc", "rb"))
clf_knearest = pickle.load(open("../models/" + DATA + "_" + SPLIT + "_clf_knearest", "rb"))
clf_dt = pickle.load(open("../models/" + DATA + "_" + SPLIT + "_clf_dt", "rb"))
clf_rf = pickle.load(open("../models/" + DATA + "_" + SPLIT + "_clf_rf", "rb"))

## Separate data sets by (un-)protected groups

At that moment, I chose 'gender', 'imd_band' and 'disability' as interesting features to evaluate fairness with.

In [5]:
X_test_gender_1 = X_test[X_test["gender"] == 1]
X_test_gender_0 = X_test[X_test["gender"] == 0]

y_test_gender_1 = y_test.loc[X_test_gender_1.index]
y_test_gender_0 = y_test.loc[X_test_gender_0.index]

In [6]:
X_test_imdband_1 = X_test[X_test["imd_band"] == 1]
X_test_imdband_0 = X_test[X_test["imd_band"] == 0]

y_test_imdband_1 = y_test.loc[X_test_imdband_1.index]
y_test_imdband_0 = y_test.loc[X_test_imdband_0.index]

In [7]:
X_test_disability_1 = X_test[X_test["disability"] == 1]
X_test_disability_0 = X_test[X_test["disability"] == 0]

y_test_disability_1 = y_test.loc[X_test_disability_1.index]
y_test_disability_0 = y_test.loc[X_test_disability_0.index]

## Manually compute the "distance vector"

### Prediction probability

No predict proba for SVC.

In [8]:
lr_y_pred_proba_gender_1 = clf_lr.predict_proba(X_test_gender_1)[:, 1]
kn_y_pred_proba_gender_1 = clf_knearest.predict_proba(X_test_gender_1)[:, 1]
dt_y_pred_proba_gender_1 = clf_dt.predict_proba(X_test_gender_1)[:, 1]
rf_y_pred_proba_gender_1 = clf_rf.predict_proba(X_test_gender_1)[:, 1]

lr_y_pred_proba_gender_0 = clf_lr.predict_proba(X_test_gender_0)[:, 0]
kn_y_pred_proba_gender_0 = clf_knearest.predict_proba(X_test_gender_0)[:, 0]
dt_y_pred_proba_gender_0 = clf_dt.predict_proba(X_test_gender_0)[:, 0]
rf_y_pred_proba_gender_0 = clf_rf.predict_proba(X_test_gender_0)[:, 0]

In [9]:
lr_y_pred_proba_imdband_1 = clf_lr.predict_proba(X_test_imdband_1)[:, 1]
kn_y_pred_proba_imdband_1 = clf_knearest.predict_proba(X_test_imdband_1)[:, 1]
dt_y_pred_proba_imdband_1 = clf_dt.predict_proba(X_test_imdband_1)[:, 1]
rf_y_pred_proba_imdband_1 = clf_rf.predict_proba(X_test_imdband_1)[:, 1]

lr_y_pred_proba_imdband_0 = clf_lr.predict_proba(X_test_imdband_0)[:, 0]
kn_y_pred_proba_imdband_0 = clf_knearest.predict_proba(X_test_imdband_0)[:, 0]
dt_y_pred_proba_imdband_0 = clf_dt.predict_proba(X_test_imdband_0)[:, 0]
rf_y_pred_proba_imdband_0 = clf_rf.predict_proba(X_test_imdband_0)[:, 0]

In [10]:
lr_y_pred_proba_disability_1 = clf_lr.predict_proba(X_test_disability_1)[:, 1]
kn_y_pred_proba_disability_1 = clf_knearest.predict_proba(X_test_disability_1)[:, 1]
dt_y_pred_proba_disability_1 = clf_dt.predict_proba(X_test_disability_1)[:, 1]
rf_y_pred_proba_disability_1 = clf_rf.predict_proba(X_test_disability_1)[:, 1]

lr_y_pred_proba_disability_0 = clf_lr.predict_proba(X_test_disability_0)[:, 0]
kn_y_pred_proba_disability_0 = clf_knearest.predict_proba(X_test_disability_0)[:, 0]
dt_y_pred_proba_disability_0 = clf_dt.predict_proba(X_test_disability_0)[:, 0]
rf_y_pred_proba_disability_0 = clf_rf.predict_proba(X_test_disability_0)[:, 0]

The vectors does not have the same length as it depends on the number of 1 and 0 they have in their class.

In [11]:
def dense_values(vector):
    c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10 = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

    for x in vector:
        if x == 0.0:
            c0 = c0 + 1
        elif x == 0.1:
            c1 = c1 + 1
        elif x == 0.2:
            c2 = c2 + 1
        elif x == 0.3:
            c3 = c3 + 1
        elif x == 0.4:
            c4 = c4 + 1
        elif x == 0.5:
            c5 = c5 + 1
        elif x == 0.6:
            c6 = c6 + 1
        elif x == 0.7:
            c7 = c7 + 1
        elif x == 0.8:
            c8 = c8 + 1
        elif x == 0.9:
            c9 = c9 + 1
        else:
            c10 = c10 + 1
    
    v = np.array([c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10])
    v_normalized = v / np.sum(v)

    return v_normalized

In [12]:
# Round the values of probabilities
# Step 0.1 => 11 values from 0.0 to 1.0, round at 0.5

# Gender
lr_y_pred_proba_gender_1 = np.around(lr_y_pred_proba_gender_1, decimals=1)
kn_y_pred_proba_gender_1 = np.around(kn_y_pred_proba_gender_1, decimals=1)
dt_y_pred_proba_gender_1 = np.around(dt_y_pred_proba_gender_1, decimals=1)
rf_y_pred_proba_gender_1 = np.around(rf_y_pred_proba_gender_1, decimals=1)

lr_y_pred_proba_gender_0 = np.around(lr_y_pred_proba_gender_0, decimals=1)
kn_y_pred_proba_gender_0 = np.around(kn_y_pred_proba_gender_0, decimals=1)
dt_y_pred_proba_gender_0 = np.around(dt_y_pred_proba_gender_0, decimals=1)
rf_y_pred_proba_gender_0 = np.around(rf_y_pred_proba_gender_0, decimals=1)

# Imd band
lr_y_pred_proba_imdband_1 = np.around(lr_y_pred_proba_imdband_1, decimals=1)
kn_y_pred_proba_imdband_1 = np.around(kn_y_pred_proba_imdband_1, decimals=1)
dt_y_pred_proba_imdband_1 = np.around(dt_y_pred_proba_imdband_1, decimals=1)
rf_y_pred_proba_imdband_1 = np.around(rf_y_pred_proba_imdband_1, decimals=1)

lr_y_pred_proba_imdband_0 = np.around(lr_y_pred_proba_imdband_0, decimals=1)
kn_y_pred_proba_imdband_0 = np.around(kn_y_pred_proba_imdband_0, decimals=1)
dt_y_pred_proba_imdband_0 = np.around(dt_y_pred_proba_imdband_0, decimals=1)
rf_y_pred_proba_imdband_0 = np.around(rf_y_pred_proba_imdband_0, decimals=1)

# Disability
lr_y_pred_proba_disability_1 = np.around(lr_y_pred_proba_disability_1, decimals=1)
kn_y_pred_proba_disability_1 = np.around(kn_y_pred_proba_disability_1, decimals=1)
dt_y_pred_proba_disability_1 = np.around(dt_y_pred_proba_disability_1, decimals=1)
rf_y_pred_proba_disability_1 = np.around(rf_y_pred_proba_disability_1, decimals=1)

lr_y_pred_proba_disability_0 = np.around(lr_y_pred_proba_disability_0, decimals=1)
kn_y_pred_proba_disability_0 = np.around(kn_y_pred_proba_disability_0, decimals=1)
dt_y_pred_proba_disability_0 = np.around(dt_y_pred_proba_disability_0, decimals=1)
rf_y_pred_proba_disability_0 = np.around(rf_y_pred_proba_disability_0, decimals=1)

In [13]:
# Compute density vectors

# Gender
dens_lr_y_pred_proba_gender_1 = dense_values(lr_y_pred_proba_gender_1)
dens_kn_y_pred_proba_gender_1 = dense_values(kn_y_pred_proba_gender_1)
dens_dt_y_pred_proba_gender_1 = dense_values(dt_y_pred_proba_gender_1)
dens_rf_y_pred_proba_gender_1 = dense_values(rf_y_pred_proba_gender_1)

dens_lr_y_pred_proba_gender_0 = dense_values(lr_y_pred_proba_gender_0)
dens_kn_y_pred_proba_gender_0 = dense_values(kn_y_pred_proba_gender_0)
dens_dt_y_pred_proba_gender_0 = dense_values(dt_y_pred_proba_gender_0)
dens_rf_y_pred_proba_gender_0 = dense_values(rf_y_pred_proba_gender_0)

# Imd band
dens_lr_y_pred_proba_imdband_1 = dense_values(lr_y_pred_proba_imdband_1)
dens_kn_y_pred_proba_imdband_1 = dense_values(kn_y_pred_proba_imdband_1)
dens_dt_y_pred_proba_imdband_1 = dense_values(dt_y_pred_proba_imdband_1)
dens_rf_y_pred_proba_imdband_1 = dense_values(rf_y_pred_proba_imdband_1)

dens_lr_y_pred_proba_imdband_0 = dense_values(lr_y_pred_proba_imdband_0)
dens_kn_y_pred_proba_imdband_0 = dense_values(kn_y_pred_proba_imdband_0)
dens_dt_y_pred_proba_imdband_0 = dense_values(dt_y_pred_proba_imdband_0)
dens_rf_y_pred_proba_imdband_0 = dense_values(rf_y_pred_proba_imdband_0)

# Disability
dens_lr_y_pred_proba_disability_1 = dense_values(lr_y_pred_proba_disability_1)
dens_kn_y_pred_proba_disability_1 = dense_values(kn_y_pred_proba_disability_1)
dens_dt_y_pred_proba_disability_1 = dense_values(dt_y_pred_proba_disability_1)
dens_rf_y_pred_proba_disability_1 = dense_values(rf_y_pred_proba_disability_1)

dens_lr_y_pred_proba_disability_0 = dense_values(lr_y_pred_proba_disability_0)
dens_kn_y_pred_proba_disability_0 = dense_values(kn_y_pred_proba_disability_0)
dens_dt_y_pred_proba_disability_0 = dense_values(dt_y_pred_proba_disability_0)
dens_rf_y_pred_proba_disability_0 = dense_values(rf_y_pred_proba_disability_0)

In [14]:
# Difference

# Gender
d_lr_gender = np.absolute(dens_lr_y_pred_proba_gender_1 - dens_lr_y_pred_proba_gender_0).sum()
d_kn_gender = np.absolute(dens_kn_y_pred_proba_gender_1 - dens_kn_y_pred_proba_gender_0).sum()
d_dt_gender = np.absolute(dens_dt_y_pred_proba_gender_1 - dens_dt_y_pred_proba_gender_0).sum()
d_rf_gender = np.absolute(dens_rf_y_pred_proba_gender_1 - dens_rf_y_pred_proba_gender_0).sum()

# Imd band
d_lr_imdband = np.absolute(dens_lr_y_pred_proba_imdband_1 - dens_lr_y_pred_proba_imdband_0).sum()
d_kn_imdband = np.absolute(dens_kn_y_pred_proba_imdband_1 - dens_kn_y_pred_proba_imdband_0).sum()
d_dt_imdband = np.absolute(dens_dt_y_pred_proba_imdband_1 - dens_dt_y_pred_proba_imdband_0).sum()
d_rf_imdband = np.absolute(dens_rf_y_pred_proba_imdband_1 - dens_rf_y_pred_proba_imdband_0).sum()

# Disability
d_lr_disability = np.absolute(dens_lr_y_pred_proba_disability_1 - dens_lr_y_pred_proba_disability_0).sum()
d_kn_disability = np.absolute(dens_kn_y_pred_proba_disability_1 - dens_kn_y_pred_proba_disability_0).sum()
d_dt_disability = np.absolute(dens_dt_y_pred_proba_disability_1 - dens_dt_y_pred_proba_disability_0).sum()
d_rf_disability = np.absolute(dens_rf_y_pred_proba_disability_1 - dens_rf_y_pred_proba_disability_0).sum()

In [15]:
print(round(d_lr_gender, 2))
print(round(d_kn_gender, 2))
print(round(d_dt_gender, 2))
print(round(d_rf_gender, 2))

1.43
1.2
0.93
1.06


In [16]:
print(round(d_lr_imdband, 2))
print(round(d_kn_imdband, 2))
print(round(d_dt_imdband, 2))
print(round(d_rf_imdband, 2))

1.53
1.21
0.9
1.08


In [17]:
print(round(d_lr_disability, 2))
print(round(d_kn_disability, 2))
print(round(d_dt_disability, 2))
print(round(d_rf_disability, 2))

1.31
1.03
0.83
0.93
