In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

from utils import classification_report
from separation_mvp import SeparatedClassifier

In [13]:
# Synthetic fairness dataset fetched over HTTP from GitHub (network access required).
# Columns as seen in the preview: y (label), A (group indicator), X1, X2, X3 (features).
url = "https://raw.githubusercontent.com/omarfsosa/datasets/master/fairness_synthetic_data.csv"
df = pd.read_csv(url)
# A bare last expression lets Jupyter render the preview with its rich display
# instead of pushing the plain-text repr through stdout.
df.head()

     y  A   X1   X2        X3
0  0.0  0  1.0  0.0  0.750524
1  0.0  0  0.0  1.0  0.550230
2  1.0  0  1.0  1.0  0.672612
3  0.0  0  1.0  0.0  0.329655
4  0.0  0  1.0  0.0  0.849663


In [3]:
# Hold out 60% of the rows for evaluation (fixed seed for a repeatable split).
# The group column A is passed as a third array so that A_train / A_test stay
# row-aligned with the feature and label subsets. Only "y" is dropped from the
# feature matrix, so A is also still present as a model input.
X_train, X_test, y_train, y_test, A_train, A_test = train_test_split(
    df.drop("y", axis="columns"),
    df["y"],
    df["A"],
    random_state=42,
    test_size=0.6,
)

In [4]:
# Baseline model: plain logistic regression on the training split.
# `fit` returns the estimator itself, so construction and fitting can be chained.
clf = LogisticRegression(solver="lbfgs").fit(X_train, y_train)
# Hard 0/1 predictions on the held-out rows.
y_pred = clf.predict(X_test)

In [5]:
# Baseline TPR/FPR on the test split, broken down by group A plus an "All" row.
# `classification_report` here is the project helper imported from `utils`
# (not sklearn.metrics); it is called with the group labels as a third argument.
print(classification_report(y_test, y_pred, A_test))

A    TPR   FPR  
----------------
0    0.41  0.10 
1    0.88  0.11 
All  0.69  0.11 


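## Fit a separated classifier

The baseline above gives the two groups very different true-positive rates (0.41 vs 0.88) at nearly the same false-positive rate. Below, its scores are post-processed with per-group randomized thresholds so that both groups land on approximately the same (FPR, TPR) operating point.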

In [6]:
# Scores R fed to the post-processor: column 1 of predict_proba, i.e. the
# probability of the second entry in clf.classes_, for each split.
R_train, R_test = (
    clf.predict_proba(features)[:, 1] for features in (X_train, X_test)
)

In [7]:
# Target operating point that the post-processed classifier is fitted towards.
# NOTE(review): how these two values were derived is not shown in this notebook
# -- presumably a (FPR, TPR) point attainable for both groups; confirm and
# record the provenance.
goal_tpr = 0.83591123066577
goal_fpr = 0.2639968121139669

In [8]:
# Build the post-processor from the training labels, the baseline scores and
# the group membership of the training rows.
# NOTE(review): the positional order (y, R, A) is taken on trust -- confirm
# against separation_mvp.SeparatedClassifier.
fair_clf = SeparatedClassifier(y_train, R_train, A_train)

In [9]:
# Fit towards the target operating point. FPR is passed first here, whereas the
# goals were assigned TPR-first.
# NOTE(review): confirm that `fit` expects (fpr, tpr) in this order; the fair
# report further down (TPR ~0.84, FPR ~0.27) is consistent with it.
fair_clf.fit(goal_fpr, goal_tpr)

In [10]:
# One summary line per group: the first three entries stored for that group in
# `randomized_thresholds`, labelled t0, t1 and p and shown to two decimals.
# (Presumably two score cut-offs and a mixing probability -- confirm against
# separation_mvp.)
for group, params in fair_clf.randomized_thresholds.items():
    print(f"Group {group}: t0={params[0]:.2f}, t1={params[1]:.2f}, p={params[2]:.2f}")

Group 0: t0=0.22, t1=0.22, p=0.50
Group 1: t0=0.03, t1=0.68, p=0.62


In [11]:
# Apply the fitted per-group rule to the held-out scores and group labels.
# NOTE(review): the fitted rule lives in an attribute named
# `randomized_thresholds` and carries a `p` value per group, so these
# predictions are presumably randomized. No seed is set anywhere in this
# notebook, so reruns may give slightly different rates -- confirm how
# `fair_predict` draws its random numbers and seed it if needed.
y_pred_fair = fair_clf.fair_predict(R_test, A_test)

In [12]:
# Same per-group TPR/FPR report as for the baseline, now on the post-processed
# predictions, so the two tables can be compared row by row.
print(classification_report(y_test, y_pred_fair, A_test))

A    TPR   FPR  
----------------
0    0.84  0.26 
1    0.83  0.28 
All  0.84  0.27 
