In [5]:
# Install required packages if not already installed
# !pip install fairlearn scikit-learn pandas numpy

# Import libraries
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from fairlearn.reductions import ExponentiatedGradient, DemographicParity
from fairlearn.metrics import MetricFrame, selection_rate, demographic_parity_difference

In [7]:
# Fetch the Adult income dataset from OpenML (dataset id 1590) as a pandas
# frame, keeping only the rows that contain no missing values.
data = fetch_openml(data_id=1590, as_frame=True)
df = data.frame.dropna()

In [9]:
# Binary target: 1 where the 'class' column equals '>50K', 0 otherwise.
# The index is reset so the labels carry a fresh 0..n-1 index.
is_high_income = df['class'] == '>50K'
y = is_high_income.astype(int).reset_index(drop=True)

# Features are every column except the target.
X = df.drop(columns='class')

# One-hot encode categorical columns, dropping the first level of each.
X_encoded = pd.get_dummies(X, drop_first=True)

In [11]:
# Sensitive attribute used for the fairness analysis (index reset like y)
A = X['sex'].reset_index(drop=True)

# 70/30 train/test split applied jointly to features, labels and the
# sensitive attribute; the fixed seed keeps the partition repeatable.
split_parts = train_test_split(X_encoded, y, A, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test, A_train, A_test = split_parts

In [13]:
# Standardize every encoded column (one-hot dummies included).
# The scaler is fitted on the training split only and then applied to both
# splits, so no test-set statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [15]:
# Baseline: plain logistic regression with no bias mitigation.
# fit() returns the estimator, so construction and training are chained.
clf = LogisticRegression(max_iter=2000).fit(X_train_scaled, y_train)

# Hard class predictions on the held-out split
y_pred = clf.predict(X_test_scaled)

In [17]:
# Selection rate of the baseline predictions, broken down by the
# sensitive attribute (no mitigation).
mf = MetricFrame(
    metrics=selection_rate,
    y_true=y_test,
    y_pred=y_pred,
    sensitive_features=A_test,
)
print("Selection rates (no mitigation):")
print(mf.by_group)

Selection rates (no mitigation):
sex
Female    0.078219
Male      0.261562
Name: selection_rate, dtype: float64


In [19]:
# Compute demographic parity difference (no mitigation)
print("\nDemographic Parity Difference (no mitigation):")
print(demographic_parity_difference(y_test, y_pred, sensitive_features=A_test))

# Train fair model using Exponentiated Gradient with a Demographic Parity
# constraint. The liblinear base estimator is seeded so that repeated runs
# of this cell fit the same models.
# NOTE(review): eps=0.01 is taken to be the allowed constraint violation —
# confirm against the fairlearn ExponentiatedGradient documentation.
fair_model = ExponentiatedGradient(
    estimator=LogisticRegression(solver='liblinear', random_state=42),
    constraints=DemographicParity(),
    eps=0.01
)
fair_model.fit(X_train_scaled, y_train, sensitive_features=A_train)

# ExponentiatedGradient.predict is randomized: without a seed, re-running
# this cell can produce different predictions and therefore different
# fairness/accuracy numbers below. Seed it with the same value used for
# the train/test split.
y_pred_fair = fair_model.predict(X_test_scaled, random_state=42)


Demographic Parity Difference (no mitigation):
0.1833430536551267


In [20]:
# Selection rate of the mitigated model's predictions, broken down by the
# sensitive attribute.
mf_fair = MetricFrame(
    metrics=selection_rate,
    y_true=y_test,
    y_pred=y_pred_fair,
    sensitive_features=A_test,
)
print("\nSelection rates (with mitigation):")
print(mf_fair.by_group)


Selection rates (with mitigation):
sex
Female    0.156207
Male      0.174808
Name: selection_rate, dtype: float64


In [21]:
# Demographic parity difference of the mitigated predictions on the test split
dpd_fair = demographic_parity_difference(
    y_test,
    y_pred_fair,
    sensitive_features=A_test,
)
print("\nDemographic Parity Difference (with mitigation):")
print(dpd_fair)


Demographic Parity Difference (with mitigation):
0.01860101736758024


In [22]:
# Test-set accuracy of the baseline and the mitigated model, reported
# together to show the accuracy cost of enforcing the fairness constraint.
acc_baseline = accuracy_score(y_test, y_pred)
acc_fair = accuracy_score(y_test, y_pred_fair)
print("\nAccuracy (no mitigation):", acc_baseline)
print("Accuracy (with mitigation):", acc_fair)


Accuracy (no mitigation): 0.850077393675831
Accuracy (with mitigation): 0.8339352841453527
