In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

from fairlearn.metrics import demographic_parity_difference, equalized_odds_difference

import shap
from lime.lime_tabular import LimeTabularExplainer
import matplotlib.pyplot as plt


In [2]:
# Load the Adult census extract; the path is relative to the notebook's directory.
df = pd.read_csv("../data/adult.csv")

# Trim whitespace on text columns *before* looking for the '?' missing-value
# marker, so that padded markers such as ' ?' are recognised as missing too
# instead of surviving dropna() and turning into a '?' category afterwards.
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].str.strip()

# Treat '?' as missing and drop every row that has at least one missing field.
df = df.replace('?', np.nan).dropna()

# Binary target: 1 for '>50K', 0 for '<=50K'.
# Series.map() yields NaN for any label not in the dict, so fail loudly here
# rather than letting NaN leak into the target column.
income_codes = df['income'].map({'>50K': 1, '<=50K': 0})
unmapped_labels = sorted(df.loc[income_codes.isna(), 'income'].unique())
if unmapped_labels:
    raise ValueError(f"Unexpected income labels: {unmapped_labels}")
df['income'] = income_codes.astype('int64')

df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,0
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,0
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,0
5,34,Private,216864,HS-grad,9,Divorced,Other-service,Unmarried,White,Female,0,3770,45,United-States,0
6,38,Private,150601,10th,6,Separated,Adm-clerical,Unmarried,White,Male,0,3770,40,United-States,0


In [3]:
# Independent copy of the two sensitive-attribute columns exactly as they
# appear in df, so later changes to either frame do not affect the other.
df_fairness = df.loc[:, ['sex', 'race']].copy()
df_fairness.head()

Unnamed: 0,sex,race
1,Female,White
3,Female,White
4,Female,White
5,Female,White
6,Male,White


In [4]:
# Target vector: the 'income' column of df.
y = df['income']

# Feature matrix: every other column, with categoricals one-hot encoded.
# drop_first=True omits the first level of each categorical column.
X = pd.get_dummies(df.drop(columns=['income']), drop_first=True)

X.head()

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,...,native.country_Portugal,native.country_Puerto-Rico,native.country_Scotland,native.country_South,native.country_Taiwan,native.country_Thailand,native.country_Trinadad&Tobago,native.country_United-States,native.country_Vietnam,native.country_Yugoslavia
1,82,132870,9,0,4356,18,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
3,54,140359,4,0,3900,40,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
4,41,264663,10,0,3900,40,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
5,34,216864,9,0,3770,45,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
6,38,150601,6,0,3770,40,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False


In [5]:
import json

# Persist the encoded column names, in their current order, as a JSON array
# in feature_list.json (written to the current working directory).
feature_list = list(X.columns)

with open("feature_list.json", "w") as out_file:
    json.dump(feature_list, out_file)

print("feature_list.json saved!")


feature_list.json saved!


In [6]:
from sklearn.model_selection import train_test_split

# The sensitive attributes are passed through the same split call as X and y,
# so each train/test partition of sex and race lines up row-for-row with the
# corresponding partition of the features and target.
A_sex, A_race = df_fairness['sex'], df_fairness['race']

# 80/20 split, stratified on the target, with a fixed seed for repeatability.
split_parts = train_test_split(
    X, y, A_sex, A_race,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# train_test_split returns a (train, test) pair per input, in input order.
(
    X_train, X_test,
    y_train, y_test,
    A_sex_train, A_sex_test,
    A_race_train, A_race_test,
) = split_parts

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


Train shape: (24129, 96)
Test shape: (6033, 96)
