In [1]:
import numpy as np
import pandas as pd
import pickle
import json

from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from fairlearn.metrics import (
    MetricFrame,
    selection_rate,
    demographic_parity_difference,
    demographic_parity_ratio,
    true_positive_rate,
    false_positive_rate,
    count
)

from fairlearn.datasets import fetch_diabetes_hospital

In [2]:
# Fetch the diabetes-hospital dataset from fairlearn as pandas objects and work
# on private copies of the feature frame and target.
data = fetch_diabetes_hospital(as_frame=True)
X = data.data.copy()
y = data.target.copy()

# Drop these two columns from the features when present; errors="ignore"
# already skips any label that is missing from the frame.
# NOTE(review): presumably these are alternative encodings of the outcome and
# are removed to avoid label leakage — confirm against the dataset docs.
dropped_columns = ['readmitted', 'readmit_binary']
X = X.drop(columns=dropped_columns, errors="ignore")

# Re-attach the label as an integer column: 1 where the fetched target equals 1.
# NOTE(review): the column is named `readmit_binary` but is built from
# `data.target`; verify which outcome fairlearn returns as the target.
real_data = X.copy()
real_data['readmit_binary'] = (y == 1).astype(int)

# 80/20 split, stratified on the label with a fixed seed; each part gets a
# fresh 0..n-1 index.
real_train, real_test = (
    split.reset_index(drop=True)
    for split in train_test_split(
        real_data,
        test_size=0.2,
        random_state=66,
        stratify=real_data["readmit_binary"],
    )
)

real_train.shape, real_test.shape

((81412, 23), (20354, 23))

In [3]:
def build_model_pipeline(X_train: pd.DataFrame, model):
    """Wrap ``model`` in a preprocessing pipeline derived from ``X_train``.

    Columns whose dtype is ``object`` or ``category`` are treated as
    categorical (most-frequent imputation, then one-hot encoding that ignores
    categories unseen at fit time). Every other column is treated as numeric
    (median imputation, then scaling without centering).

    Parameters
    ----------
    X_train : pd.DataFrame
        Feature frame used only to decide which columns go to which branch.
    model
        Estimator placed as the final ``"model"`` step.

    Returns
    -------
    Pipeline
        Unfitted pipeline with steps ``"preprocess"`` and ``"model"``.
    """
    categorical = X_train.select_dtypes(include=["object", "category"]).columns.tolist()
    categorical_lookup = set(categorical)
    # Preserve the frame's column order for the numeric branch.
    numeric = [name for name in X_train.columns if name not in categorical_lookup]

    numeric_branch = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler(with_mean=False)),
    ])
    categorical_branch = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ])

    preprocess = ColumnTransformer(
        transformers=[
            ("num", numeric_branch, numeric),
            ("cat", categorical_branch, categorical),
        ],
        remainder="drop",
    )

    return Pipeline(steps=[("preprocess", preprocess), ("model", model)])

In [4]:
def error_rate(y_true, y_pred):
    """Return the misclassification rate, i.e. one minus the accuracy score."""
    accuracy = accuracy_score(y_true, y_pred)
    return 1.0 - accuracy

def make_fairness_report(y_true, y_pred, sensitive_features, label=""):
    """Compute per-group metrics and disparity summaries for one prediction set.

    Parameters
    ----------
    y_true, y_pred
        True labels and predicted labels, passed through to fairlearn.
    sensitive_features
        Grouping feature(s) used to disaggregate the metrics.
    label : str
        Free-text tag stored in the summary.

    Returns
    -------
    dict
        ``"overall"``: the MetricFrame's overall metrics;
        ``"by_group"``: a copy of the per-group metric table;
        ``"summary"``: ``label``, ``dp_diff``, ``dp_ratio`` and a
        ``<metric>_gap`` (max minus min across groups) for selection rate,
        error rate, TPR and FPR.
    """
    frame = MetricFrame(
        metrics={
            "count": count,
            "selection_rate": selection_rate,
            "accuracy": accuracy_score,
            "error_rate": error_rate,
            "TPR": true_positive_rate,
            "FPR": false_positive_rate,
        },
        y_true=y_true,
        y_pred=y_pred,
        sensitive_features=sensitive_features,
    )

    summary = {
        "label": label,
        "dp_diff": float(
            demographic_parity_difference(y_true, y_pred, sensitive_features=sensitive_features)
        ),
        "dp_ratio": float(
            demographic_parity_ratio(y_true, y_pred, sensitive_features=sensitive_features)
        ),
    }

    per_group = frame.by_group.copy()
    # Spread between the best and worst group for each disparity metric.
    summary.update(
        (f"{metric}_gap", float(per_group[metric].max() - per_group[metric].min()))
        for metric in ("selection_rate", "error_rate", "TPR", "FPR")
        if metric in per_group.columns
    )

    return {"overall": frame.overall, "by_group": per_group, "summary": summary}

In [5]:
def train_eval_on_real_test(train_df, real_test, target, model, drop_sensitive=False):
    """Fit ``model`` on ``train_df`` and predict on the real hold-out frame.

    Parameters
    ----------
    train_df : pd.DataFrame
        Training data (real or synthetic) containing the ``target`` column.
    real_test : pd.DataFrame
        Hold-out data containing the ``target`` column.
    target : str
        Name of the label column; it is cast to ``int`` on both frames.
    model
        Estimator handed to ``build_model_pipeline``. No copy is made in this
        function, so the object passed in is the one that gets fitted.
    drop_sensitive : bool
        When true, ``race`` and ``gender`` are also removed from the features
        (only those present in ``train_df``).

    Returns
    -------
    tuple
        ``(fitted_pipeline, y_test, y_pred)``.
    """
    train_frame = train_df.copy()
    test_frame = real_test.copy()

    excluded = [target]
    if drop_sensitive:
        excluded.extend(col for col in ("race", "gender") if col in train_frame.columns)

    def split_features_and_label(frame):
        # errors="ignore" tolerates excluded columns that the frame lacks.
        return frame.drop(columns=excluded, errors="ignore"), frame[target].astype(int)

    X_train, y_train = split_features_and_label(train_frame)
    X_test, y_test = split_features_and_label(test_frame)

    pipe = build_model_pipeline(X_train, model)
    pipe.fit(X_train, y_train)

    return pipe, y_test, pipe.predict(X_test)

In [6]:
def label_rate_by_group(df, target, sensitive):
    """Report group sizes and positive-label rates for ``df[target]``.

    The integer-cast label is passed as both ``y_true`` and ``y_pred`` so that
    ``selection_rate`` is evaluated on the labels themselves.

    Returns
    -------
    tuple
        ``(overall, by_group)`` from the MetricFrame, each holding ``count``
        and ``label_rate``.
    """
    labels = df[target].astype(int)

    frame = MetricFrame(
        metrics={"count": count, "label_rate": selection_rate},
        y_true=labels,
        y_pred=labels,
        sensitive_features=sensitive,
    )

    return frame.overall, frame.by_group

In [7]:
# Label rates in the real training split, grouped by race, by gender, and by
# the race/gender combination.
real_overall_r, real_by_r = label_rate_by_group(real_train, "readmit_binary", real_train['race'])
real_overall_g, real_by_g = label_rate_by_group(real_train, "readmit_binary", real_train["gender"])
real_overall_rg, real_by_rg = label_rate_by_group(real_train, "readmit_binary", real_train[["race", "gender"]])

# Show at most ten groups per view, highest label rate first.
display(real_by_r.sort_values("label_rate", ascending=False).head(10))
display(real_by_g.sort_values("label_rate", ascending=False).head(10))
display(real_by_rg.sort_values("label_rate", ascending=False).head(10))

Unnamed: 0_level_0,count,label_rate
race,Unnamed: 1_level_1,Unnamed: 2_level_1
Caucasian,60958.0,0.11316
AfricanAmerican,15257.0,0.112014
Hispanic,1626.0,0.102706
Asian,499.0,0.1002
Other,1208.0,0.092715
Unknown,1864.0,0.080472


Unnamed: 0_level_0,count,label_rate
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,43765.0,0.112441
Male,37644.0,0.110642
Unknown/Invalid,3.0,0.0


Unnamed: 0_level_0,Unnamed: 1_level_0,count,label_rate
race,gender,Unnamed: 2_level_1,Unnamed: 3_level_1
Caucasian,Female,31791.0,0.114938
AfricanAmerican,Male,5956.0,0.113835
Asian,Male,246.0,0.113821
Hispanic,Male,752.0,0.113032
Caucasian,Male,29167.0,0.111222
AfricanAmerican,Female,9301.0,0.110848
Other,Female,604.0,0.096026
Hispanic,Female,874.0,0.093822
Other,Male,603.0,0.089552
Asian,Female,253.0,0.086957


In [8]:
def load_model_syn_data(model_path, sample_len):
    """Load a pickled synthesizer and draw a synthetic sample from it.

    Parameters
    ----------
    model_path : str or pathlib.Path
        Location of the pickled model artifact.
    sample_len : int
        Number of rows to request, passed as ``num_rows`` to ``model.sample``.

    Returns
    -------
    tuple
        ``(model, synthetic_dataset)``.

    Raises
    ------
    FileNotFoundError
        If ``model_path`` does not exist. Previously the function fell through
        and returned ``None``, which made callers fail with an unhelpful
        "cannot unpack" ``TypeError``.
    """
    model_path = Path(model_path)
    if not model_path.exists():
        raise FileNotFoundError(f"Synthesizer artifact not found: {model_path}")

    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load artifacts produced by a trusted run of this project.
    with model_path.open("rb") as f:
        model = pickle.load(f)

    synthetic_dataset = model.sample(num_rows=sample_len)
    return model, synthetic_dataset

In [9]:
# Locations of the pickled synthesizers, relative to the notebook's directory.
# NOTE(review): "copuula" is spelled with a double "u"; the string must match
# the artifact's name on disk, so confirm before correcting it here.
gc_path = Path("../artifacts/gaussian_copuula_diabetes.pkl")
ct_path = Path("../artifacts/ctgan_diabetes.pkl")
tv_path = Path("../artifacts/tvae_diabetes.pkl")

In [10]:
# Request as many synthetic rows as the real training split has.
sample_len = len(real_train)
# Load the synthesizer stored at gc_path and sample from it.
gc_model, gc_gendata = load_model_syn_data(gc_path, sample_len)
gc_gendata.head()

Unnamed: 0,race,gender,age,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,...,A1Cresult,insulin,change,diabetesMed,medicare,medicaid,had_emergency,had_inpatient_days,had_outpatient_days,readmit_binary
0,Caucasian,Male,'Over 60 years','Discharged to Home',Emergency,4,Missing,54,3,9,...,,Steady,Ch,Yes,True,False,False,True,False,False
1,Caucasian,Male,'30 years or younger','Discharged to Home',Emergency,6,Family/GeneralPractice,29,0,15,...,,No,Ch,Yes,False,False,False,False,True,False
2,Caucasian,Male,'30-60 years','Discharged to Home',Emergency,4,Emergency/Trauma,11,0,5,...,,Steady,No,Yes,True,False,False,False,False,False
3,Caucasian,Male,'30-60 years','Discharged to Home',Emergency,2,Missing,6,6,8,...,,No,Ch,No,False,False,False,True,False,True
4,Unknown,Female,'Over 60 years','Discharged to Home',Other,6,InternalMedicine,53,0,27,...,,No,Ch,Yes,False,False,False,False,True,False


In [11]:
# Load the synthesizer stored at ct_path and sample sample_len rows from it.
ct_model, ct_gendata = load_model_syn_data(ct_path, sample_len)
ct_gendata.head()

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Unnamed: 0,race,gender,age,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,...,A1Cresult,insulin,change,diabetesMed,medicare,medicaid,had_emergency,had_inpatient_days,had_outpatient_days,readmit_binary
0,AfricanAmerican,Female,'Over 60 years','Discharged to Home',Referral,3,Missing,19,5,6,...,,Steady,No,Yes,True,False,False,False,False,False
1,Caucasian,Female,'Over 60 years',Other,Emergency,6,InternalMedicine,62,0,21,...,,No,No,Yes,True,False,False,False,False,False
2,Hispanic,Male,'30-60 years','Discharged to Home',Emergency,4,Missing,42,2,32,...,,Up,Ch,No,False,False,False,False,False,False
3,Caucasian,Female,'Over 60 years','Discharged to Home',Emergency,2,Missing,46,1,9,...,,No,No,Yes,True,False,True,False,False,True
4,Caucasian,Female,'30-60 years','Discharged to Home',Referral,4,Family/GeneralPractice,40,0,23,...,,No,Ch,Yes,True,False,False,False,False,False


In [12]:
# Load the synthesizer stored at tv_path and sample sample_len rows from it.
tv_model, tv_gendata = load_model_syn_data(tv_path, sample_len)
tv_gendata.head()

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Unnamed: 0,race,gender,age,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,...,A1Cresult,insulin,change,diabetesMed,medicare,medicaid,had_emergency,had_inpatient_days,had_outpatient_days,readmit_binary
0,Caucasian,Female,'30-60 years',Other,Emergency,6,Other,50,0,16,...,,No,No,No,True,False,False,False,False,False
1,Caucasian,Female,'Over 60 years',Other,Emergency,2,Missing,3,0,9,...,,No,No,No,False,False,False,False,False,False
2,AfricanAmerican,Male,'30-60 years','Discharged to Home',Emergency,5,Missing,50,3,10,...,,Steady,No,Yes,False,False,False,False,False,False
3,Caucasian,Female,'30-60 years','Discharged to Home',Emergency,8,Other,41,0,18,...,,Steady,No,Yes,False,False,False,False,False,False
4,Caucasian,Female,'30-60 years',Other,Emergency,2,Family/GeneralPractice,43,0,12,...,,No,No,No,False,False,False,True,False,False


In [13]:
# Synthetic training sets keyed by the display name used in reports below.
syn_datasets = {
    "GaussianCopula" : gc_gendata,
    "CTGAN" : ct_gendata,
    "TVAE" : tv_gendata,
}

In [14]:
# For each synthetic dataset, show the ten race/gender groups with the highest
# label rate.
for name, syn_train in syn_datasets.items():
    print("\n=== Synthetic dataset:", name, "===\n")
    # `o` (the overall metrics) is not used afterwards in this cell.
    o, by = label_rate_by_group(syn_train, "readmit_binary", syn_train[["race","gender"]])
    display(by.sort_values("label_rate", ascending=False).head(10))


=== Synthetic dataset: GaussianCopula ===



Unnamed: 0_level_0,Unnamed: 1_level_0,count,label_rate
race,gender,Unnamed: 2_level_1,Unnamed: 3_level_1
Asian,Male,260.0,0.134615
Other,Female,588.0,0.117347
Unknown,Female,921.0,0.116178
Hispanic,Female,843.0,0.113879
AfricanAmerican,Male,6511.0,0.113807
AfricanAmerican,Female,8421.0,0.113526
Caucasian,Male,28548.0,0.110165
Caucasian,Female,32749.0,0.109774
Other,Male,624.0,0.107372
Asian,Female,216.0,0.101852



=== Synthetic dataset: CTGAN ===



Unnamed: 0_level_0,Unnamed: 1_level_0,count,label_rate
race,gender,Unnamed: 2_level_1,Unnamed: 3_level_1
Other,Unknown/Invalid,4.0,0.5
Caucasian,Female,51913.0,0.171787
Caucasian,Unknown/Invalid,199.0,0.160804
Caucasian,Male,14895.0,0.152736
AfricanAmerican,Female,5343.0,0.140371
Unknown,Unknown/Invalid,38.0,0.131579
Other,Male,493.0,0.117647
Hispanic,Unknown/Invalid,9.0,0.111111
Other,Female,2006.0,0.107677
AfricanAmerican,Male,1280.0,0.099219



=== Synthetic dataset: TVAE ===



Unnamed: 0_level_0,Unnamed: 1_level_0,count,label_rate
race,gender,Unnamed: 2_level_1,Unnamed: 3_level_1
Asian,Male,1.0,1.0
Caucasian,Female,43065.0,0.169442
AfricanAmerican,Female,15639.0,0.169384
AfricanAmerican,Male,2583.0,0.166086
Caucasian,Male,18825.0,0.162497
Hispanic,Male,110.0,0.154545
Hispanic,Female,447.0,0.149888
Other,Female,464.0,0.142241
Asian,Female,15.0,0.133333
Other,Male,216.0,0.12037


In [15]:
# Name of the label column used for training and evaluation.
Target = "readmit_binary"

# Classifiers to evaluate, keyed by the name that appears in the result tables.
models = {
    "LogReg": LogisticRegression(max_iter=2000),
    "RandomForest": RandomForestClassifier(
        n_estimators=300,
        random_state=66,  # fixed seed for the forest
        n_jobs=-1
    ),
}

In [16]:
# Accumulator for one result record per (training set, model) combination.
all_results = []

# Sensitive features taken from the real hold-out split: race, gender, and
# both columns together.
A_race   = real_test["race"]
A_gender = real_test["gender"]
A_inter  = real_test[["race","gender"]]

def run_one_setting(setting_name, train_df, model_name, model):
    """Train on ``train_df``, score on the real test split, and build reports.

    Reads the notebook-level ``real_test``, ``Target``, ``A_race``,
    ``A_gender`` and ``A_inter``. Sensitive columns are kept as features
    (``drop_sensitive=False``).

    Returns
    -------
    dict
        ``setting``, ``model``, ``accuracy`` and one fairness report each under
        ``race``, ``gender`` and ``intersection``.
    """
    _, y_true, y_pred = train_eval_on_real_test(
        train_df=train_df,
        real_test=real_test,
        target=Target,
        model=model,
        drop_sensitive=False,
    )

    result = {
        "setting": setting_name,
        "model": model_name,
        "accuracy": accuracy_score(y_true, y_pred),
    }

    # (result key, sensitive feature(s), tag used in the report label)
    view_specs = (
        ("race", A_race, "race"),
        ("gender", A_gender, "gender"),
        ("intersection", A_inter, "race×gender"),
    )
    for key, sensitive, tag in view_specs:
        result[key] = make_fairness_report(
            y_true,
            y_pred,
            sensitive,
            label=f"{setting_name} | {model_name} | {tag}",
        )

    return result

# Baseline: every model trained on the real training split.
for model_name, model in models.items():
    res = run_one_setting("RealTrain->RealTest", real_train, model_name, model)
    all_results.append(res)

# Every model trained on each synthetic dataset; evaluation is still done on
# the real test split inside run_one_setting.
for synth_name, syn_train in syn_datasets.items():
    for model_name, model in models.items():
        res = run_one_setting(f"{synth_name}Train->RealTest", syn_train, model_name, model)
        all_results.append(res)

In [17]:
# Number of collected result records.
len(all_results)

8

In [18]:
# Sanity check: report result records that are missing entirely or that lack
# one of the three fairness views.
for i, res in enumerate(all_results):
    if res is None:
        print("Row", i, "is None.")
        # A missing record has no views to inspect; without skipping it the
        # res.get(...) calls below would raise AttributeError on None.
        continue

    for view_name in ["race", "gender", "intersection"]:
        if res.get(view_name) is None:
            print(
                "Missing view: ", view_name,
                "| index: ", i,
                "| setting: ", res.get("setting"),
                "| model: ", res.get("model"),
            )

In [19]:
# Flatten every (result, view) pair into one row of headline numbers.
summary_rows = []

for res in all_results:
    for view_name in ["race", "gender", "intersection"]:
        s = res[view_name]["summary"]
        row = {
            "setting": res["setting"],
            "model": res["model"],
            "group_view": view_name,
            "accuracy": res["accuracy"],
            "dp_diff": s["dp_diff"],
            "dp_ratio": s["dp_ratio"],
        }
        # Gap entries are optional in the summary; fall back to NaN.
        row.update({
            gap_key: s.get(gap_key, np.nan)
            for gap_key in ("selection_rate_gap", "error_rate_gap", "TPR_gap", "FPR_gap")
        })
        summary_rows.append(row)

summary_df = pd.DataFrame(summary_rows)
summary_df = summary_df.sort_values(["group_view", "setting", "model"])

display(summary_df)

Unnamed: 0,setting,model,group_view,accuracy,dp_diff,dp_ratio,selection_rate_gap,error_rate_gap,TPR_gap,FPR_gap
13,CTGANTrain->RealTest,LogReg,gender,0.888228,0.000138,0.697673,0.000138,0.002151,0.000149,0.000173
16,CTGANTrain->RealTest,RandomForest,gender,0.885428,0.003923,0.403746,0.003923,0.004197,0.007527,0.003463
7,GaussianCopulaTrain->RealTest,LogReg,gender,0.888425,0.0,,0.0,0.001983,0.0,0.0
10,GaussianCopulaTrain->RealTest,RandomForest,gender,0.888277,0.000121,0.430001,0.000121,0.001862,0.0,0.000136
1,RealTrain->RealTest,LogReg,gender,0.888425,0.0,,0.0,0.001983,0.0,0.0
4,RealTrain->RealTest,RandomForest,gender,0.887442,0.000295,0.859452,0.000295,0.002032,0.001028,0.000197
19,TVAETrain->RealTest,LogReg,gender,0.888228,0.000667,0.215,0.000667,0.001953,0.002885,0.000391
22,TVAETrain->RealTest,RandomForest,gender,0.872998,0.005961,0.771112,0.005961,0.005577,0.009997,0.005425
14,CTGANTrain->RealTest,LogReg,intersection,0.888228,0.000633,0.0,0.000633,0.138062,0.001269,0.000572
17,CTGANTrain->RealTest,RandomForest,intersection,0.885428,0.00899,0.0,0.00899,0.138062,0.015453,0.008152


In [20]:
def show_group_table(setting_contains: str, model_name: str, view="intersection", metric_sort="selection_rate"):
    """Display the per-group table of the first matching result.

    A result matches when ``setting_contains`` is a substring of its setting
    and its model name equals ``model_name``. The table for ``view`` is shown
    sorted by ``metric_sort`` in descending order, limited to 30 rows. Prints
    ``"Not found."`` when nothing in ``all_results`` matches.
    """
    match = next(
        (
            candidate
            for candidate in all_results
            if (setting_contains in candidate["setting"]) and (candidate["model"] == model_name)
        ),
        None,
    )
    if match is None:
        print("Not found.")
        return

    table = match[view]["by_group"].copy()
    display(table.sort_values(metric_sort, ascending=False).head(30))

# Inspect the race/gender groups of the CTGAN-trained LogReg run, highest
# error rate first.
show_group_table("CTGANTrain->RealTest", "LogReg", view="intersection", metric_sort="error_rate")

Unnamed: 0_level_0,Unnamed: 1_level_0,count,selection_rate,accuracy,error_rate,TPR,FPR
race,gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Asian,Male,77.0,0.0,0.831169,0.168831,0.0,0.0
Hispanic,Male,193.0,0.0,0.860104,0.139896,0.0,0.0
Other,Female,144.0,0.0,0.868056,0.131944,0.0,0.0
AfricanAmerican,Male,1526.0,0.0,0.88401,0.11599,0.0,0.0
Caucasian,Female,7898.0,0.000633,0.884908,0.115092,0.001104,0.000572
AfricanAmerican,Female,2427.0,0.0,0.889164,0.110836,0.0,0.0
Caucasian,Male,7243.0,0.000414,0.891067,0.108933,0.001269,0.00031
Unknown,Male,218.0,0.0,0.90367,0.09633,0.0,0.0
Other,Male,154.0,0.0,0.909091,0.090909,0.0,0.0
Unknown,Female,191.0,0.0,0.910995,0.089005,0.0,0.0


In [21]:
# Persist the summary table and one per-group CSV for every (result, view).
out_dir = Path("../artifacts/fairness_report")
out_dir.mkdir(parents=True, exist_ok=True)

summary_df.to_csv(out_dir / "fairness_summary.csv", index=False)

for res in all_results:
    # File-name-friendly form of the setting; it does not depend on the view,
    # so it is computed once per result.
    safe_name = (
        res["setting"]
        .replace("->", "_to_")
        .replace("×", "x")
        .replace(" ", "_")
    )
    for view_name in ["race", "gender", "intersection"]:
        by = res[view_name]["by_group"].reset_index()
        csv_path = out_dir / f"bygroup__{safe_name}__{res['model']}__{view_name}.csv"
        by.to_csv(csv_path, index=False)

print("Saved to:", out_dir)

Saved to: ../artifacts/fairness_report


2026-03-02 06:13:54.963 
  command:

    streamlit run /home/pengbaoz/.conda/envs/capstone_project/lib/python3.11/site-packages/ipykernel_launcher.py [ARGUMENTS]
2026-03-02 06:13:55.556 Session state does not function when running a script without `streamlit run`


DeltaGenerator()