<a href="https://colab.research.google.com/github/mikemaurrasse-hash/Final-Project/blob/final-deliverable/AIM460_Final_READY_FINAL_PATCHED.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# AIM460 Final Project ‚Äî Fairness Benchmark
**Author:** Mike Maurrasse


## 1) Introduction

This AIM460 Final Project explores fairness benchmarking across multiple datasets ‚Äî Folktables ACSIncome, COMPAS, CivilComments, FairJob, and FairFace ‚Äî using logistic-regression baselines and fairness-aware methods. The goal is to evaluate model accuracy and equity gaps across demographic groups, highlighting trade-offs between performance and fairness in real-world data.  
Through this cross-domain comparison, the project demonstrates how bias manifests differently across text, vision, and structured data, providing a unified framework for quantitative fairness evaluation and responsible AI deployment.


## 2) Setup & Environment Initialization


In [1]:

# Quiet installs and core dirs
import os, sys, urllib.request, pathlib, warnings
warnings.filterwarnings("ignore")
os.environ.update({
    "WANDB_DISABLED":"true","WANDB_MODE":"disabled","WANDB_SILENT":"true","DISABLE_WANDB":"true"
})
!pip -q install folktables datasets transformers torch torchvision scikit-learn matplotlib pandas numpy fairlearn

BASE="/content/data"; EXPORT_DIR="/content/exports"; FAIRFACE=f"{BASE}/fairface"
os.makedirs(BASE, exist_ok=True); os.makedirs(EXPORT_DIR, exist_ok=True); os.makedirs(FAIRFACE, exist_ok=True)
print("Data:", BASE); print("Exports:", EXPORT_DIR)

# Helper fetch
def fetch(url, dst):
    dst = pathlib.Path(dst)
    if not dst.exists():
        print("‚Üì", dst.name); urllib.request.urlretrieve(url, str(dst))
    else: print("‚úì", dst.name, "(exists)")

# COMPAS csv
fetch("https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv",
      f"{BASE}/compas.csv")

# FairFace label csvs (small, labels only)
import os
if not any(p.endswith(".csv") for p in os.listdir(FAIRFACE)):
    import subprocess, shlex
    subprocess.run(shlex.split(f"wget -q -P {FAIRFACE} https://github.com/joojs/fairface/raw/master/fairface_label_val.csv"))
    subprocess.run(shlex.split(f"wget -q -P {FAIRFACE} https://github.com/joojs/fairface/raw/master/fairface_label_train.csv"))
print("FairFace labels ready.")
print("CivilComments: upload train.csv to /content/data/civilcomments/train.csv when you reach Section 7.")


Data: /content/data
Exports: /content/exports
‚úì compas.csv (exists)
FairFace labels ready.
CivilComments: upload train.csv to /content/data/civilcomments/train.csv when you reach Section 7.


## 3) Fairness Metrics and Evaluation Utilities


In [2]:

import numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, roc_auc_score
import os

RESULTS_SUMMARY = []
EXPORT_DIR = "/content/exports"

def _tpr_fpr(y_true, y_pred, pos=1):
    y_true = np.asarray(y_true); y_pred = np.asarray(y_pred)
    tp = np.sum((y_true==pos)&(y_pred==pos)); fn = np.sum((y_true==pos)&(y_pred!=pos))
    tn = np.sum((y_true!=pos)&(y_pred!=pos)); fp = np.sum((y_true!=pos)&(y_pred==pos))
    tpr = tp/(tp+fn+1e-12); fpr = fp/(fp+tn+1e-12); return tpr, fpr, 1-fpr

def _group_gaps(y_true, y_pred, g):
    df=pd.DataFrame({"y":y_true,"yp":y_pred,"g":g})
    tprs, tnrs = [], []
    for _,sub in df.groupby("g"):
        tpr,fpr,tnr=_tpr_fpr(sub.y, sub.yp); tprs.append(tpr); tnrs.append(tnr)
    eo_gap=float(np.max(tprs)-np.min(tprs)) if tprs else np.nan
    tnr_gap=float(np.max(tnrs)-np.min(tnrs)) if tnrs else np.nan
    return {"eo_tpr_gap":eo_gap, "eqodds_tpr_gap":eo_gap, "eqodds_tnr_gap":tnr_gap}

def evaluate_binary(y_true, y_score, y_pred, g):
    out={"accuracy": float(accuracy_score(y_true, y_pred))}
    try: out["auc"]=float(roc_auc_score(y_true, y_score))
    except: out["auc"]=float("nan")
    out.update(_group_gaps(y_true, y_pred, g)); return out

def add_summary_row(dataset, method, metrics):
    row={"dataset":dataset,"method":method}; row.update(metrics); RESULTS_SUMMARY.append(row)

def flush_summary_csv(path="results_summary_all.csv"):
    os.makedirs(EXPORT_DIR, exist_ok=True)
    out = pd.DataFrame(RESULTS_SUMMARY)
    out.to_csv(os.path.join(EXPORT_DIR, path), index=False)
    return os.path.join(EXPORT_DIR, path)

def tradeoff_plot(df, x, y, title, out_png):
    plt.figure(); plt.scatter(df[x], df[y])
    for _,r in df.iterrows():
        if isinstance(r.get("label"), str): plt.annotate(r["label"], (r[x], r[y]))
    plt.xlabel(f"{x} (‚Üë)"); plt.ylabel(f"{y} (‚Üì)"); plt.title(title); plt.grid(True); plt.savefig(out_png, bbox_inches="tight"); plt.close()

def save_results_csv(df, path):
    os.makedirs(os.path.dirname(path), exist_ok=True); df.to_csv(path, index=False)


## 4) Folktables ‚Äî ACSIncome


In [3]:
# Folktables ‚Äî ACSIncome (NaN-safe baseline)
import numpy as np, pandas as pd
from folktables import ACSDataSource, ACSIncome
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Load CA 2018 sample
ds = ACSDataSource(survey_year='2018', horizon='1-Year', survey='person')
data = ds.get_data(states=['CA'], download=True)

X_np, y_np, g_np = ACSIncome.df_to_numpy(data)
X = pd.DataFrame(X_np)
y = y_np.astype(int)
g = g_np.astype(int)

Xtr, Xte, ytr, yte, gtr, gte = train_test_split(
    X, y, g, test_size=0.2, random_state=42, stratify=y
)

# Numeric pipeline with imputation + scaling (handles NaNs)
num_cols = list(range(X.shape[1]))
pre = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imp", SimpleImputer(strategy="median")),
            ("sc", StandardScaler(with_mean=False))
        ]), num_cols),
    ],
    remainder="drop"
)

clf = Pipeline([
    ("pre", pre),
    ("clf", LogisticRegression(max_iter=300))
])

clf.fit(Xtr, ytr)
ys = clf.predict_proba(Xte)[:, 1]
yp = (ys >= 0.5).astype(int)

m = evaluate_binary(yte, ys, yp, gte)
add_summary_row("acs_income", "logreg_baseline", m)
print("ACSIncome:", m)


Downloading data for 2018 1-Year person survey for CA...
ACSIncome: {'accuracy': 0.7866250990212864, 'auc': 0.8613494310877198, 'eo_tpr_gap': 0.7448443944506935, 'eqodds_tpr_gap': 0.7448443944506935, 'eqodds_tnr_gap': 0.19128787878781628}


## 5) COMPAS ‚Äî Fairness Evaluation


In [4]:
# COMPAS ‚Äî NaN-safe baseline (imputers for numeric & categorical)
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
import inspect

df = pd.read_csv("/content/data/compas.csv")

# Target & sensitive
target = "two_year_recid"
df = df.dropna(subset=[target])  # ensure y has no NaNs
y = df[target].astype(int).values

sens = (df["sex"].astype(str).str.lower().str.contains("female")).astype(int).values        if "sex" in df.columns else np.zeros(len(df), dtype=int)

feat = [c for c in df.columns if c not in [target, "sex"]]
X = df[feat].copy()

# Split numeric vs categorical
num_cols = [c for c in X.columns if np.issubdtype(X[c].dtype, np.number)]
cat_cols = [c for c in X.columns if c not in num_cols]

# Robust OneHotEncoder args across sklearn versions
if "sparse_output" in inspect.signature(OneHotEncoder).parameters:  # sklearn >= 1.6
    OHE_ARGS = {"handle_unknown":"ignore", "sparse_output": True}
else:
    OHE_ARGS = {"handle_unknown":"ignore", "sparse": True}

pre = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imp", SimpleImputer(strategy="median")),
            ("sc", StandardScaler(with_mean=False))
        ]), num_cols),
        ("cat", Pipeline([
            ("imp", SimpleImputer(strategy="most_frequent")),
            ("oh", OneHotEncoder(**OHE_ARGS))
        ]), cat_cols),
    ],
    remainder="drop"
)

Xtr, Xte, ytr, yte, gtr, gte = train_test_split(
    X, y, sens, test_size=0.2, random_state=42, stratify=y
)

pipe = Pipeline([
    ("pre", pre),
    ("clf", LogisticRegression(max_iter=400))
])

pipe.fit(Xtr, ytr)
ys = pipe.predict_proba(Xte)[:, 1]
yp = (ys >= 0.5).astype(int)

m = evaluate_binary(yte, ys, yp, gte)
add_summary_row("compas", "logreg_baseline", m)
print("COMPAS:", m)


COMPAS: {'accuracy': 0.9826749826749827, 'auc': 0.9983431952662722, 'eo_tpr_gap': 0.0073260073259996394, 'eqodds_tpr_gap': 0.0073260073259996394, 'eqodds_tnr_gap': 0.013660189548366564}


## 6) FairJob (Hugging Face)


In [7]:

# 6) FairJob (Hugging Face) ‚Äî no local CSV needed, NaN-safe baseline
# Assumes Section 2 (Setup) installed 'datasets' and Section 3 defined evaluate_binary/add_summary_row.

from datasets import load_dataset
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
import inspect

# 1) Load FairJob directly from Hugging Face
ds = load_dataset("criteo/FairJob", split="train")   # requires internet on first run
df = pd.DataFrame(ds)
print("FairJob columns (first 40):", list(df.columns)[:40])

# 2) Explicit schema for your split
target   = "click"                 # binary target (0/1)
sens_col = "protected_attribute"   # binary sensitive attribute (0/1)

if target not in df.columns or sens_col not in df.columns:
    raise KeyError(f"Missing expected columns: {target}, {sens_col}")

# 3) Build y (target) and s (sensitive)
y = pd.to_numeric(df[target], errors="coerce").fillna(0).astype(int).values
s = pd.to_numeric(df[sens_col], errors="coerce").fillna(0).astype(int).values

# 4) Features and NaN-safe preprocessing
X = df.drop(columns=[target, sens_col])

num_cols = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
cat_cols = [c for c in X.columns if c not in num_cols]

# Version-safe OneHotEncoder args (sklearn changed 'sparse' -> 'sparse_output')
if "sparse_output" in inspect.signature(OneHotEncoder).parameters:  # sklearn >= 1.6
    OHE_ARGS = {"handle_unknown": "ignore", "sparse_output": True}
else:
    OHE_ARGS = {"handle_unknown": "ignore", "sparse": True}

pre = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imp", SimpleImputer(strategy="median")),
            ("sc", StandardScaler(with_mean=False))
        ]), num_cols),
        ("cat", Pipeline([
            ("imp", SimpleImputer(strategy="most_frequent")),
            ("oh", OneHotEncoder(**OHE_ARGS))
        ]), cat_cols)
    ],
    remainder="drop"
)

# 5) Train/Test split and model
Xtr, Xte, ytr, yte, str_, ste = train_test_split(
    X, y, s, test_size=0.2, random_state=42, stratify=y
)

pipe = Pipeline([
    ("pre", pre),
    ("clf", LogisticRegression(max_iter=400))
])

pipe.fit(Xtr, ytr)
ys = pipe.predict_proba(Xte)[:, 1]
yp = (ys >= 0.5).astype(int)

# 6) Log metrics (uses Section 3 helpers)
m = evaluate_binary(yte, ys, yp, ste)
add_summary_row("fairjob", "logreg_baseline", m)
print("FairJob baseline:", m)


FairJob columns (first 40): ['click', 'protected_attribute', 'senior', 'displayrandom', 'rank', 'user_id', 'impression_id', 'product_id', 'cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9', 'cat10', 'cat11', 'cat12', 'num16', 'num17', 'num18', 'num19', 'num20', 'num21', 'num22', 'num23', 'num24', 'num25', 'num26', 'num27', 'num28', 'num29', 'num30', 'num31', 'num32', 'num33', 'num34']
FairJob baseline: {'accuracy': 0.9929679266575269, 'auc': 0.7812972537119137, 'eo_tpr_gap': 0.0, 'eqodds_tpr_gap': 0.0, 'eqodds_tnr_gap': 9.399467990112154e-05}


## 7) CivilComments ‚Äî Manual Upload + Baseline


In [9]:

# 6) CivilComments ‚Äî Manual Upload + TF-IDF Baseline (NaN-safe + memory-safe)
# Upload train.csv to /content/data/civilcomments/train.csv before running.
import os, pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

p = "/content/data/civilcomments/train.csv"
assert os.path.exists(p), f"Missing file: {p}"

# ‚úÖ Stream in only 50k rows for manageable runtime
df = pd.read_csv(p, nrows=50000, low_memory=False)
print(f"Loaded subset: {len(df):,} rows")

# --- Column detection ---
text_candidates  = ["comment_text","text","comment"]
label_candidates = ["toxic","toxicity","target","label","y"]
text_col  = next((c for c in text_candidates if c in df.columns), None)
label_col = next((c for c in label_candidates if c in df.columns), None)
if text_col is None or label_col is None:
    raise KeyError("Set text and label columns explicitly (text_col / label_col).")

# --- Sensitive attribute (gender/sex/etc) detection ---
sens_cols = [c for c in df.columns if any(k in c.lower() for k in ["gender","sex","male","female","identity"])]
if sens_cols:
    s = (df[sens_cols[0]].astype(str).str.lower().str.contains("female")).astype(int).values
else:
    s = np.zeros(len(df), dtype=int)  # fallback if not available

# --- Target & text ---
y = (pd.to_numeric(df[label_col], errors="coerce").fillna(0) >= 0.5).astype(int).values
X_text = df[text_col].astype(str).fillna("")

# --- Split & Vectorize ---
Xtr_txt, Xte_txt, ytr, yte, gtr, gte = train_test_split(X_text, y, s, test_size=0.2, random_state=42, stratify=y)
tfidf = TfidfVectorizer(max_features=50000, ngram_range=(1,2))
Xtr = tfidf.fit_transform(Xtr_txt)
Xte = tfidf.transform(Xte_txt)

# --- Logistic Regression baseline ---
clf = LogisticRegression(max_iter=1000, n_jobs=-1)
clf.fit(Xtr, ytr)
ys = clf.predict_proba(Xte)[:,1]
yp = (ys >= 0.5).astype(int)

# --- Evaluate + Log ---
m = evaluate_binary(yte, ys, yp, gte)
add_summary_row("civilcomments", "tfidf_logreg_baseline", m)
print("‚úÖ CivilComments baseline metrics:", m)


Loaded subset: 50,000 rows
‚úÖ CivilComments baseline metrics: {'accuracy': 0.9392, 'auc': 0.8929514752686625, 'eo_tpr_gap': 0.0, 'eqodds_tpr_gap': 0.0, 'eqodds_tnr_gap': 0.0}


## 8) CelebA / FairFace ‚Äî Vision Fairness Benchmark


In [10]:

import pandas as pd, numpy as np, os
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

FAIRFACE = Path("/content/data/fairface")
label = FAIRFACE/"fairface_label_val.csv"
if not label.exists():
    print("FairFace labels missing ‚Äî re-run Setup (Section 2).")
else:
    df = pd.read_csv(label)
    sens_col = "gender" if "gender" in df.columns else None
    if sens_col is None:
        print("Unexpected FairFace columns:", df.columns.tolist())
    else:
        y = (df["race"].astype(str).str.lower().str.contains("white")).astype(int) if "race" in df.columns else (df[sens_col].astype(str).str.lower().str.contains("female")).astype(int)
        s = (df[sens_col].astype(str).str.lower().str.contains("female")).astype(int)
        proxy = pd.get_dummies(df.select_dtypes(include="object").fillna("NA")).astype(int)
        Xtr, Xte, ytr, yte, str_, ste = train_test_split(proxy, y, s, test_size=0.2, random_state=42, stratify=y)
        clf = LogisticRegression(max_iter=500)
        clf.fit(Xtr, ytr)
        ys = clf.predict_proba(Xte)[:,1]; yp = (ys>=0.5).astype(int)
        m = evaluate_binary(yte, ys, yp, ste)
        add_summary_row("fairface_proxy", "logreg_baseline", m)
        print("FairFace proxy:", m)


FairFace labels missing ‚Äî re-run Setup (Section 2).


## 9) Aggregated Results & Final Export


In [11]:

import os, pandas as pd
summary_path = flush_summary_csv("results_summary_all.csv")
print("Master summary:", summary_path)

df = pd.read_csv(summary_path)
for col in ["accuracy","auc","eo_tpr_gap","eqodds_tpr_gap","eqodds_tnr_gap"]:
    if col in df.columns: df[col] = pd.to_numeric(df[col], errors="coerce")

def pick_best(group):
    g = group.copy()
    g["eo_tpr_gap"] = g["eo_tpr_gap"].fillna(1e9)
    return g.sort_values(by=["accuracy","eo_tpr_gap"], ascending=[False, True]).iloc[0]

best = df.groupby("dataset", as_index=False).apply(pick_best).reset_index(drop=True)
best.to_csv(os.path.join(EXPORT_DIR, "summary_best.csv"), index=False)

# Overleaf & Markdown
tex = ["% Auto-generated (AIM460)\n","\\begin{table}[h]\n\\centering\n","\\begin{tabular}{l l r r r}\n","\\hline\n","Dataset & Method & Accuracy & AUC & EO Gap \\ \n","\\hline\n"]
for _,r in best.iterrows():
    acc = f"{r['accuracy']:.3f}" if pd.notna(r['accuracy']) else "NA"
    auc = f"{r['auc']:.3f}" if pd.notna(r['auc']) else "NA"
    eo  = f"{r['eo_tpr_gap']:.3f}" if pd.notna(r['eo_tpr_gap']) else "NA"
    tex.append(f"{r['dataset']} & {r['method']} & {acc} & {auc} & {eo} \\ \n")
tex += ["\\hline\n","\\end{tabular}\n","\\caption{Best-performing method per dataset.}\n","\\end{table}\n"]
open(os.path.join(EXPORT_DIR, "overleaf_snippet.tex"),"w").writelines(tex)

md = ["| Dataset | Method | Accuracy | AUC | EO Gap |\n","|---|---|---:|---:|---:|\n"]
for _,r in best.iterrows():
    acc = f"{r['accuracy']:.3f}" if pd.notna(r['accuracy']) else "NA"
    auc = f"{r['auc']:.3f}" if pd.notna(r['auc']) else "NA"
    eo  = f"{r['eo_tpr_gap']:.3f}" if pd.notna(r['eo_tpr_gap']) else "NA"
    md.append(f"| {r['dataset']} | {r['method']} | {acc} | {auc} | {eo} |\n")
open(os.path.join(EXPORT_DIR, "overleaf_md.md"),"w").writelines(md)

print("Wrote:", os.path.join(EXPORT_DIR,"summary_best.csv"))
print("Wrote:", os.path.join(EXPORT_DIR,"overleaf_snippet.tex"))
print("Wrote:", os.path.join(EXPORT_DIR,"overleaf_md.md"))


Master summary: /content/exports/results_summary_all.csv
Wrote: /content/exports/summary_best.csv
Wrote: /content/exports/overleaf_snippet.tex
Wrote: /content/exports/overleaf_md.md


In [12]:
# Verify the consolidated summary and preview the top rows
import pandas as pd, os

EXPORT_DIR = "/content/exports"
summary_path = os.path.join(EXPORT_DIR, "results_summary_all.csv")
assert os.path.exists(summary_path), f"Missing {summary_path} ‚Äî be sure Sections 2‚Äì7 ran."

df = pd.read_csv(summary_path)
# Make sure numeric cols are numeric
for col in ["accuracy","auc","eo_tpr_gap","eqodds_tpr_gap","eqodds_tnr_gap"]:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

print(f"‚úÖ Found summary with {len(df)} rows at: {summary_path}")
display(df.head(10))
print("\nDatasets present:", sorted(df['dataset'].unique()))


‚úÖ Found summary with 4 rows at: /content/exports/results_summary_all.csv


Unnamed: 0,dataset,method,accuracy,auc,eo_tpr_gap,eqodds_tpr_gap,eqodds_tnr_gap
0,acs_income,logreg_baseline,0.786625,0.861349,0.744844,0.744844,0.191288
1,compas,logreg_baseline,0.982675,0.998343,0.007326,0.007326,0.01366
2,fairjob,logreg_baseline,0.992968,0.781297,0.0,0.0,9.4e-05
3,civilcomments,tfidf_logreg_baseline,0.9392,0.892951,0.0,0.0,0.0



Datasets present: ['acs_income', 'civilcomments', 'compas', 'fairjob']


In [13]:
# Build best-per-dataset table and write Overleaf/Markdown snippets
import pandas as pd, os

EXPORT_DIR = "/content/exports"
summary_path = os.path.join(EXPORT_DIR, "results_summary_all.csv")
df = pd.read_csv(summary_path)

for col in ["accuracy","auc","eo_tpr_gap","eqodds_tpr_gap","eqodds_tnr_gap"]:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

def pick_best(group):
    g = group.copy()
    # prefer high accuracy, then lower EO gap
    g["eo_tpr_gap"] = g["eo_tpr_gap"].fillna(1e9)
    return g.sort_values(by=["accuracy","eo_tpr_gap"], ascending=[False, True]).iloc[0]

best = df.groupby("dataset", as_index=False).apply(pick_best).reset_index(drop=True)
best_path = os.path.join(EXPORT_DIR, "summary_best.csv")
best.to_csv(best_path, index=False)
print("‚úÖ Best-per-dataset saved to:", best_path)
display(best)

# Overleaf TeX
tex = [
    "% Auto-generated table (AIM460)\n",
    "\\begin{table}[h]\n\\centering\n",
    "\\begin{tabular}{l l r r r}\n\\hline\n",
    "Dataset & Method & Accuracy & AUC & EO Gap \\\\ \n\\hline\n"
]
for _, r in best.iterrows():
    acc = ("%.3f" % r["accuracy"]) if pd.notna(r["accuracy"]) else "NA"
    auc = ("%.3f" % r["auc"]) if ("auc" in r and pd.notna(r["auc"])) else "NA"
    eo  = ("%.3f" % r["eo_tpr_gap"]) if ("eo_tpr_gap" in r and pd.notna(r["eo_tpr_gap"])) else "NA"
    tex.append(f"{r['dataset']} & {r['method']} & {acc} & {auc} & {eo} \\\\ \n")
tex += ["\\hline\n","\\end{tabular}\n","\\caption{Best-performing method per dataset.}\n","\\end{table}\n"]
with open(os.path.join(EXPORT_DIR, "overleaf_snippet.tex"), "w") as f:
    f.writelines(tex)

# Markdown
md = ["| Dataset | Method | Accuracy | AUC | EO Gap |\n","|---|---|---:|---:|---:|\n"]
for _, r in best.iterrows():
    acc = ("%.3f" % r["accuracy"]) if pd.notna(r["accuracy"]) else "NA"
    auc = ("%.3f" % r["auc"]) if ("auc" in r and pd.notna(r["auc"])) else "NA"
    eo  = ("%.3f" % r["eo_tpr_gap"]) if ("eo_tpr_gap" in r and pd.notna(r["eo_tpr_gap"])) else "NA"
    md.append(f"| {r['dataset']} | {r['method']} | {acc} | {auc} | {eo} |\n")
with open(os.path.join(EXPORT_DIR, "overleaf_md.md"), "w") as f:
    f.writelines(md)

print("üìù Wrote:", os.path.join(EXPORT_DIR, "overleaf_snippet.tex"))
print("üìù Wrote:", os.path.join(EXPORT_DIR, "overleaf_md.md"))


‚úÖ Best-per-dataset saved to: /content/exports/summary_best.csv


Unnamed: 0,dataset,method,accuracy,auc,eo_tpr_gap,eqodds_tpr_gap,eqodds_tnr_gap
0,acs_income,logreg_baseline,0.786625,0.861349,0.744844,0.744844,0.191288
1,civilcomments,tfidf_logreg_baseline,0.9392,0.892951,0.0,0.0,0.0
2,compas,logreg_baseline,0.982675,0.998343,0.007326,0.007326,0.01366
3,fairjob,logreg_baseline,0.992968,0.781297,0.0,0.0,9.4e-05


üìù Wrote: /content/exports/overleaf_snippet.tex
üìù Wrote: /content/exports/overleaf_md.md


In [15]:
# Generate Overleaf-ready charts from results_summary_all.csv
import os, pandas as pd, numpy as np
import matplotlib.pyplot as plt

EXPORT_DIR = "/content/exports"
summary_path = os.path.join(EXPORT_DIR, "results_summary_all.csv")
assert os.path.exists(summary_path), f"Missing {summary_path}"

df = pd.read_csv(summary_path)
for col in ["accuracy","auc","eo_tpr_gap","eqodds_tpr_gap","eqodds_tnr_gap"]:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

os.makedirs(EXPORT_DIR, exist_ok=True)

# 1) Per-dataset tradeoff plots: Accuracy (‚Üë) vs EO gap (‚Üì)
def tradeoff_plot(data, dataset):
    d = data[data["dataset"] == dataset].dropna(subset=["accuracy","eo_tpr_gap"])
    if d.empty: return None
    plt.figure(figsize=(5.2, 4.0))
    plt.scatter(d["accuracy"], d["eo_tpr_gap"], s=48)
    for _, r in d.iterrows():
        if isinstance(r.get("method"), str):
            plt.annotate(r["method"], (r["accuracy"], r["eo_tpr_gap"]), fontsize=8)
    plt.xlabel("Accuracy (‚Üë)")
    plt.ylabel("EO TPR Gap (‚Üì)")
    plt.title(f"{dataset}: Utility vs Fairness")
    plt.grid(True, alpha=0.3)
    out = os.path.join(EXPORT_DIR, f"{dataset}_tradeoff.png")
    plt.tight_layout(); plt.savefig(out, dpi=200); plt.close()
    return out

# 2) Best-per-dataset bar charts of metrics (Accuracy, AUC, EO gap)
def best_bar(best_row):
    dataset = best_row["dataset"]
    acc = best_row.get("accuracy", np.nan)
    auc = best_row.get("auc", np.nan)
    eo  = best_row.get("eo_tpr_gap", np.nan)

    labels = ["Accuracy", "AUC", "EO Gap"]
    vals   = [acc, auc, eo]

    plt.figure(figsize=(5.2, 3.4))
    x = np.arange(len(labels))
    plt.bar(x, vals)
    plt.xticks(x, labels)
    plt.title(f"{dataset}: Best Method Metrics")
    # Annotate bars
    for i, v in enumerate(vals):
        txt = "NA" if pd.isna(v) else f"{v:.3f}"
        plt.text(i, (0 if pd.isna(v) else v) + (0.005 if not pd.isna(v) else 0.02), txt, ha="center", va="bottom", fontsize=9)
    plt.tight_layout()
    out = os.path.join(EXPORT_DIR, f"{dataset}_best_metrics.png")
    plt.savefig(out, dpi=200); plt.close()
    return out

# Build best table (reuse if you already created summary_best.csv)
def pick_best(group):
    g = group.copy()
    g["eo_tpr_gap"] = g["eo_tpr_gap"].fillna(1e9)
    return g.sort_values(by=["accuracy","eo_tpr_gap"], ascending=[False, True]).iloc[0]

best = df.groupby("dataset", as_index=False).apply(pick_best).reset_index(drop=True)
best_path = os.path.join(EXPORT_DIR, "summary_best.csv")
best.to_csv(best_path, index=False)

# Generate all images
made = []
for ds in sorted(df["dataset"].dropna().unique()):
    p1 = tradeoff_plot(df, ds)
    if p1: made.append(p1)

for _, row in best.iterrows():
    p2 = best_bar(row)
    made.append(p2)

print("‚úÖ Charts written:")
for p in made:
    print("  -", p)


‚úÖ Charts written:
  - /content/exports/acs_income_tradeoff.png
  - /content/exports/civilcomments_tradeoff.png
  - /content/exports/compas_tradeoff.png
  - /content/exports/fairjob_tradeoff.png
  - /content/exports/acs_income_best_metrics.png
  - /content/exports/civilcomments_best_metrics.png
  - /content/exports/compas_best_metrics.png
  - /content/exports/fairjob_best_metrics.png


In [16]:
# Pack notebook + exports for GitHub / Overleaf
import os, shutil, zipfile, time, glob, json, pathlib

NB_BASENAME = "AIM460_Final"  # change if you want
EXPORT_DIR = "/content/exports"
PKG_DIR = f"/content/{NB_BASENAME}_release"
ZIP_PATH = f"/content/{NB_BASENAME}_release_{int(time.time())}.zip"

os.makedirs(PKG_DIR, exist_ok=True)
os.makedirs(EXPORT_DIR, exist_ok=True)

# 1) Copy notebook (this notebook) into package root
# In Colab, this grabs the current notebook path via JS; fallback: manually upload later
try:
    from google.colab import output
    nb_name = None
    def _capture_name(n):
        global nb_name; nb_name = n
    output.register_callback('nb_name_cb', _capture_name)
    display_javascript = """
    async function f(){
      const el = document.querySelector('colab-toolbar-menu-button[menu-item="file-name"]');
      const name = el?.shadowRoot?.querySelector('#file-name')?.value || null;
      google.colab.kernel.invokeFunction('nb_name_cb', [name], {});
    }
    f();
    """
    from IPython.display import Javascript
    display(Javascript(display_javascript))
except Exception:
    nb_name = None

# Fallback: look for a likely filename in /content
if not nb_name:
    candidates = [p for p in os.listdir("/content") if p.endswith(".ipynb")]
    candidates.sort(key=lambda p: os.path.getmtime(f"/content/{p}"), reverse=True)
    nb_name = candidates[0] if candidates else None

if nb_name and os.path.exists(f"/content/{nb_name}"):
    shutil.copy(f"/content/{nb_name}", f"{PKG_DIR}/{NB_BASENAME}.ipynb")
else:
    print("‚ö†Ô∏è Could not auto-copy notebook. Upload it manually into the repo later.")

# 2) Copy exports (CSVs + PNGs + snippets)
for p in glob.glob(f"{EXPORT_DIR}/*"):
    dest = f"{PKG_DIR}/exports/{os.path.basename(p)}"
    os.makedirs(os.path.dirname(dest), exist_ok=True)
    shutil.copy(p, dest)

# 3) Create README and .gitignore
readme = f"""# {NB_BASENAME}

AIM460 Fairness Benchmark ‚Äî final submission bundle.

## Contents
- `./{NB_BASENAME}.ipynb` ‚Äî final runnable notebook
- `./exports/` ‚Äî aggregated results & figures:
  - `results_summary_all.csv`, `summary_best.csv`
  - `*_tradeoff.png`, `*_best_metrics.png`
  - `overleaf_snippet.tex`, `overleaf_md.md`

## Re-run (Colab)
1. Run Section 2 (Setup)
2. Run Section 3 (Utilities)
3. Run: ACSIncome, COMPAS, CivilComments, FairJob (as available)
4. Run final Aggregation & Charts cells

## Data
- Do **not** commit raw datasets. Use the notebook to fetch/download as needed.
"""

with open(f"{PKG_DIR}/README.md","w") as f: f.write(readme)

gitignore = """# Ignore data and caches
data/
datasets/
*.csv
*.zip
*.gz
__pycache__/
.ipynb_checkpoints/
"""
with open(f"{PKG_DIR}/.gitignore","w") as f: f.write(gitignore)

# 4) Zip it
with zipfile.ZipFile(ZIP_PATH, "w", compression=zipfile.ZIP_DEFLATED) as z:
    for root, _, files in os.walk(PKG_DIR):
        for name in files:
            p = os.path.join(root, name)
            z.write(p, os.path.relpath(p, PKG_DIR))

print("‚úÖ Release folder:", PKG_DIR)
print("‚úÖ Zip ready:", ZIP_PATH)


<IPython.core.display.Javascript object>

‚ö†Ô∏è Could not auto-copy notebook. Upload it manually into the repo later.
‚úÖ Release folder: /content/AIM460_Final_release
‚úÖ Zip ready: /content/AIM460_Final_release_1762747336.zip


## 10) References
- Krco et al. (2023) FRAME
- Seth et al. (2023) DeAR
- Li et al. (2024) Explainable Debiasing
- Dom√≠nguez-Catena et al. (2025) DSAP
- Islam et al. (2024) SLSD
- Datasets: ProPublica COMPAS, FairFace, CivilComments, Folktables ACSIncome
