In [1]:
import os
import pandas as pd
import numpy as np
from math import erf

# -------- INPUT FILES --------
# Maps each image model's display name to the CSV path handed to pd.read_csv.
# The dict's insertion order is the order in which models appear in the
# result tables.
INPUTS = {
    "DALL·E 3": "gpt.csv",
    "Gemini Imagen 4.0": "gemini.csv",
    "FLUX.1-dev": "fal.csv",
    "Stable Diffusion XL Turbo": "sdxl.csv",
    "Grok-2 Image": "grok.csv",
}

# Occupations reported on: one result row is emitted per model for each entry.
# The labels must match the values returned by norm_occ.
OCCUPATIONS = ["CEO", "Nurse", "SWE", "Teacher", "Athlete"]
# Race labels in the order used for the "A/B/H/W" count strings.
# "White" must stay at index 3: the race analysis sums the first three entries
# as the non-White count and reads index 3 as the White count.
RACES = ["Asian", "Black", "Hispanic", "White"]

# -------- NORMALIZERS --------
def norm_condition(x):
    """Map a raw experimental-condition value to a canonical label.

    Args:
        x: Raw cell value (string, number, or missing).

    Returns:
        ``"Baseline"`` or ``"Controlled"``, or ``None`` when the value is
        missing or not recognised. Matching is case-insensitive and ignores
        surrounding whitespace. Baseline is tested first, so a value that
        mentions both (e.g. "baseline prompt") is treated as Baseline.
    """
    if pd.isna(x):
        return None
    s = str(x).strip().lower()
    # pandas stores an integer-coded column as float as soon as it contains a
    # missing value, so the codes 0/1 arrive here as "0.0"/"1.0". Fold them
    # back to "0"/"1" so those rows are not silently discarded.
    if s.endswith(".0"):
        s = s[:-2]
    if "base" in s or s in {"b", "0", "baseline", "neutral", "default"}:
        return "Baseline"
    if (
        "control" in s
        or "prompt" in s
        or s in {"c", "1", "controlled", "intervention", "balanced"}
    ):
        return "Controlled"
    return None

def norm_race(x):
    """Map a raw race/ethnicity value to one of the canonical race labels.

    Args:
        x: Raw cell value (string or missing).

    Returns:
        ``"Asian"``, ``"Black"``, ``"Hispanic"`` or ``"White"``, or ``None``
        when the value is missing or not recognised. Matching is a
        case-insensitive substring search.
    """
    if pd.isna(x):
        return None
    s = str(x).strip().lower()
    # "caucasian" contains the substring "asian", so it has to be tested
    # before the Asian rule; otherwise Caucasian rows are counted as Asian.
    if "caucasian" in s:
        return "White"
    if "asian" in s:
        return "Asian"
    if "black" in s or "african" in s:
        return "Black"
    # "latin" covers latino / latina / latinx / latin american.
    if "hispanic" in s or "latin" in s:
        return "Hispanic"
    if "white" in s:
        return "White"
    return None

def norm_gender(x):
    """Map a raw gender value to ``"F"`` or ``"M"``.

    Args:
        x: Raw cell value (string or missing).

    Returns:
        ``"F"`` or ``"M"``, or ``None`` when the value is missing or not
        recognised. Matching is by case-insensitive prefix.
    """
    if pd.isna(x):
        return None
    s = str(x).strip().lower()
    # "man" already maps to M through the "m" prefix; accept the matching
    # female terms too so that "woman"/"women" rows are not dropped, which
    # would skew the F/M counts.
    if s.startswith(("f", "wom", "girl")):
        return "F"
    if s.startswith(("m", "boy")):
        return "M"
    return None

def norm_occ(x):
    """Map a raw occupation/category value to a canonical occupation label.

    Args:
        x: Raw cell value (string or missing).

    Returns:
        ``"CEO"``, ``"Nurse"``, ``"SWE"``, ``"Teacher"`` or ``"Athlete"``, or
        ``None`` when the value is missing or not recognised. Matching is a
        case-insensitive substring search; rules are tried in the order
        listed, so the first matching occupation wins.
    """
    if pd.isna(x):
        return None
    s = str(x).strip().lower()
    if "ceo" in s:
        return "CEO"
    if "nurse" in s:
        return "Nurse"
    if "swe" in s or "software" in s or "engineer" in s:
        return "SWE"
    # "prof" is meant to catch "prof"/"professor", but it is also a prefix of
    # "professional". Ignore that word so "professional athlete" is not
    # classified as Teacher before the athlete rule gets a chance.
    has_prof = "prof" in s.replace("profession", "")
    if "teacher" in s or has_prof or "instructor" in s:
        return "Teacher"
    if "athlete" in s or "player" in s or "runner" in s:
        return "Athlete"
    return None

def chi2_p_2x2(a, b, c, d):
    """Return the p-value of Pearson's chi-square test on a 2x2 table.

    The table is laid out as::

        a  b
        c  d

    No continuity correction is applied. The statistic is compared against
    a chi-square distribution with one degree of freedom.

    Args:
        a, b: Counts in the first row.
        c, d: Counts in the second row.

    Returns:
        The p-value as a float, or ``np.nan`` when the table is empty.
        Cells whose expected count is zero (a row or column total of zero)
        are left out of the statistic, so such a table yields ``1.0``.
    """
    # Local import: the file's top-level import only brings in `erf`.
    from math import erfc, sqrt

    total = a + b + c + d
    if total == 0:
        return np.nan
    row1, row2 = a + b, c + d
    col1, col2 = a + c, b + d
    expected = (
        row1 * col1 / total,
        row1 * col2 / total,
        row2 * col1 / total,
        row2 * col2 / total,
    )
    observed = (a, b, c, d)
    chi2 = sum(
        (obs - exp) ** 2 / exp
        for obs, exp in zip(observed, expected)
        if exp > 0
    )
    # For one degree of freedom the survival function is erfc(sqrt(chi2/2)).
    # Using erfc directly instead of 1 - erf(...) avoids cancellation, which
    # would report very small p-values as exactly 0.0.
    return erfc(sqrt(chi2 / 2))

# -------- RESULTS --------
# One row per (model, occupation) pair is appended to each list, in
# INPUTS x OCCUPATIONS order.
race_rows = []
gender_rows = []

for model, path in INPUTS.items():
    df = pd.read_csv(path)

    # Derive canonical label columns from the raw CSV columns. Values the
    # normalizers do not recognise become None and are dropped below.
    for target, source, normalizer in (
        ("Occupation", "category", norm_occ),
        ("Race", "race", norm_race),
        ("Gender", "gender", norm_gender),
        ("Condition", "setting", norm_condition),
    ):
        df[target] = df[source].apply(normalizer)

    # --- Race/Ethnicity ---
    # Compare the share of non-White images between the two conditions.
    keep_r = df.dropna(subset=["Occupation", "Condition", "Race"])
    for occ in OCCUPATIONS:
        occ_rows = keep_r[keep_r["Occupation"] == occ]
        b = occ_rows.loc[occ_rows["Condition"] == "Baseline", "Race"].value_counts()
        c = occ_rows.loc[occ_rows["Condition"] == "Controlled", "Race"].value_counts()

        # Per-race counts in RACES order; index 3 is White.
        b_counts = [int(b.get(race, 0)) for race in RACES]
        c_counts = [int(c.get(race, 0)) for race in RACES]
        b_tot = sum(b_counts)
        c_tot = sum(c_counts)
        b_nw = sum(b_counts[:3])
        c_nw = sum(c_counts[:3])

        b_pct = 100 * b_nw / b_tot if b_tot > 0 else np.nan
        c_pct = 100 * c_nw / c_tot if c_tot > 0 else np.nan

        # Delta and p-value only exist when both conditions have data.
        if b_tot > 0 and c_tot > 0:
            delta = c_pct - b_pct
            p_val = chi2_p_2x2(b_nw, b_counts[3], c_nw, c_counts[3])
        else:
            delta = np.nan
            p_val = np.nan

        race_rows.append([
            model,
            occ,
            "/".join(map(str, b_counts)),
            "/".join(map(str, c_counts)),
            b_pct,
            c_pct,
            delta,
            p_val,
        ])

    # --- Gender ---
    # Compare the share of female images between the two conditions.
    keep_g = df.dropna(subset=["Occupation", "Condition", "Gender"])
    keep_g = keep_g[keep_g["Gender"].isin(["F", "M"])]
    for occ in OCCUPATIONS:
        occ_rows = keep_g[keep_g["Occupation"] == occ]
        b = occ_rows.loc[occ_rows["Condition"] == "Baseline", "Gender"].value_counts()
        c = occ_rows.loc[occ_rows["Condition"] == "Controlled", "Gender"].value_counts()

        bF = int(b.get("F", 0))
        bM = int(b.get("M", 0))
        cF = int(c.get("F", 0))
        cM = int(c.get("M", 0))
        b_tot = bF + bM
        c_tot = cF + cM

        b_pctF = 100 * bF / b_tot if b_tot > 0 else np.nan
        c_pctF = 100 * cF / c_tot if c_tot > 0 else np.nan

        if b_tot > 0 and c_tot > 0:
            delta = c_pctF - b_pctF
            p_val = chi2_p_2x2(bF, bM, cF, cM)
        else:
            delta = np.nan
            p_val = np.nan

        # Conditions without any usable rows are shown as "--".
        b_label = f"{bF}/{bM} ({b_pctF:.0f}%)" if b_tot > 0 else "--"
        c_label = f"{cF}/{cM} ({c_pctF:.0f}%)" if c_tot > 0 else "--"

        gender_rows.append([model, occ, b_label, c_label, delta, p_val])

# -------- SAVE --------
# Column headers follow the element order of the rows built above.
race_columns = [
    "Model",
    "Occupation",
    "Baseline (A/B/H/W)",
    "Controlled (A/B/H/W)",
    "Baseline %NonWhite",
    "Controlled %NonWhite",
    "ΔNonWhite",
    "p-value",
]
gender_columns = [
    "Model",
    "Occupation",
    "Baseline (F/M, %F)",
    "Controlled (F/M, %F)",
    "ΔF",
    "p-value",
]

race_df = pd.DataFrame(race_rows, columns=race_columns)
gender_df = pd.DataFrame(gender_rows, columns=gender_columns)

# Write both workbooks before reporting, so nothing is printed if a write fails.
for frame, filename in (
    (race_df, "race_ethnicity_results.xlsx"),
    (gender_df, "gender_results.xlsx"),
):
    frame.to_excel(filename, index=False)

print("Saved -> race_ethnicity_results.xlsx")
print("Saved -> gender_results.xlsx")


Saved -> race_ethnicity_results.xlsx
Saved -> gender_results.xlsx
