In [None]:
# Mount Google Drive (Colab only) at /content/drive; the dataset and output
# paths used below live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

import os, glob, json
import numpy as np
import pandas as pd
from dataclasses import dataclass

# Use your actual golden-set folder path consistently everywhere.
GOLDEN_DIR = "/content/drive/MyDrive/TRAITHON_EXP/datasets/golden_set_v1_TL"
OUTPUT_DIR = "/content/drive/MyDrive/TRAITHON_EXP/results/section_bias_experiment"
# Create the results folder up front so the CSV exports at the end cannot fail on a missing directory.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ----------------------------
# (A) 1) Load the golden set
# ----------------------------
def load_golden_jsons(golden_dir: str, max_n: int = 5000):
    """Load the section label of every golden-set JSON file in a folder.

    Files matching ``*.json`` directly inside ``golden_dir`` are processed in
    sorted path order and at most ``max_n`` of them are read. The section is
    taken from ``sourceDataInfo.newsCategory``, falling back to
    ``sourceDataInfo.category``.

    Args:
        golden_dir: Folder containing the golden-set ``.json`` files.
        max_n: Maximum number of files to read. When fewer files exist a
            warning is printed and all of them are used.

    Returns:
        A ``pandas.DataFrame`` with one row per file and the columns
        ``path`` and ``section``.

    Raises:
        FileNotFoundError: If the folder contains no ``.json`` file.
        KeyError: If a file carries neither ``newsCategory`` nor ``category``
            (including when ``sourceDataInfo`` is missing, ``null`` or not a
            JSON object).
    """
    paths = sorted(glob.glob(os.path.join(golden_dir, "*.json")))
    if not paths:
        raise FileNotFoundError(f"No .json files found in: {golden_dir}")
    if len(paths) < max_n:
        print(f"[WARN] Found only {len(paths)} json files (<{max_n}). Using all found files.")
        max_n = len(paths)

    rows = []
    for p in paths[:max_n]:
        with open(p, "r", encoding="utf-8") as f:
            obj = json.load(f)

        # A file may hold `"sourceDataInfo": null` (or a non-object root);
        # treat that as "no metadata" so the KeyError below is raised instead
        # of an unrelated AttributeError on `.get`.
        sdi = obj.get("sourceDataInfo") if isinstance(obj, dict) else None
        if not isinstance(sdi, dict):
            sdi = {}

        section = sdi.get("newsCategory")
        if section is None:
            section = sdi.get("category")
        if section is None:
            raise KeyError(f"Cannot find newsCategory in file: {p}")

        rows.append({"path": p, "section": section})

    # Explicit columns keep the schema stable even when no row was collected.
    return pd.DataFrame(rows, columns=["path", "section"])

# ----------------------------
# (B) 2) Exposure sampling (Gaussian) - no normalization
# ----------------------------
@dataclass
class ExposureGaussianSpec:
    """Per-section parameters of the Gaussian exposure model."""

    # section label -> mean of the normal distribution used for that section
    mean_by_section: dict
    # section label -> standard deviation used for that section
    std_by_section: dict
    # lower bound applied to every draw; None disables the clipping
    min_clip: float = 0.0


def sample_exposures_gaussian(section_series: pd.Series, seed: int, spec: ExposureGaussianSpec) -> np.ndarray:
    """Draw one Gaussian exposure value per item, based on the item's section.

    Args:
        section_series: Section label of every item, in item order.
        seed: Seed for ``numpy.random.default_rng``.
        spec: Means, standard deviations and the optional lower clip.

    Returns:
        A float64 array with one exposure per entry of ``section_series``.

    Raises:
        KeyError: If ``spec`` has no mean or no std for a section.
    """
    generator = np.random.default_rng(seed)
    means = spec.mean_by_section
    stds = spec.std_by_section
    floor = spec.min_clip

    drawn = []
    # One scalar draw per item, in order, so the random stream is consumed
    # exactly once per item.
    for label in section_series:
        center = means.get(label, None)
        spread = stds.get(label, None)
        if center is None or spread is None:
            raise KeyError(f"Exposure spec missing for section='{label}'. Add mean/std for this section.")
        value = generator.normal(loc=center, scale=spread)
        drawn.append(value if floor is None else max(floor, value))

    return np.array(drawn, dtype=np.float64)

def sum_exposures_by_section(section_series: pd.Series, exposures: np.ndarray) -> pd.Series:
    """Sum the per-item exposures within each section.

    Args:
        section_series: Section label of every item.
        exposures: Exposure of every item, positionally aligned with
            ``section_series``.

    Returns:
        A Series named ``exposure`` indexed by ``section`` holding the
        exposure total of each section.
    """
    # Pair labels and exposures by position; `.values` drops the Series index.
    paired = pd.DataFrame(
        {
            "section": section_series.values,
            "exposure": exposures,
        }
    )
    per_section = paired.groupby("section")
    return per_section["exposure"].sum()

# ----------------------------
# (C) 3) Attach the per-section FP rate and compute the
#        exposure-weighted FP contribution
# ----------------------------
def exposure_weighted_fp(exposure_sum_by_section: pd.Series, fp_rate_by_section: dict) -> pd.Series:
    """Multiply each section's exposure total by that section's FP rate.

    Args:
        exposure_sum_by_section: Exposure total per section (section labels
            as the index).
        fp_rate_by_section: Mapping of section label to FP rate.

    Returns:
        A Series of ``exposure total * FP rate`` per section, sorted by
        section label.

    Raises:
        KeyError: If a section has no entry in ``fp_rate_by_section``.
    """
    def _contribution(section, total):
        # Fail loudly on an unknown section instead of silently dropping it.
        if section in fp_rate_by_section:
            return total * fp_rate_by_section[section]
        raise KeyError(f"FP rate missing for section='{section}'. Provide fp_rate_by_section.")

    weighted = {
        section: _contribution(section, total)
        for section, total in exposure_sum_by_section.items()
    }
    return pd.Series(weighted).sort_index()

# ----------------------------
# (D) 4) Risk decision rule (thresholds / gaps)
# ----------------------------
@dataclass
class RiskRule:
    """Thresholds used to decide whether the gap between sections is a risk."""

    # the run fails when (max - min) is strictly greater than this value
    absolute_gap_threshold: float
    # the run fails when max / (min + eps) is strictly greater than this value
    ratio_gap_threshold: float
    # added to the denominator of the ratio so a zero minimum does not divide by zero
    eps: float = 1e-9


def judge_risk(weighted_fp_by_section: pd.Series, rule: RiskRule) -> dict:
    """Compare the largest and smallest weighted FP value against a rule.

    Args:
        weighted_fp_by_section: Exposure-weighted FP contribution per section.
        rule: Thresholds for the absolute and the ratio gap.

    Returns:
        A dict with ``abs_gap`` (max - min), ``ratio_gap``
        (max / (min + eps)), ``fail`` (True when either gap exceeds its
        threshold) and the index labels ``max_section`` / ``min_section``.
    """
    highest = float(weighted_fp_by_section.max())
    lowest = float(weighted_fp_by_section.min())

    verdict = {
        "abs_gap": highest - lowest,
        "ratio_gap": highest / (lowest + rule.eps),
    }

    # Either gap exceeding its threshold is enough to flag the run.
    gap_exceeded = verdict["abs_gap"] > rule.absolute_gap_threshold
    ratio_exceeded = verdict["ratio_gap"] > rule.ratio_gap_threshold
    verdict["fail"] = gap_exceeded or ratio_exceeded

    verdict["max_section"] = weighted_fp_by_section.idxmax()
    verdict["min_section"] = weighted_fp_by_section.idxmin()
    return verdict

# ----------------------------
# (E) 5) Multiple seeds + stability (k=2) check
# ----------------------------
def run_experiment(
    golden_dir: str,
    fp_rate_by_section: dict,
    exposure_spec: ExposureGaussianSpec,
    risk_rule: RiskRule,
    seeds: list,
    max_n: int = 5000,
    k: int = 2
):
    """Run the exposure-weighted FP gap experiment once per seed.

    For every seed the exposures are re-sampled, summed per section, weighted
    by the section FP rate and judged against ``risk_rule``.

    Args:
        golden_dir: Folder holding the golden-set JSON files.
        fp_rate_by_section: Mapping of section label to FP rate.
        exposure_spec: Gaussian exposure parameters per section.
        risk_rule: Thresholds used for the per-seed verdict.
        seeds: Seeds to run; must not be empty.
        max_n: Maximum number of golden-set files to load.
        k: Minimum number of failing seeds for ``stable_fail`` to be True.

    Returns:
        A tuple ``(df, res_df, summary)``: the loaded golden-set metadata,
        the per-seed results sorted by seed, and a dict of aggregate
        statistics made of native Python ``int`` / ``float`` / ``bool`` values.

    Raises:
        ValueError: If ``seeds`` is empty.
    """
    seeds = list(seeds)
    if not seeds:
        # Without this guard an empty seed list only fails later, after all
        # files were loaded, with an opaque KeyError on the 'seed' column.
        raise ValueError("seeds must contain at least one seed")

    df = load_golden_jsons(golden_dir, max_n=max_n)
    section_series = df["section"]

    verdict_columns = ("abs_gap", "ratio_gap", "fail", "max_section", "min_section")
    results = []
    for seed in seeds:
        exposures = sample_exposures_gaussian(section_series, seed=seed, spec=exposure_spec)
        exp_sum = sum_exposures_by_section(section_series, exposures)
        wfp = exposure_weighted_fp(exp_sum, fp_rate_by_section)
        verdict = judge_risk(wfp, risk_rule)

        row = {"seed": seed}
        row.update((col, verdict[col]) for col in verdict_columns)
        results.append(row)

    res_df = pd.DataFrame(results).sort_values("seed").reset_index(drop=True)

    n_fail = int(res_df["fail"].sum())
    # Cast to a plain bool: comparing the pandas sum directly yields a NumPy
    # bool (printed as `np.True_`) that e.g. json.dumps cannot serialize.
    stable_fail = bool(n_fail >= k)

    summary = {
        "n_runs": len(seeds),
        "n_fail": n_fail,
        "k": k,
        "stable_fail": stable_fail,
        "abs_gap_mean": float(res_df["abs_gap"].mean()),
        "abs_gap_p95": float(np.percentile(res_df["abs_gap"], 95)),
        "ratio_gap_mean": float(res_df["ratio_gap"].mean()),
        "ratio_gap_p95": float(np.percentile(res_df["ratio_gap"], 95)),
    }

    return df, res_df, summary

# ----------------------------
# (F) Execution
# ----------------------------

# 1) FP rate per section (image-based values)
fp_rate_by_section = {
    "정치":  0.02542,
    "경제": 0.01036,
    "사회": 0.0147,
    "생활&문화": 0.03436,
    "IT&과학": 0.00702,
    "세계": 0.00845,
    "연예": 0.01316
}

# 2) Read the section distribution from the golden set
df_meta = load_golden_jsons(GOLDEN_DIR, max_n=5000)
section_series = df_meta["section"]

print("Section counts:")
print(section_series.value_counts())
print(section_series.value_counts().sum())

# 3) mean_by_section: 'relative weight based on article count' x BASE_MEAN_EXPOSURE (well justified)
section_counts = section_series.value_counts().to_dict()
# Average number of articles per section; a section with exactly this many
# articles gets a mean exposure of BASE_MEAN_EXPOSURE.
mean_count = sum(section_counts.values()) / len(section_counts)

BASE_MEAN_EXPOSURE = 1000.0  # scale (may be changed if desired)
mean_by_section = {sec: (cnt / mean_count) * BASE_MEAN_EXPOSURE for sec, cnt in section_counts.items()}

# 4) std_by_section: shared by all sections (minimizes injected bias)
STD_RATIO = 0.30
STD_GLOBAL = BASE_MEAN_EXPOSURE * STD_RATIO
std_by_section = {sec: STD_GLOBAL for sec in mean_by_section.keys()}

exposure_spec = ExposureGaussianSpec(
    mean_by_section=mean_by_section,
    std_by_section=std_by_section,
    min_clip=0.0
)

print("\nExposure mean_by_section (scaled by counts):")
print(mean_by_section)
print("Exposure std global:", STD_GLOBAL)

# 5) Risk thresholds
baseline_seeds = list(range(1, 51))  # 50 runs recommended to stabilize P95 (30-200 if desired) -> seeds used to set the baseline
dummy_rule = RiskRule(
    absolute_gap_threshold=float("inf"),   # the absolute gap is ignored while computing the baseline
    ratio_gap_threshold=float("inf")       # the ratio is ignored for now as well (only the distribution is collected)
)

# Baseline run: with infinite thresholds no seed can fail, so only the
# per-seed gap distribution (baseline_df) is of interest here.
_, baseline_df, _ = run_experiment(
    golden_dir=GOLDEN_DIR,
    fp_rate_by_section=fp_rate_by_section,
    exposure_spec=exposure_spec,
    risk_rule=dummy_rule,
    seeds=baseline_seeds,
    max_n=5000,
    k=2
)

# Thresholds = 95th percentile of the baseline gap distributions.
tau_ratio = float(np.percentile(baseline_df["ratio_gap"].values, 95))
tau_abs   = float(np.percentile(baseline_df["abs_gap"].values, 95))

print("\n=== Baseline threshold (P95) ===")
print("tau_ratio(P95):", tau_ratio)
print("tau_abs(P95):", tau_abs)

risk_rule = RiskRule(
    absolute_gap_threshold=float("inf"),        # abs is excluded from the alert trigger; use it later if real operation requires it
    ratio_gap_threshold=tau_ratio
)

# 6) 30 seeds + k=2 -> chosen so that they do not overlap with the baseline seeds
eval_seeds = [
    101, 127, 149, 193, 217,
    221, 247, 271, 301, 393,
    397, 371, 417, 441, 469,
    493, 519, 523, 571, 597,
    619, 651, 681, 711, 741,
    771, 801, 829, 861, 897
]
# Evaluation run against the P95-based rule; run_experiment sorts the
# per-seed results by seed, so the order of the list above does not matter.
df_meta2, per_seed_df, summary = run_experiment(
    golden_dir=GOLDEN_DIR,
    fp_rate_by_section=fp_rate_by_section,
    exposure_spec=exposure_spec,
    risk_rule=risk_rule,
    seeds=eval_seeds,
    max_n=5000,
    k=2
)

print("\n=== Summary ===")
print(summary)
print("\n=== Per-seed results ===")
print(per_seed_df)

# Write the results to CSV files
# 1. Per-seed results of the evaluation run
eval_result_path = os.path.join(
    OUTPUT_DIR,
    "eval_exposure_weighted_fp_gap_by_seed.csv"
)
per_seed_df.to_csv(eval_result_path, index=False, encoding="utf-8-sig")
print(f"[SAVED] Eval per-seed results → {eval_result_path}")

# 2. Save the baseline distribution (the basis of the P95 thresholds)
baseline_result_path = os.path.join(
    OUTPUT_DIR,
    "baseline_ratio_gap_distribution.csv"
)
baseline_df.to_csv(baseline_result_path, index=False, encoding="utf-8-sig")
print(f"[SAVED] Baseline distribution → {baseline_result_path}")

# 3. Also save the summary (a single row) as CSV (for the report)
summary_df = pd.DataFrame([summary])
summary_path = os.path.join(
    OUTPUT_DIR,
    "experiment_summary.csv"
)
summary_df.to_csv(summary_path, index=False, encoding="utf-8-sig")
print(f"[SAVED] Experiment summary → {summary_path}")

Mounted at /content/drive
Section counts:
section
사회       1005
세계        772
경제        729
정치        715
IT&과학     645
연예        598
생활&문화     536
Name: count, dtype: int64
5000

Exposure mean_by_section (scaled by counts):
{'사회': 1406.9999999999998, '세계': 1080.8, '경제': 1020.5999999999999, '정치': 1000.9999999999999, 'IT&과학': 902.9999999999999, '연예': 837.1999999999999, '생활&문화': 750.4}
Exposure std global: 300.0

=== Baseline threshold (P95) ===
tau_ratio(P95): 5.208998088394226
tau_abs(P95): 16963.39516834698

=== Summary ===
{'n_runs': 30, 'n_fail': 3, 'k': 2, 'stable_fail': np.True_, 'abs_gap_mean': 16708.781624772266, 'abs_gap_p95': 16939.514675909424, 'ratio_gap_mean': 5.09585807954303, 'ratio_gap_p95': 5.229005091554665}

=== Per-seed results ===
    seed       abs_gap  ratio_gap   fail max_section min_section
0    111  16559.515276   5.030147  False          사회       IT&과학
1    137  16917.013705   5.221068   True          사회       IT&과학
2    159  16661.680964   5.067454  False    