In [6]:
import os
import math
import pandas as pd
import json
import numpy as np
import torch
import torch.nn.functional as F
import random
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

from scipy.stats import skew, kurtosis
from sklearn.feature_extraction.text import TfidfVectorizer
from attack_text import *
from utils.metrics_utils import Metrics


# Restrict this process to physical GPU 3.
# NOTE(review): this assignment runs after `import torch`; presumably it still
# takes effect because CUDA has not been initialised yet — confirm, or move it
# above the torch import.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
# NOTE(review): the return value of this call is discarded.
torch.cuda.is_available()

# NOTE(review): `device` is not referenced in the visible cells; the model is
# placed by `device_map="auto"` below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): the `_7b` suffix on these names does not match the path, which
# points at a llama-13b checkpoint; the path is also absolute and machine-specific.
llm_path_7b = "/home/wanghuili/MIA-Spector/models/llama/llama-13b-hf"
tokenizer_7b = AutoTokenizer.from_pretrained(llm_path_7b, trust_remote_code=True)
model_7b = AutoModelForCausalLM.from_pretrained(llm_path_7b, device_map="auto", trust_remote_code=True)

Loading checkpoint shards: 100%|██████████| 3/3 [00:07<00:00,  2.60s/it]
Some parameters are on the meta device because they were offloaded to the cpu.


## Compute the thresholds

In [7]:
# Dataset identifier; it is passed to the dataset loader and embedded in the
# output file name below.
DATA_NAME = "WikiMIA_length32"
# Path of the JSON threshold config written by the last cell of this notebook.
OUT_JSON = f"llama_data/threshold_{DATA_NAME}.json"

In [8]:
# ======= Configuration =======
# Sample cap handed to DatasetLoader.load_dataset.
MAX_SAMPLES = 100
# Ratios used to build the mink / mink++ score key names.
MINKPP_RATIO = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
ALPHA = 0.01                          # target FPR (used for threshold_fpr_alpha)

# Key-name helper: keeps float-to-string formatting consistent across keys.
def rstr(r: float) -> str:
    """Render ``r`` with exactly one digit after the decimal point."""
    return format(r, ".1f")

# Build every score key that will be collected, one per ratio.
keys_minkpp = ["mink++_" + rstr(ratio) for ratio in MINKPP_RATIO]
keys_mink = ["mink_" + rstr(ratio) for ratio in MINKPP_RATIO]

scores_perplexity_1 = ["perplexity_1_mink++_" + rstr(ratio) for ratio in MINKPP_RATIO]
scores_perplexity_2 = ["perplexity_2_mink++_" + rstr(ratio) for ratio in MINKPP_RATIO]
scores_perplexity_3 = ["perplexity_3_mink++_" + rstr(ratio) for ratio in MINKPP_RATIO]
scores_perplexity_4 = ["perplexity_4_mink++_" + rstr(ratio) for ratio in MINKPP_RATIO]

ppl_scores_label = [
    "perplexity_variance",
    "perplexity_std",
    "perplexity_range",
    "perplexity_skewness",
    "perplexity_kurtosis",
]

# ======= Initialisation =======
cal = ScoreCalculator(model=model_7b, tokenizer=tokenizer_7b)
# NOTE(review): the variable name says "wiki128" but the dataset is whatever DATA_NAME selects.
data_wiki128 = DatasetLoader.load_dataset(f"{DATA_NAME}", MAX_SAMPLES)

# Containers: each key maps to its own (initially empty) list of per-sample scores.
scores_by_ratio = {key: [] for key in [*keys_minkpp, *keys_mink]}
scores_by_ppl = {key: [] for key in ppl_scores_label}
scores_by_ppl_1 = {key: [] for key in scores_perplexity_1}
scores_by_ppl_2 = {key: [] for key in scores_perplexity_2}
scores_by_ppl_3 = {key: [] for key in scores_perplexity_3}
scores_by_ppl_4 = {key: [] for key in scores_perplexity_4}
labels = []

def safe_get(d: dict, k: str, default=np.nan):
    """Return ``d[k]`` when ``k`` is present in ``d``; otherwise return ``default`` (NaN unless overridden)."""
    if k not in d:
        return default
    return d[k]

# ======= Scoring =======
# Pair each container with the keys it collects so a single inner loop fills them all.
collectors = [
    (scores_by_ratio, keys_minkpp),
    (scores_by_ratio, keys_mink),
    (scores_by_ppl, ppl_scores_label),
    (scores_by_ppl_1, scores_perplexity_1),
    (scores_by_ppl_2, scores_perplexity_2),
    (scores_by_ppl_3, scores_perplexity_3),
    (scores_by_ppl_4, scores_perplexity_4),
]

for d in tqdm(data_wiki128, desc="Scoring Samples"):
    text = d["input"]
    label = int(d["label"])
    # NOTE(review): the original comment here said the list of required ratios
    # must be passed in, yet only `text` is passed — confirm that
    # calculate_scores' defaults cover every ratio in MINKPP_RATIO.
    s, _ = cal.calculate_scores(text)

    labels.append(label)

    # Keys missing from the score dict are recorded as NaN by safe_get.
    for bag, bag_keys in collectors:
        for k in bag_keys:
            bag[k].append(safe_get(s, k))

Scoring Samples: 100%|██████████| 100/100 [10:32<00:00,  6.33s/it]


In [9]:
# Per-metric direction overrides looked up by metric_direction(). pack_metric()
# negates the scores of any metric whose direction is not +1 before computing
# its statistics, so -1 here means "smaller raw score is more member-like".
CUSTOM_DIRECTIONS = {
    "perplexity_variance": -1,
    "perplexity_std" : -1,
    "perplexity_range" : -1,
    "perplexity_skewness" : -1,
    "perplexity_kurtosis" : -1
}

# Convert the collected label list into an integer array so boolean masks can index it.
labels = np.asarray(labels, dtype=int)
# Boolean mask of label-0 samples.
# NOTE(review): labels_non is not referenced in the visible cells.
labels_non = (labels == 0)

# ======= Helper: threshold_fpr_alpha from the non-member score distribution =======
def threshold_from_nonmember(non_scores, direction_sign, alpha):
    """Return the quantile of the finite non-member scores that corresponds to ``alpha``.

    Args:
        non_scores: Scores of non-member samples; non-finite entries are ignored.
        direction_sign: ``>= 0`` means larger scores are more member-like
            (the ``1 - alpha`` quantile is used); otherwise the ``alpha``
            quantile is used.
        alpha: Target false-positive rate.

    Returns:
        The threshold as a float, or NaN when no finite score is available.
    """
    finite = np.asarray(non_scores, dtype=float)
    finite = finite[np.isfinite(finite)]
    if not finite.size:
        return np.nan
    level = 1.0 - alpha if direction_sign >= 0 else alpha
    return float(np.quantile(finite, level))

# ======= Helper: metric direction (every metric is positive unless overridden in CUSTOM_DIRECTIONS) =======
def metric_direction(metric_name: str) -> int:
    """Return the direction sign of a metric.

    Args:
        metric_name: Name of the metric, e.g. ``"mink++_0.1"``.

    Returns:
        The value registered for ``metric_name`` in ``CUSTOM_DIRECTIONS``,
        or ``+1`` (larger score is more member-like) when there is no override.
    """
    # The former prefix checks ("mink", "mink++", "perplexity_") all returned +1,
    # exactly like the fallback, so a single lookup with a default is equivalent.
    return CUSTOM_DIRECTIONS.get(metric_name, +1)

# Top-level payload that is later serialised to OUT_JSON. "items" starts empty
# and receives one pack_metric() result per metric.
config = {
    "data_name": DATA_NAME,
    "alpha": ALPHA,
    # Counts are taken over all labels, i.e. before pack_metric() drops
    # samples whose score is not finite for a given metric.
    "counts": {
        "total": int(len(labels)),
        "member": int((labels == 1).sum()),
        "non_member": int((labels == 0).sum())
    },
    "items": []
}

from typing import Dict, Any, List, Optional
import numpy as np

# --- ECDF: size-bounded so the stored config stays small ---
def build_ecdf(values: np.ndarray, max_points: int = 2048) -> Dict[str, List[float]]:
    """Build an empirical CDF of ``values`` with at most ``max_points`` points.

    Args:
        values: One-dimensional array-like of scores (lists are accepted).
        max_points: Upper bound on the number of stored points. When the input
            is longer, evenly spaced order statistics are kept.

    Returns:
        ``{"xs": [...], "cdf": [...]}`` where ``xs`` is sorted ascending and
        ``cdf[i]`` is the fraction of all input values that are ``<= xs[i]``
        by rank. Both lists are empty for empty input.
    """
    x = np.sort(np.asarray(values, dtype=float))
    n = len(x)
    if n == 0:
        return {"xs": [], "cdf": []}
    # 1-based rank of every order statistic in the full sample.
    ranks = np.arange(1, n + 1)
    if n > max_points:
        keep = np.linspace(0, n - 1, num=max_points).astype(int)
        x = x[keep]
        # Keep the original ranks of the retained points: renumbering them
        # 1..max_points would overstate the CDF at every point but the last.
        ranks = ranks[keep]
    cdf = ranks / float(n)
    return {"xs": x.tolist(), "cdf": cdf.tolist()}

# --- Platt calibration (uses scikit-learn; skipped when it cannot be imported) ---
def fit_platt(scores: np.ndarray, labels: np.ndarray) -> Optional[Dict[str, float]]:
    """Fit a one-feature logistic regression mapping scores to labels.

    Args:
        scores: Array of scores; reshaped to a single feature column.
        labels: Array of class labels, cast to int.

    Returns:
        ``{"a": coefficient, "b": intercept}`` of the fitted model, or ``None``
        when scikit-learn cannot be imported or fewer than two distinct labels
        are present.
    """
    try:
        from sklearn.linear_model import LogisticRegression
    except Exception:
        return None
    features = scores.reshape(-1, 1).astype(float)
    targets = labels.astype(int)
    # A classifier cannot be fitted on a single class.
    if np.unique(targets).size < 2:
        return None
    model = LogisticRegression(solver="lbfgs").fit(features, targets)
    return {"a": float(model.coef_[0, 0]), "b": float(model.intercept_[0])}

# --- Non-member threshold (the scores passed in are assumed to be already oriented so that larger means more member-like) ---
def threshold_from_nonmember_posdir(non_scores_posdir: np.ndarray, alpha: float) -> float:
    """Return the ``1 - alpha`` quantile of the finite non-member scores.

    Args:
        non_scores_posdir: Non-member scores in the positive direction;
            non-finite entries are ignored.
        alpha: Target false-positive rate.

    Returns:
        The threshold as a float, or NaN when no finite score remains.
    """
    finite = np.asarray(non_scores_posdir, dtype=float)
    finite = finite[np.isfinite(finite)]
    if not finite.size:
        return np.nan
    # Larger is more member-like, so FPR = alpha corresponds to the (1 - alpha) quantile.
    upper_level = 1.0 - alpha
    return float(np.quantile(finite, upper_level))

def _restore_direction(value, positive: bool):
    """Map a positive-space threshold back to the raw score direction; ``None`` passes through."""
    if value is None:
        return None
    return float(value) if positive else -float(value)


def pack_metric(name: str, scores_list: list):
    """Summarise one metric into a JSON-serialisable config entry.

    Reads the module-level ``labels`` array (it must already be a NumPy array
    aligned index-by-index with ``scores_list``) and the module-level ``ALPHA``.

    Args:
        name: Metric name; its direction is looked up with ``metric_direction``.
        scores_list: Per-sample raw scores. Non-finite entries are dropped
            together with their labels.

    Returns:
        A dict with the metric name, direction sign, AUROC / FPR / TPR figures,
        the best-J and FPR=alpha thresholds (expressed in the raw score
        direction), distribution statistics, ECDFs and the Platt calibrator;
        or ``None`` when there are no scores or fewer than two classes remain
        after dropping non-finite scores.
    """
    # Raw (un-flipped) scores.
    scores_raw = np.asarray(scores_list, dtype=float)
    # An empty list cannot be aligned with `labels` (boolean indexing would
    # raise), and there is nothing to report for it anyway.
    if scores_raw.size == 0:
        return None
    mask = np.isfinite(scores_raw)
    scores_raw = scores_raw[mask]
    labs = labels[mask]
    if scores_raw.size == 0 or np.unique(labs).size < 2:
        return None

    # All statistics / thresholds / AUC are computed in "positive space"
    # (larger = more member-like). The same sign test drives both the flip and
    # the stored "direction" field so the two can never disagree.
    dire = metric_direction(name)
    positive = dire >= 0
    scores_posdir = scores_raw if positive else -scores_raw

    # === 1) Curves and thresholds (existing Metrics helper, fed positive-space scores) ===
    # The helper does not take a larger_is_positive argument.
    out = Metrics.calculate_metrics(scores_posdir, labs)
    if len(out) == 6:
        auroc, fpr95, tpr05, fpr_list, tpr_list, best_thresh_posdir = out
        best_j = None
    elif len(out) >= 7:
        auroc, fpr95, tpr05, fpr_list, tpr_list, best_thresh_posdir, best_j = out[:7]
    else:
        auroc, fpr95, tpr05 = out[:3]
        best_thresh_posdir, best_j = (np.nan, None)

    # Express the best-J threshold in the raw direction (readable and stored
    # consistently). A None threshold stays None instead of being negated.
    best_thresh_raw = _restore_direction(best_thresh_posdir, positive)

    # === 2) Non-member threshold @ FPR=alpha ((1 - alpha) quantile in positive space, then restored) ===
    non_scores_posdir = scores_posdir[labs == 0]
    tau_alpha_posdir = threshold_from_nonmember_posdir(non_scores_posdir, ALPHA)
    tau_alpha_raw = _restore_direction(tau_alpha_posdir, positive)

    # === 3) Summary statistics of both class distributions (positive space) ===
    memb = scores_posdir[labs == 1]
    nonm = non_scores_posdir
    quantile_levels = [0.01, 0.05, 0.5, 0.95, 0.99]
    stats = {
        "posdir_means":   {"member": float(np.mean(memb)) if memb.size else np.nan,
                           "non_member": float(np.mean(nonm)) if nonm.size else np.nan},
        "posdir_stds":    {"member": float(np.std(memb)) if memb.size else np.nan,
                           "non_member": float(np.std(nonm)) if nonm.size else np.nan},
        "posdir_quantiles": {
            "member":     {q: float(np.quantile(memb, q)) for q in quantile_levels} if memb.size else {},
            "non_member": {q: float(np.quantile(nonm, q)) for q in quantile_levels} if nonm.size else {}
        }
    }

    # === 4) ECDFs (positive space; intended for online p-values / confidence) ===
    ecdf = {
        "non_member_posdir_ecdf": build_ecdf(nonm),
        # Optional: the member ECDF is stored too, for likelihood-ratio / posterior use.
        "member_posdir_ecdf": build_ecdf(memb) if memb.size else {"xs": [], "cdf": []}
    }

    # === 5) Platt calibration (positive space; p = sigmoid(a * s_posdir + b)) ===
    calib = fit_platt(scores_posdir, labs)  # may be None

    # === 6) Assemble (thresholds in the raw direction; the sign is kept for online comparisons) ===
    return {
        "metric": name,
        "direction": "+" if positive else "-",
        "AUROC": float(auroc),
        "FPR@TPR=95%": float(fpr95),
        "TPR@FPR=5%": float(tpr05),
        "threshold_bestJ": best_thresh_raw,
        "threshold_fpr_alpha": tau_alpha_raw,
        "Youden_J": (float(best_j) if best_j is not None else None),
        "stats": stats,
        "ecdf": ecdf,
        "calibrator": calib  # {"a": ..., "b": ...} or None
    }



### Build the threshold config and save it to JSON

In [10]:
# Results are gathered in a fresh list and assigned at the end, so re-running
# this cell replaces config["items"] instead of appending duplicate entries.
items = []

# 1) mink++ / mink
for ratio in MINKPP_RATIO:
    r = rstr(ratio)
    for name in (f"mink++_{r}", f"mink_{r}"):
        cfg = pack_metric(name, scores_by_ratio[name])
        if cfg:
            items.append(cfg)

# 2) ppl × mink++
ppl_bags = [scores_by_ppl_1, scores_by_ppl_2, scores_by_ppl_3, scores_by_ppl_4]
for ratio in MINKPP_RATIO:
    r = rstr(ratio)
    for i, bag in enumerate(ppl_bags, start=1):
        name = f"perplexity_{i}_mink++_{r}"
        cfg = pack_metric(name, bag.get(name, []))
        if cfg:
            items.append(cfg)

# 3) Other targets (perplexity distribution statistics)
for name in ppl_scores_label:
    cfg = pack_metric(name, scores_by_ppl[name])
    if cfg:
        items.append(cfg)

config["items"] = items

# Create the output directory on a fresh checkout so open() does not fail.
out_dir = os.path.dirname(OUT_JSON)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# NOTE: json.dump writes NaN floats as the non-standard token `NaN`.
with open(OUT_JSON, "w", encoding="utf-8") as f:
    json.dump(config, f, ensure_ascii=False, indent=2)

print(f"✅ threshold config saved to {OUT_JSON}")

✅ threshold config saved to llama_data/threshold_WikiMIA_length32.json
