In [1]:
# ============================================
# CLIP (ViT-L/14) → Linear SVM
# 支援兩種來源：
#  1) 每個 .npy = 一張圖片 (HxW或HxWx3)
#  2) 每個 .npy = 一個 CLIP 向量 (例如 768/1024 維)
# 會自動偵測；也可手動指定 FORCE_MODE = 'image' 或 'feature'
# 目錄：
#  Script/
#    features/
#      clip_real/*.npy
#      clip_fake/*.npy
#    saved_models/   (輸出)
# ============================================

# !pip -q install open_clip_torch scikit-learn pillow tqdm joblib

import os, glob, json, math, time, random, gc
from pathlib import Path
import numpy as np
from PIL import Image
from tqdm import tqdm

import torch
import open_clip
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix
import joblib

# ---------- Config ----------
SCRIPT_ROOT = "/home/yaya/ai-detect-proj/Script"          # <<< 改成你的 Script 目錄
CLIP_REAL_DIR = os.path.join(SCRIPT_ROOT, "features_npy", "clip_real_npy")
CLIP_FAKE_DIR = os.path.join(SCRIPT_ROOT, "features_npy", "clip_fake_npy")
OUTPUT_DIR   = os.path.join(SCRIPT_ROOT, "saved_models")
os.makedirs(OUTPUT_DIR, exist_ok=True)

MODEL_NAME = "ViT-L-14"
PRETRAINED = "openai"
BATCH_SIZE = 64
VAL_SIZE = 0.2
RANDOM_SEED = 1337

# 自動/手動模式：'auto' | 'image' | 'feature'
FORCE_MODE = 'feature'  # <<< 改成 'image' 或 'feature' 以強制指定模式

random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)


  from .autonotebook import tqdm as notebook_tqdm


Device: cuda


In [2]:

# ---------- Utils ----------
def list_npy(folder):
    files = sorted(glob.glob(os.path.join(folder, "*.npy")))
    assert len(files) > 0, f"No .npy in {folder}"
    return files

def npy_to_rgb(arr: np.ndarray) -> np.ndarray:
    arr = np.asarray(arr)
    # channel-first 轉 HWC
    if arr.ndim == 3 and arr.shape[0] in (1,3) and arr.shape[-1] not in (1,3):
        arr = np.transpose(arr, (1,2,0))
    if arr.ndim == 2:
        arr = np.stack([arr]*3, axis=-1)
    if arr.dtype != np.uint8:
        arr = np.clip(arr, 0, 1)
        arr = (arr * 255.0).round().astype(np.uint8)
    if arr.shape[-1] == 1:
        arr = np.repeat(arr, 3, axis=-1)
    return arr

def detect_mode(sample_path):
    a = np.load(sample_path, allow_pickle=True)
    # 影像常見：2D或3D且 min(H,W) >= 64；向量常見：1D，或2D但其中一維很小
    is_image = (
        (a.ndim == 2 and min(a.shape) >= 64) or
        (a.ndim == 3 and (a.shape[-1] in (1,3) or a.shape[0] in (1,3)) and max(a.shape[:2]) >= 64)
    )
    return 'image' if is_image else 'feature'

def save_json(obj, path):
    with open(path, "w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)




In [3]:

# ---------- Load splits from JSON ----------
SPLITS_JSON = os.path.join(OUTPUT_DIR, "splits_clip_feature_iid_ood.json")  # 如有不同檔名請改這行
assert os.path.isfile(SPLITS_JSON), f"找不到 splits json：{SPLITS_JSON}"

with open(SPLITS_JSON, "r", encoding="utf-8") as f:
    SPLITS = json.load(f)["splits"]

train_paths = SPLITS.get("train", [])
val_paths   = SPLITS.get("val", [])
test_iid    = SPLITS.get("test_iid", [])
test_ood    = SPLITS.get("test_ood", [])

def label_from_path(p: str) -> int:
    # 以資料夾判斷：在 clip_real_npy → 0，否則 1
    pnorm = Path(p).as_posix()
    return 0 if ("/clip_real_npy/" in pnorm or pnorm.startswith(Path(CLIP_REAL_DIR).as_posix())) else 1

y_tr = [label_from_path(p) for p in train_paths]
y_va = [label_from_path(p) for p in val_paths]
y_ti = [label_from_path(p) for p in test_iid] if test_iid else []
y_to = [label_from_path(p) for p in test_ood] if test_ood else []

print(f"Train={len(train_paths)}  Val={len(val_paths)}  Test-IID={len(test_iid)}  Test-OOD={len(test_ood)}")

# 依 splits 決定偵測模式（取第一個可用檔）
_first = (train_paths or val_paths or test_iid or test_ood)[0]
mode = FORCE_MODE if FORCE_MODE != 'auto' else detect_mode(_first)
print("Detected mode:", mode)

# ---------- If image mode: load CLIP and extractor ----------
if mode == 'image':
    print("Loading CLIP:", MODEL_NAME, PRETRAINED)
    clip_model, _, clip_preprocess = open_clip.create_model_and_transforms(
        MODEL_NAME, pretrained=PRETRAINED
    )
    clip_model = clip_model.to(device).eval()

    @torch.no_grad()
    def clip_embed_batch(np_imgs):
        pil_list = [Image.fromarray(x, mode="RGB") for x in np_imgs]
        tensors = [clip_preprocess(img).unsqueeze(0) for img in pil_list]
        imgs = torch.cat(tensors, dim=0).to(device, non_blocking=True)
        feats = clip_model.encode_image(imgs)
        feats = feats / feats.norm(dim=-1, keepdim=True)
        return feats.float().cpu().numpy()

    def load_feature_vec(p):  # 影像模式下此函式不會用到，但為了 predict_one 一致保留
        v = np.asarray(np.load(p, allow_pickle=True)).astype(np.float32).reshape(-1)
        n = np.linalg.norm(v) + 1e-12
        return v / n

    def extract_features(paths, batch=BATCH_SIZE):
        X=[]
        for i in tqdm(range(0, len(paths), batch), desc="CLIP feats"):
            chunk = [npy_to_rgb(np.load(p, allow_pickle=True)) for p in paths[i:i+batch]]
            X.append(clip_embed_batch(chunk))
            del chunk
            if device == 'cuda': torch.cuda.empty_cache()
        return np.vstack(X)

else:
    # feature mode：每個 .npy 就是一個向量；做 L2 normalize 以對齊 CLIP 特徵慣例
    def load_feature_vec(p):
        v = np.asarray(np.load(p, allow_pickle=True)).astype(np.float32)
        v = v.reshape(-1)  # 支援 (d,) 或 (1,d) 等
        n = np.linalg.norm(v) + 1e-12
        return v / n

    def extract_features(paths, batch=None):
        return np.vstack([load_feature_vec(p) for p in tqdm(paths, desc="Load feats")])

# ---------- Optional: per-class cap（用在 train/val；測試不動） ----------
CAP_TRAIN_PER_CLASS = None   # 例如 70000；None 表示不限制
CAP_VAL_PER_CLASS   = None
AUTO_BALANCE        = True   # True 且 CAP_TRAIN_PER_CLASS=None 時，用少數類樣本數當 cap

def cap_per_class(paths, labels, cap, seed=RANDOM_SEED):
    if cap is None: 
        return paths, labels
    rng = np.random.default_rng(seed)
    idx = np.arange(len(paths))
    idx0 = idx[np.array(labels) == 0]
    idx1 = idx[np.array(labels) == 1]

    def pick(idxs):
        if len(idxs) <= cap: 
            return idxs
        return rng.choice(idxs, size=cap, replace=False)

    keep = np.concatenate([pick(idx0), pick(idx1)])
    rng.shuffle(keep)
    return [paths[i] for i in keep], [labels[i] for i in keep]

if AUTO_BALANCE and CAP_TRAIN_PER_CLASS is None:
    n0 = sum(np.array(y_tr) == 0)
    n1 = sum(np.array(y_tr) == 1)
    CAP_TRAIN_PER_CLASS = min(n0, n1)

train_paths, y_tr = cap_per_class(train_paths, y_tr, CAP_TRAIN_PER_CLASS)
val_paths,   y_va = cap_per_class(val_paths,   y_va, CAP_VAL_PER_CLASS)

print(f"Train (after cap): total={len(train_paths)} | real={sum(np.array(y_tr)==0)} fake={sum(np.array(y_tr)==1)} "
      f"| cap_per_class={CAP_TRAIN_PER_CLASS}")
print(f"Val   (after cap): total={len(val_paths)} | real={sum(np.array(y_va)==0)} fake={sum(np.array(y_va)==1)} "
      f"| cap_per_class={CAP_VAL_PER_CLASS}")

# ---------- Feature Extraction ----------
X_tr = extract_features(train_paths)
X_va = extract_features(val_paths)
assert X_tr.shape[1] == X_va.shape[1], "Feature dim mismatch between train/val"

# ---------- Train SVM ----------
svc = LinearSVC(C=1.0, class_weight="balanced", max_iter=20000, tol=1e-4, dual=False)
print("Fitting LinearSVC...")
svc.fit(X_tr, y_tr)

from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix

def evaluate(svc, X, y, name="Split"):
    if X is None or len(y) == 0:
        print(f"[{name}] (empty)")
        return float("nan"), float("nan")
    scores = svc.decision_function(X)   # >0 => fake
    preds  = (scores > 0).astype(int)
    acc = accuracy_score(y, preds)
    try:
        auc = roc_auc_score(y, scores)
    except ValueError:
        auc = float("nan")
    print(f"[{name}] Acc={acc:.4f} | AUC={auc:.4f}")
    print(confusion_matrix(y, preds))
    print(classification_report(y, preds, target_names=["real(0)","fake(1)"]))
    return acc, auc

print("=== Train ==="); evaluate(svc, X_tr, y_tr, "Train")
print("=== Val ===");   evaluate(svc, X_va, y_va, "Val")

# 測試（若 JSON 有提供）
X_ti = extract_features(test_iid) if test_iid else None
X_to = extract_features(test_ood) if test_ood else None
if X_ti is not None: evaluate(svc, X_ti, y_ti, "Test-IID")
if X_to is not None: evaluate(svc, X_to, y_to, "Test-OOD")

# ---------- Save ----------
stamp = time.strftime("%Y%m%d_%H%M%S")
model_path = os.path.join(OUTPUT_DIR, f"clip_linear_svm_{mode}_{stamp}.joblib")
joblib.dump(svc, model_path)
meta = {
    "pipeline": "CLIP→LinearSVC",
    "mode": mode,
    "clip_backbone": MODEL_NAME if mode=='image' else "precomputed",
    "pretrained": PRETRAINED if mode=='image' else None,
    "feature_dim": int(X_tr.shape[1]),
    "val_count": len(y_va),
    "train_count": len(y_tr),
    "test_iid_count": len(y_ti) if y_ti else 0,
    "test_ood_count": len(y_to) if y_to else 0,
    "folders": {"real": CLIP_REAL_DIR, "fake": CLIP_FAKE_DIR},
    "splits_json": SPLITS_JSON,
    "random_seed": RANDOM_SEED,
    "created_at": time.strftime("%Y-%m-%d %H:%M:%S")
}
save_json(meta, os.path.join(OUTPUT_DIR, f"clip_linear_svm_meta_{stamp}.json"))
print("Saved:", model_path)

# ---------- Inference helpers（維持相容；會使用上方的 mode 與函式） ----------
@torch.no_grad()
def predict_one(npy_path, svc, mode='auto'):
    mode_local = detect_mode(npy_path) if mode == 'auto' else mode
    if mode_local == 'image':
        global clip_model, clip_preprocess
        if 'clip_model' not in globals():
            clip_model, _, clip_preprocess = open_clip.create_model_and_transforms(
                MODEL_NAME, pretrained=PRETRAINED
            )
            clip_model = clip_model.to(device).eval()
        img = npy_to_rgb(np.load(npy_path, allow_pickle=True))
        pil = Image.fromarray(img, mode="RGB")
        x = clip_preprocess(pil).unsqueeze(0).to(device)
        f = clip_model.encode_image(x); f = f / f.norm(dim=-1, keepdim=True)
        v = f.cpu().numpy()
    else:
        v = load_feature_vec(npy_path)[None, :]
    score = svc.decision_function(v)[0]     # >0 => fake
    prob_like_fake = 1 / (1 + math.exp(-score))
    return score, prob_like_fake, int(score > 0)


Train=4  Val=4  Test-IID=4  Test-OOD=4


KeyError: 0

In [8]:
# --- 重新取各 split 的分數（若你還在記憶體就跳過重取） ---
def get_scores(paths):
    X = extract_features(paths)
    return svc.decision_function(X)

scores_va = get_scores(val_paths)
scores_ti = get_scores(test_iid) if test_iid else None
scores_to = get_scores(test_ood) if test_ood else None
y_va = np.array([0 if "/clip_real_npy/" in Path(p).as_posix() else 1 for p in val_paths])
y_ti = np.array([0 if "/clip_real_npy/" in Path(p).as_posix() else 1 for p in test_iid]) if test_iid else None
y_to = np.array([0 if "/clip_real_npy/" in Path(p).as_posix() else 1 for p in test_ood]) if test_ood else None

from sklearn.metrics import roc_curve, f1_score, accuracy_score

# 1) Youden's J：最大化 (TPR - FPR)
fpr, tpr, thr = roc_curve(y_va, scores_va)
j = tpr - fpr
t_best = thr[j.argmax()]

# 2) 或者用驗證集最大 F1 的門檻
def best_f1_threshold(y, s):
    qs = np.quantile(s, np.linspace(0.01, 0.99, 99))  # 掃一圈
    f1s = [f1_score(y, (s>q).astype(int)) for q in qs]
    return qs[int(np.argmax(f1s))]

t_f1 = best_f1_threshold(y_va, scores_va)
print("t_best(J) =", float(t_best), " | t_f1 =", float(t_f1))

def eval_with_threshold(name, y, s, t):
    pred = (s > t).astype(int)
    acc = accuracy_score(y, pred)
    print(f"[{name}] thr={t:.4f} | acc={acc:.4f}")
    print(confusion_matrix(y, pred))
    print(classification_report(y, pred, digits=4))

print("\n== 用驗證集 Youden J 門檻 ==")
eval_with_threshold("Val", y_va, scores_va, t_best)
if scores_ti is not None: eval_with_threshold("Test-IID", y_ti, scores_ti, t_best)
if scores_to is not None: eval_with_threshold("Test-OOD", y_to, scores_to, t_best)

print("\n== 用驗證集 F1 門檻 ==")
eval_with_threshold("Val", y_va, scores_va, t_f1)
if scores_ti is not None: eval_with_threshold("Test-IID", y_ti, scores_ti, t_f1)
if scores_to is not None: eval_with_threshold("Test-OOD", y_to, scores_to, t_f1)


Load feats: 100%|██████████| 18000/18000 [00:00<00:00, 18629.67it/s]
Load feats: 100%|██████████| 18000/18000 [00:00<00:00, 19820.37it/s]
Load feats: 100%|██████████| 43989/43989 [00:02<00:00, 18474.51it/s]


t_best(J) = -0.02346035780521083  | t_f1 = -0.09413094176667464

== 用驗證集 Youden J 門檻 ==
[Val] thr=-0.0235 | acc=0.9766
[[ 5953    47]
 [  374 11626]]
              precision    recall  f1-score   support

           0     0.9409    0.9922    0.9658      6000
           1     0.9960    0.9688    0.9822     12000

    accuracy                         0.9766     18000
   macro avg     0.9684    0.9805    0.9740     18000
weighted avg     0.9776    0.9766    0.9768     18000

[Test-IID] thr=-0.0235 | acc=0.9784
[[ 5956    44]
 [  344 11656]]
              precision    recall  f1-score   support

           0     0.9454    0.9927    0.9685      6000
           1     0.9962    0.9713    0.9836     12000

    accuracy                         0.9784     18000
   macro avg     0.9708    0.9820    0.9760     18000
weighted avg     0.9793    0.9784    0.9786     18000

[Test-OOD] thr=-0.0235 | acc=0.4761
[[ 1968 23021]
 [   27 18973]]
              precision    recall  f1-score   support

       

In [9]:
from sklearn.calibration import CalibratedClassifierCV
# 用現成的 svc 做 sigmoid 校準（預設將內部分成 CV=5，也可改 cv="prefit" + 另外保留的 val）
cal = CalibratedClassifierCV(svc, method="sigmoid", cv=5)
cal.fit(X_tr, y_tr)

def eval_cal(name, paths, y):
    if not paths: 
        print(f"[{name}] (empty)"); return
    X = extract_features(paths)
    proba = cal.predict_proba(X)[:,1]  # P(fake)
    pred  = (proba >= 0.5).astype(int) # 可改 0.5→驗證集找最佳
    acc = accuracy_score(y, pred)
    try:
        auc = roc_auc_score(y, proba)
    except ValueError:
        auc = float("nan")
    print(f"[{name}] acc={acc:.4f} auc={auc:.4f}")
    print(confusion_matrix(y, pred))
    print(classification_report(y, pred, digits=4))

print("\n== 校準後（sigmoid） ==")
eval_cal("Val",      val_paths,   y_va)
if test_iid: eval_cal("Test-IID", test_iid, y_ti)
if test_ood: eval_cal("Test-OOD", test_ood, y_to)



== 校準後（sigmoid） ==


Load feats: 100%|██████████| 18000/18000 [00:00<00:00, 22629.06it/s]


[Val] acc=0.9765 auc=0.9891
[[ 5950    50]
 [  373 11627]]
              precision    recall  f1-score   support

           0     0.9410    0.9917    0.9657      6000
           1     0.9957    0.9689    0.9821     12000

    accuracy                         0.9765     18000
   macro avg     0.9684    0.9803    0.9739     18000
weighted avg     0.9775    0.9765    0.9766     18000



Load feats: 100%|██████████| 18000/18000 [00:00<00:00, 21446.23it/s]


[Test-IID] acc=0.9786 auc=0.9901
[[ 5954    46]
 [  339 11661]]
              precision    recall  f1-score   support

           0     0.9461    0.9923    0.9687      6000
           1     0.9961    0.9718    0.9838     12000

    accuracy                         0.9786     18000
   macro avg     0.9711    0.9820    0.9762     18000
weighted avg     0.9794    0.9786    0.9787     18000



Load feats: 100%|██████████| 43989/43989 [00:01<00:00, 22527.90it/s]


[Test-OOD] acc=0.4748 auc=0.8461
[[ 1911 23078]
 [   25 18975]]
              precision    recall  f1-score   support

           0     0.9871    0.0765    0.1419     24989
           1     0.4512    0.9987    0.6216     19000

    accuracy                         0.4748     43989
   macro avg     0.7192    0.5376    0.3818     43989
weighted avg     0.7556    0.4748    0.3491     43989



In [None]:
# ---------- (取代原本 "Gather paths" 區塊；其餘程式碼保留) ----------

# 讓每個資料集前綴（dataset tag）在各自類別內（real / fake）數量均衡
BALANCE_BY_DATASET   = True                 # 是否啟用依資料集均衡
BALANCE_STRATEGY     = 'undersample'        # 'undersample' | 'oversample' | 'none'
PER_DATASET_CAP_REAL = None                 # 每個資料集在 real 的上限；None 表示用各群組的最小值
PER_DATASET_CAP_FAKE = None                 # 每個資料集在 fake 的上限；None 表示用各群組的最小值
MIN_KEEP_PER_GROUP   = 1                    # 群組太小可丟棄（小於此值就不保留）

# 可選：把常見別名合併（自己再加）
DATASET_ALIASES = {
    'imagenet1k': 'imagenet', 'imgnet': 'imagenet',
    'sd': 'stablediffusion', 'stable-diffusion': 'stablediffusion', 'sdxl': 'sdxl',
    'mj': 'midjourney', 'midj': 'midjourney',
    'dalle3': 'dalle-3', 'dalle2': 'dalle-2','FLUX': 'flux'
}

def canonical_tag(tag: str) -> str:
    tag = tag.lower().strip()
    return DATASET_ALIASES.get(tag, tag)

def infer_dataset_tag(path: str, is_real: bool) -> str:
    """
    從檔名最前段推測資料集前綴；支援分隔符：'__','---','--','_','-',' '
    例：'sdxl__xxxx.npy' → 'sdxl'；'midjourney-000123.npy' → 'midjourney'
    若真實圖片檔名沒有前綴或就是序號，預設標成 'imagenet'
    """
    stem = Path(path).stem
    seps = ['__','---','--','_','-',' ']
    cut = None
    for s in seps:
        i = stem.find(s)
        if i != -1:
            cut = i if cut is None else min(cut, i)
    tag = stem[:cut] if cut is not None else stem
    tag = tag.strip()
    # 若沒有合理前綴或是純數字 → 真實圖片視為 imagenet；否則標 unknown
    if (not tag) or tag.isdigit():
        tag = 'imagenet' if is_real else 'unknown'
    return canonical_tag(tag)

def summarize_counts(paths, is_real: bool):
    counts = {}
    for p in paths:
        t = infer_dataset_tag(p, is_real=is_real)
        counts[t] = counts.get(t, 0) + 1
    return dict(sorted(counts.items(), key=lambda kv: (-kv[1], kv[0])))

def balance_paths(paths, is_real: bool, strategy='undersample', per_dataset_cap=None, min_keep=1):
    """
    依資料集前綴做均衡抽樣；回傳抽樣後的路徑清單
    - undersample: 各群組下砍到 cap
    - oversample: 各群組抽樣（可重複）到 cap
    - none: 不動
    """
    if strategy == 'none':
        return paths

    groups = {}
    for idx, p in enumerate(paths):
        t = infer_dataset_tag(p, is_real=is_real)
        groups.setdefault(t, []).append(idx)

    # 可選：丟棄過小群組
    groups = {t: idxs for t, idxs in groups.items() if len(idxs) >= min_keep}
    if not groups:
        return []

    if per_dataset_cap is None:
        cap = min(len(idxs) for idxs in groups.values())  # 用最小群作標準
    else:
        cap = int(per_dataset_cap)
        if cap <= 0:
            raise ValueError("per_dataset_cap must be positive.")

    rng = np.random.default_rng(RANDOM_SEED)
    selected_indices = []

    if strategy == 'undersample':
        for t, idxs in groups.items():
            k = min(cap, len(idxs))
            choose = rng.choice(idxs, size=k, replace=False)
            selected_indices.extend(choose.tolist())
    elif strategy == 'oversample':
        for t, idxs in groups.items():
            k = max(cap, len(idxs))
            choose = rng.choice(idxs, size=k, replace=True)
            selected_indices.extend(choose.tolist())
    else:
        raise ValueError("BALANCE_STRATEGY must be 'undersample' | 'oversample' | 'none'.")

    # 保持原本順序輕度穩定（也可打亂）
    selected_indices = sorted(set(selected_indices))
    return [paths[i] for i in selected_indices]

# 讀取原始路徑
real_paths_all = list_npy(CLIP_REAL_DIR)
fake_paths_all = list_npy(CLIP_FAKE_DIR)

print("【原始分佈（real）】", summarize_counts(real_paths_all, is_real=True))
print("【原始分佈（fake）】", summarize_counts(fake_paths_all, is_real=False))

# 依資料集均衡
if BALANCE_BY_DATASET:
    real_paths = balance_paths(real_paths_all, is_real=True,
                               strategy=BALANCE_STRATEGY,
                               per_dataset_cap=PER_DATASET_CAP_REAL,
                               min_keep=MIN_KEEP_PER_GROUP)
    fake_paths = balance_paths(fake_paths_all, is_real=False,
                               strategy=BALANCE_STRATEGY,
                               per_dataset_cap=PER_DATASET_CAP_FAKE,
                               min_keep=MIN_KEEP_PER_GROUP)
else:
    real_paths, fake_paths = real_paths_all, fake_paths_all

print("【均衡後分佈（real）】", summarize_counts(real_paths, is_real=True))
print("【均衡後分佈（fake）】", summarize_counts(fake_paths, is_real=False))

# 合併與標籤
all_paths  = real_paths + fake_paths
all_labels = [0]*len(real_paths) + [1]*len(fake_paths)
print(f"Total: {len(all_paths)} | real={len(real_paths)} fake={len(fake_paths)}")

# 自動/手動模式偵測
mode = FORCE_MODE if FORCE_MODE != 'auto' else detect_mode(all_paths[0])
print("Detected mode:", mode)

# 可選：把分佈存檔，方便追蹤實驗
save_json({
    "before": {
        "real": summarize_counts(real_paths_all, True),
        "fake": summarize_counts(fake_paths_all, False)
    },
    "after": {
        "real": summarize_counts(real_paths, True),
        "fake": summarize_counts(fake_paths, False)
    },
    "strategy": BALANCE_STRATEGY,
    "cap_real": PER_DATASET_CAP_REAL,
    "cap_fake": PER_DATASET_CAP_FAKE
}, os.path.join(OUTPUT_DIR, "dataset_balance_clip.json"))
# ---------- (區塊結束) ----------


In [1]:
# ============================================
# Train CLIP (precomputed features) → Linear SVM using your new splits JSON
# - expects JSON structure:
#   splits[split_name]["clip"]["real" or "fake"] = list of .npy paths
# ============================================

# !pip -q install scikit-learn tqdm joblib

import os, json, math, time, numpy as np
from pathlib import Path
from tqdm import tqdm

from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix, roc_curve, f1_score
import joblib

# ---------- Config ----------
SCRIPT_ROOT = "/home/yaya/ai-detect-proj/Script"
SPLITS_JSON = os.path.join(SCRIPT_ROOT, "saved_models", "splits_clip_feature_iid_ood.json")
OUTPUT_DIR  = os.path.join(SCRIPT_ROOT, "saved_models")
os.makedirs(OUTPUT_DIR, exist_ok=True)

RANDOM_SEED = 1337
np.random.seed(RANDOM_SEED)

# 可選：限制每類最大樣本（train/val）；None 表示不限制
CAP_TRAIN_PER_CLASS = None
CAP_VAL_PER_CLASS   = None

# ---------- Helpers ----------
def load_splits(json_path):
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data["splits"], data

def paths_labels_from_split(splits, split_name, key="clip"):
    real = splits[split_name][key]["real"]
    fake = splits[split_name][key]["fake"]
    paths = real + fake
    labels = [0]*len(real) + [1]*len(fake)
    # 固定打亂
    idx = np.random.permutation(len(paths))
    return [paths[i] for i in idx], [labels[i] for i in idx]

def cap_per_class(paths, labels, cap, seed=RANDOM_SEED):
    if cap is None: return paths, labels
    rng = np.random.default_rng(seed)
    idx = np.arange(len(paths))
    idx0 = idx[np.array(labels)==0]
    idx1 = idx[np.array(labels)==1]
    def pick(idxs):
        if len(idxs) <= cap: return idxs
        return rng.choice(idxs, size=cap, replace=False)
    keep = np.concatenate([pick(idx0), pick(idx1)])
    rng.shuffle(keep)
    return [paths[i] for i in keep], [labels[i] for i in keep]

def load_feature_vec(p):
    v = np.asarray(np.load(p, allow_pickle=True), dtype=np.float32).reshape(-1)
    n = np.linalg.norm(v) + 1e-12
    return v / n

def extract_features(paths):
    return np.vstack([load_feature_vec(p) for p in tqdm(paths, desc="Load features")])

def evaluate(clf, X, y, name="Split"):
    if X is None or len(y)==0:
        print(f"[{name}] (empty)"); return float("nan"), float("nan")
    scores = clf.decision_function(X)
    preds  = (scores > 0).astype(int)
    acc = accuracy_score(y, preds)
    try:
        auc = roc_auc_score(y, scores)
    except ValueError:
        auc = float("nan")
    print(f"[{name}] Acc={acc:.4f} | AUC={auc:.4f}")
    print(confusion_matrix(y, preds))
    print(classification_report(y, preds, target_names=["real(0)","fake(1)"], digits=4))
    return acc, auc, scores

# ---------- Load splits ----------
SPLITS, RAW = load_splits(SPLITS_JSON)

train_paths, y_tr = paths_labels_from_split(SPLITS, "train", key="clip")
val_paths,   y_va = paths_labels_from_split(SPLITS, "val",   key="clip")
ti_paths,    y_ti = paths_labels_from_split(SPLITS, "test_iid", key="clip") if "test_iid" in SPLITS else ([], [])
to_paths,    y_to = paths_labels_from_split(SPLITS, "test_ood", key="clip") if "test_ood" in SPLITS else ([], [])

# 可選 cap
train_paths, y_tr = cap_per_class(train_paths, y_tr, CAP_TRAIN_PER_CLASS)
val_paths,   y_va = cap_per_class(val_paths,   y_va, CAP_VAL_PER_CLASS)

print(f"Train={len(train_paths)} (real={sum(np.array(y_tr)==0)}, fake={sum(np.array(y_tr)==1)})")
print(f"Val  ={len(val_paths)} (real={sum(np.array(y_va)==0)}, fake={sum(np.array(y_va)==1)})")
print(f"Test-IID={len(ti_paths)} | Test-OOD={len(to_paths)}")

# ---------- Feature extraction ----------
X_tr = extract_features(train_paths)
X_va = extract_features(val_paths)
feat_dim = X_tr.shape[1]
assert X_va.shape[1] == feat_dim

# ---------- Train Linear SVM ----------
clf = LinearSVC(C=1.0, class_weight="balanced", max_iter=20000, tol=1e-4, dual=False)
print("Fitting LinearSVC...")
clf.fit(X_tr, y_tr)

print("\n=== Eval: Train ===")
acc_tr, auc_tr, scores_tr = evaluate(clf, X_tr, y_tr, "Train")
print("\n=== Eval: Val ===")
acc_va, auc_va, scores_va = evaluate(clf, X_va, y_va, "Val")

# Optional tests
scores_ti = scores_to = None
if ti_paths:
    X_ti = extract_features(ti_paths)
    print("\n=== Eval: Test-IID ===")
    acc_ti, auc_ti, scores_ti = evaluate(clf, X_ti, y_ti, "Test-IID")
if to_paths:
    X_to = extract_features(to_paths)
    print("\n=== Eval: Test-OOD ===")
    acc_to, auc_to, scores_to = evaluate(clf, X_to, y_to, "Test-OOD")

# ---------- Thresholds (from Val) ----------
fpr, tpr, thr = roc_curve(y_va, scores_va)
t_bestJ = thr[(tpr - fpr).argmax()]

def best_f1_threshold(y, s):
    qs = np.quantile(s, np.linspace(0.01, 0.99, 99))
    f1s = [f1_score(y, (s>q).astype(int)) for q in qs]
    return float(qs[int(np.argmax(f1s))])

t_f1 = best_f1_threshold(y_va, scores_va)
print(f"\nVal thresholds → YoudenJ:{t_bestJ:.6f}  |  F1:{t_f1:.6f}")

def eval_with_threshold(name, y, s, t):
    if s is None: return
    pred = (s > t).astype(int)
    acc = accuracy_score(y, pred)
    print(f"[{name}] thr={t:.6f} | acc={acc:.4f}")
    print(confusion_matrix(y, pred))
    print(classification_report(y, pred, target_names=["real(0)","fake(1)"], digits=4))

print("\n-- Eval with Val-YoudenJ threshold --")
eval_with_threshold("Val", y_va, scores_va, t_bestJ)
if scores_ti is not None: eval_with_threshold("Test-IID", y_ti, scores_ti, t_bestJ)
if scores_to is not None: eval_with_threshold("Test-OOD", y_to, scores_to, t_bestJ)

print("\n-- Eval with Val-F1 threshold --")
eval_with_threshold("Val", y_va, scores_va, t_f1)
if scores_ti is not None: eval_with_threshold("Test-IID", y_ti, scores_ti, t_f1)
if scores_to is not None: eval_with_threshold("Test-OOD", y_to, scores_to, t_f1)

# ---------- Save model & meta ----------
stamp = time.strftime("%Y%m%d_%H%M%S")
model_path = os.path.join(OUTPUT_DIR, f"clip_linear_svm_feature_{stamp}.joblib")
joblib.dump(clf, model_path)

meta = {
    "pipeline": "CLIP(precomputed)→LinearSVC",
    "feature_dim": int(feat_dim),
    "counts": {
        "train": len(y_tr), "val": len(y_va),
        "test_iid": len(y_ti), "test_ood": len(y_to)
    },
    "thresholds": {"val_youdenJ": float(t_bestJ), "val_f1": float(t_f1)},
    "splits_json": SPLITS_JSON,
    "created_at": time.strftime("%Y-%m-%d %H:%M:%S"),
    "random_seed": RANDOM_SEED
}
with open(os.path.join(OUTPUT_DIR, f"clip_linear_svm_meta_{stamp}.json"), "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)

print("\n✅ Saved model:", model_path)

# ---------- Quick inference helper ----------
def predict_one_feature_npy(npy_path, clf):
    v = load_feature_vec(npy_path)[None, :]
    score = clf.decision_function(v)[0]    # >0 => fake
    prob  = 1 / (1 + math.exp(-score))
    pred  = int(score > 0)
    return score, prob, pred


Train=112000 (real=56000, fake=56000)
Val  =14000 (real=7000, fake=7000)
Test-IID=14000 | Test-OOD=17570


Load features: 100%|██████████| 112000/112000 [00:43<00:00, 2553.97it/s]
Load features: 100%|██████████| 14000/14000 [00:04<00:00, 2859.89it/s]


Fitting LinearSVC...

=== Eval: Train ===
[Train] Acc=0.9385 | AUC=0.9757
[[53488  2512]
 [ 4379 51621]]
              precision    recall  f1-score   support

     real(0)     0.9243    0.9551    0.9395     56000
     fake(1)     0.9536    0.9218    0.9374     56000

    accuracy                         0.9385    112000
   macro avg     0.9390    0.9385    0.9385    112000
weighted avg     0.9390    0.9385    0.9385    112000


=== Eval: Val ===
[Val] Acc=0.9360 | AUC=0.9738
[[6659  341]
 [ 555 6445]]
              precision    recall  f1-score   support

     real(0)     0.9231    0.9513    0.9370      7000
     fake(1)     0.9497    0.9207    0.9350      7000

    accuracy                         0.9360     14000
   macro avg     0.9364    0.9360    0.9360     14000
weighted avg     0.9364    0.9360    0.9360     14000



Load features: 100%|██████████| 14000/14000 [00:04<00:00, 2866.46it/s]



=== Eval: Test-IID ===
[Test-IID] Acc=0.9381 | AUC=0.9767
[[6677  323]
 [ 544 6456]]
              precision    recall  f1-score   support

     real(0)     0.9247    0.9539    0.9390      7000
     fake(1)     0.9524    0.9223    0.9371      7000

    accuracy                         0.9381     14000
   macro avg     0.9385    0.9381    0.9381     14000
weighted avg     0.9385    0.9381    0.9381     14000



Load features: 100%|██████████| 17570/17570 [00:06<00:00, 2757.45it/s]



=== Eval: Test-OOD ===
[Test-OOD] Acc=0.8618 | AUC=0.9418
[[7558 1227]
 [1202 7583]]
              precision    recall  f1-score   support

     real(0)     0.8628    0.8603    0.8616      8785
     fake(1)     0.8607    0.8632    0.8619      8785

    accuracy                         0.8618     17570
   macro avg     0.8618    0.8618    0.8618     17570
weighted avg     0.8618    0.8618    0.8618     17570


Val thresholds → YoudenJ:0.057070  |  F1:0.039132

-- Eval with Val-YoudenJ threshold --
[Val] thr=0.057070 | acc=0.9370
[[6712  288]
 [ 594 6406]]
              precision    recall  f1-score   support

     real(0)     0.9187    0.9589    0.9383      7000
     fake(1)     0.9570    0.9151    0.9356      7000

    accuracy                         0.9370     14000
   macro avg     0.9378    0.9370    0.9370     14000
weighted avg     0.9378    0.9370    0.9370     14000

[Test-IID] thr=0.057070 | acc=0.9384
[[6723  277]
 [ 586 6414]]
              precision    recall  f1-score   s

In [2]:
# === OOD 分來源診斷（哪個來源最常被誤判） ===
import json, numpy as np
from pathlib import Path
from collections import defaultdict
from sklearn.metrics import accuracy_score, classification_report

SCRIPT_ROOT = "/home/yaya/ai-detect-proj/Script"
SPLITS_JSON = f"{SCRIPT_ROOT}/saved_models/splits_clip_feature_iid_ood.json"
MODEL_PATH  = sorted(Path(f"{SCRIPT_ROOT}/saved_models").glob("clip_linear_svm_feature_*.joblib"))[-1]

import joblib
clf = joblib.load(MODEL_PATH)

with open(SPLITS_JSON, "r", encoding="utf-8") as f:
    J = json.load(f)["splits"]

ood_real = J["test_ood"]["clip"]["real"]
ood_fake = J["test_ood"]["clip"]["fake"]
paths = ood_real + ood_fake
y = np.array([0]*len(ood_real) + [1]*len(ood_fake))

def src_of(stem: str):
    s = stem.lower()
    for k in ["places365","coco2017","midjourney","dalle3","unsplash","flickr30k","imagenet","div2k","flux","sd3"]:
        if s.startswith(k) or (k in s):
            return k
    return "other"

def load_vec(p):
    v = np.asarray(np.load(p, allow_pickle=True), dtype=np.float32).reshape(-1)
    n = np.linalg.norm(v) + 1e-12
    return v/n

X = np.vstack([load_vec(p) for p in paths])
scores = clf.decision_function(X)
pred = (scores>0).astype(int)

by_src = defaultdict(list)
for p, yt, yp in zip(paths, y, pred):
    by_src[src_of(Path(p).stem)].append((yt, yp))

for k, L in sorted(by_src.items(), key=lambda kv:-len(kv[1])):
    yt = np.array([t for t,_ in L]); yp = np.array([p for _,p in L])
    acc = accuracy_score(yt, yp)
    print(f"{k:12s} | n={len(L):5d} | acc={acc:.3f} | real_recall={((yp==0)&(yt==0)).sum()/max((yt==0).sum(),1):.3f} | fake_recall={((yp==1)&(yt==1)).sum()/max((yt==1).sum(),1):.3f}")


coco2017     | n= 8785 | acc=0.860 | real_recall=0.860 | fake_recall=0.000
midjourney   | n= 6376 | acc=0.824 | real_recall=0.000 | fake_recall=0.824
dalle3       | n= 2409 | acc=0.968 | real_recall=0.000 | fake_recall=0.968


In [3]:
# === StandardScaler + PCA(256) + LinearSVC（用你現有 splits） ===
from pathlib import Path
import json, numpy as np, time
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix

SCRIPT_ROOT = "/home/yaya/ai-detect-proj/Script"
SPLITS_JSON = f"{SCRIPT_ROOT}/saved_models/splits_clip_feature_iid_ood.json"
OUTPUT_DIR  = f"{SCRIPT_ROOT}/saved_models"

with open(SPLITS_JSON,"r",encoding="utf-8") as f:
    S = json.load(f)["splits"]

def load_vec(p):
    v = np.asarray(np.load(p, allow_pickle=True), dtype=np.float32).reshape(-1)
    n = np.linalg.norm(v) + 1e-12
    return v/n

def paths_labels(split):
    r = S[split]["clip"]["real"]; f = S[split]["clip"]["fake"]
    Xp = r+f; y = np.array([0]*len(r)+[1]*len(f))
    idx = np.random.permutation(len(Xp))
    return [Xp[i] for i in idx], y[idx]

def feats(paths): return np.vstack([load_vec(p) for p in tqdm(paths, desc="load")])

tr_p,y_tr = paths_labels("train"); va_p,y_va = paths_labels("val")
ti_p,y_ti = paths_labels("test_iid"); to_p,y_to = paths_labels("test_ood")

X_tr = feats(tr_p); X_va = feats(va_p)

scaler = StandardScaler(with_mean=True, with_std=True).fit(X_tr)
X_tr_s = scaler.transform(X_tr); X_va_s = scaler.transform(X_va)

pca_dim = 256
pca = PCA(n_components=pca_dim, random_state=1337).fit(X_tr_s)
X_tr_p = pca.transform(X_tr_s); X_va_p = pca.transform(X_va_s)

clf = LinearSVC(C=1.0, class_weight="balanced", max_iter=20000, dual=False).fit(X_tr_p, y_tr)

def eval_block(name, paths, y):
    if not paths: return
    X = feats(paths); Xs = scaler.transform(X); Xp = pca.transform(Xs)
    sc = clf.decision_function(Xp); pr = (sc>0).astype(int)
    acc = accuracy_score(y, pr)
    auc = roc_auc_score(y, sc)
    print(f"[{name}] Acc={acc:.4f} | AUC={auc:.4f}")
    print(confusion_matrix(y, pr))
    print(classification_report(y, pr, target_names=["real(0)","fake(1)"], digits=4))

print("=== Train ===");   eval_block("Train", tr_p, y_tr)
print("=== Val ===");     eval_block("Val",   va_p, y_va)
print("=== Test-IID ===");eval_block("Test-IID", ti_p, y_ti)
print("=== Test-OOD ===");eval_block("Test-OOD", to_p, y_to)

# 存模型（含 scaler/pca）
import joblib, time
stamp = time.strftime("%Y%m%d_%H%M%S")
joblib.dump({"scaler":scaler,"pca":pca,"svc":clf},
            f"{OUTPUT_DIR}/clip_pca256_svm_{stamp}.joblib")
print("✅ saved:", f"{OUTPUT_DIR}/clip_pca256_svm_{stamp}.joblib")


load: 100%|██████████| 112000/112000 [00:34<00:00, 3257.37it/s]
load: 100%|██████████| 14000/14000 [00:00<00:00, 19363.08it/s]


=== Train ===


load: 100%|██████████| 112000/112000 [00:05<00:00, 21195.03it/s]


[Train] Acc=0.9271 | AUC=0.9701
[[52726  3274]
 [ 4887 51113]]
              precision    recall  f1-score   support

     real(0)     0.9152    0.9415    0.9282     56000
     fake(1)     0.9398    0.9127    0.9261     56000

    accuracy                         0.9271    112000
   macro avg     0.9275    0.9271    0.9271    112000
weighted avg     0.9275    0.9271    0.9271    112000

=== Val ===


load: 100%|██████████| 14000/14000 [00:00<00:00, 24968.77it/s]


[Val] Acc=0.9264 | AUC=0.9691
[[6574  426]
 [ 604 6396]]
              precision    recall  f1-score   support

     real(0)     0.9159    0.9391    0.9274      7000
     fake(1)     0.9376    0.9137    0.9255      7000

    accuracy                         0.9264     14000
   macro avg     0.9267    0.9264    0.9264     14000
weighted avg     0.9267    0.9264    0.9264     14000

=== Test-IID ===


load: 100%|██████████| 14000/14000 [00:00<00:00, 20616.16it/s]


[Test-IID] Acc=0.9289 | AUC=0.9706
[[6607  393]
 [ 602 6398]]
              precision    recall  f1-score   support

     real(0)     0.9165    0.9439    0.9300      7000
     fake(1)     0.9421    0.9140    0.9279      7000

    accuracy                         0.9289     14000
   macro avg     0.9293    0.9289    0.9289     14000
weighted avg     0.9293    0.9289    0.9289     14000

=== Test-OOD ===


load: 100%|██████████| 17570/17570 [00:00<00:00, 20814.11it/s]


[Test-OOD] Acc=0.8542 | AUC=0.9401
[[7207 1578]
 [ 984 7801]]
              precision    recall  f1-score   support

     real(0)     0.8799    0.8204    0.8491      8785
     fake(1)     0.8318    0.8880    0.8590      8785

    accuracy                         0.8542     17570
   macro avg     0.8558    0.8542    0.8540     17570
weighted avg     0.8558    0.8542    0.8540     17570

✅ saved: /home/yaya/ai-detect-proj/Script/saved_models/clip_pca256_svm_20250820_194313.joblib


In [4]:
# === 用 CalibratedClassifierCV 做校準 ===
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC

# 仍承接上一步的 StandardScaler/PCA（或你第一版的 X_tr/X_va 也可）
base_svc = LinearSVC(C=1.0, class_weight="balanced", max_iter=20000, dual=False)
cal = CalibratedClassifierCV(base_svc, method="sigmoid", cv=3)  # 或 method="isotonic"
cal.fit(X_tr_p, y_tr)

def eval_cal(name, paths, y):
    if not paths: return
    X = feats(paths); Xs = scaler.transform(X); Xp = pca.transform(Xs)
    proba = cal.predict_proba(Xp)[:,1]   # 機率：越大越像 fake
    pred  = (proba > 0.5).astype(int)
    acc = accuracy_score(y, pred)
    try:
        auc = roc_auc_score(y, proba)
    except:
        auc = float("nan")
    print(f"[{name}] (calibrated) Acc={acc:.4f} | AUC={auc:.4f}")
    print(confusion_matrix(y, pred))
    print(classification_report(y, pred, target_names=["real(0)","fake(1)"], digits=4))

print("=== Calibrated on PCA256 ===")
eval_cal("Val", va_p, y_va)
eval_cal("Test-IID", ti_p, y_ti)
eval_cal("Test-OOD", to_p, y_to)


KeyboardInterrupt: 