# Pack Features by Modality (CLIP / ELA / PRNU) → Split Volumes → (Optional) Upload to Google Drive

這份 Notebook 會：
- **自動偵測** `Script/` 底下的三種模態目錄（CLIP / ELA / PRNU）與 `real/fake` 類別
- 將每個 **模態 × 類別** 的檔案打成一個「tar+zstd 串流」，並依 `VOLUME_SIZE` 自動**分卷**
- 為每個打包生成 **MANIFEST** 與 **SHA256SUMS**
- （可選）把分卷與校驗檔上傳到 **Google Drive** 指定資料夾

> 不需要事先準備 upload list。所有 `.npy/.npz` 會直接從對應資料夾收集。


In [17]:
# === 依賴安裝（自動處理 PEP 668） ===
import sys, subprocess

def safe_pip_install(pkgs):
    """Install *pkgs* with pip for the interpreter running this notebook.

    A plain ``pip install`` is tried first.  If pip exits with a non-zero
    status the install is retried once with ``--break-system-packages``; a
    failure of that retry propagates as ``subprocess.CalledProcessError``.
    """
    pip_install = [sys.executable, "-m", "pip", "install"]
    try:
        print("[pip] Installing:", pkgs)
        subprocess.check_call(pip_install + [*pkgs])
    except subprocess.CalledProcessError:
        print("[pip] Normal install failed, retry with --break-system-packages ...")
        subprocess.check_call(pip_install + ["--break-system-packages"] + [*pkgs])

# Install the third-party packages imported by the cells below.
safe_pip_install(["zstandard", "tqdm", "pydrive2"])

[pip] Installing: ['zstandard', 'tqdm', 'pydrive2']


## 參數設定
- `SCRIPT_ROOT`: 你的專案根目錄（含 `features_npy/`、`features_i8/`）
- `VOLUME_SIZE`: 分卷大小（例：`"1500M"`, `"2G"`, `"800M"`）
- `PACK_MODALITIES`: 要打包的模態（`"clip"`, `"ela"`, `"prnu"`）
- `PACK_CLASSES`: 要打包的類別（`"real"`, `"fake"`）
- `OUT_DIR`: 產物輸出資料夾（預設 `saved_models/packs`）
- `UPLOAD`: 是否上傳到 Google Drive
- `DRIVE_FOLDER`: Google Drive 目的資料夾（位於 **我的雲端硬碟**）
- `CLIENT_SECRETS`: Google OAuth 的 `client_secrets.json`（第一次會跳授權）

> 會自動從 **候選路徑** 中選「第一個存在」的資料夾：
> - CLIP：`features_npy/clip_real_npy`、`features_npy/clip_fake_npy`
> - ELA：`features_npy/ela_*`（會優先選 `ela_*_128_npz`，若沒有才用 `ela_*_npy`）
> - PRNU：優先 `features_i8/prnu_*_i8_npy` → `features_npy/prnu_*_i8_npy` → `features_npy/prnu_*_npy`


In [1]:
# === Edit these settings ===
SCRIPT_ROOT = "/home/yaya/ai-detect-proj/Script"
VOLUME_SIZE = "1500M"

PACK_MODALITIES = ["prnu"]   # can be changed, e.g.: ["ela","prnu"]
PACK_CLASSES    = ["real", "fake"]          # can be changed to: ["real"] or ["fake"]

OUT_DIR = None  # None -> defaults to Script/saved_models/packs
UPLOAD = True  # whether to upload to Google Drive
DRIVE_FOLDER = "ai-detect-proj/features"
CLIENT_SECRETS = "./client_secrets.json"  # keep it in the current directory, or give an absolute path

In [9]:
# === 工具 & 打包實作 ===
import os, sys, time, tarfile, hashlib, json
from pathlib import Path
from tqdm import tqdm
import zstandard as zstd

try:
    from pydrive2.auth import GoogleAuth
    from pydrive2.drive import GoogleDrive
    _HAS_PYDRIVE2 = True
except Exception:
    _HAS_PYDRIVE2 = False

def parse_size(s: str) -> int:
    """Convert a human-readable size such as ``"1500M"`` into a byte count.

    Recognised unit letters are ``K``, ``M``, ``G`` and ``T`` (case-insensitive,
    powers of 1024).  An optional trailing ``B`` or ``iB`` is accepted, so
    ``"2G"``, ``"2GB"`` and ``"2GiB"`` are equivalent.  A value without a unit
    letter is taken as a plain integer number of bytes.

    Args:
        s: Size text (surrounding whitespace is ignored) or an integer.

    Returns:
        The size in bytes, truncated to an integer.

    Raises:
        ValueError: If the numeric part cannot be parsed.
    """
    multipliers = {"K": 1024, "M": 1024**2, "G": 1024**3, "T": 1024**4}
    text = str(s).strip().upper()

    # Reduce "2GIB" / "2GB" / "512B" to "2G" / "2G" / "512".
    if len(text) > 2 and text.endswith("IB") and text[-3] in multipliers:
        text = text[:-2]
    elif text.endswith("B"):
        text = text[:-1].rstrip()

    if text and text[-1] in multipliers:
        return int(float(text[:-1]) * multipliers[text[-1]])
    return int(text)

def human_bytes(n: int) -> str:
    """Format a byte count with a binary unit (B, KiB, MiB, GiB, TiB, PiB).

    The value is divided by 1024 until its magnitude drops below 1024 and is
    then rendered with one decimal place; anything still too large after TiB
    is reported in PiB.
    """
    units = ("B", "KiB", "MiB", "GiB", "TiB")
    value = n
    idx = 0
    while idx < len(units):
        if abs(value) < 1024.0:
            return f"{value:3.1f} {units[idx]}"
        value = value / 1024.0
        idx += 1
    return f"{value:.1f} PiB"

def pick_first_existing(paths):
    """Return the first entry of *paths* that is an existing directory.

    Falsy entries (``None``, ``""``) are skipped.  ``None`` is returned when
    no candidate qualifies.
    """
    existing = (p for p in paths if p and os.path.isdir(p))
    return next(existing, None)

def list_feature_files(folder, exts=(".npy", ".npz", ".npy.zst")):
    """List the feature files that sit directly inside *folder*.

    Args:
        folder: Directory to scan (not recursive).  A falsy value yields ``[]``.
        exts: File-name suffixes to collect.  ``.npz`` is part of the default
            so that folders holding ``.npz`` archives are not reported as empty.

    Returns:
        Sorted list of matching paths as strings, without duplicates even if
        the suffixes in *exts* overlap.
    """
    if not folder:
        return []
    root = Path(folder)
    # A set guards against one file matching several of the given suffixes.
    found = {str(p) for ext in exts for p in root.glob(f"*{ext}")}
    return sorted(found)

class SplitWriter:
    """Write-only, file-like sink that splits a byte stream into fixed-size volumes.

    Volumes are written next to each other as ``<base_path>.part-0001``,
    ``<base_path>.part-0002``, ...  Each volume holds at most ``volume_size``
    bytes and has its own SHA-256 digest.  The first volume is opened on
    construction; further volumes are opened lazily when more data arrives,
    so a stream that ends exactly on a boundary does not leave an empty
    trailing volume.

    After :meth:`close`, ``parts_info`` holds one
    ``{"path": str, "size": int, "sha256": str}`` dict per volume, in order.
    The object can also be used as a context manager.
    """

    def __init__(self, base_path: Path, volume_size: int):
        """Open the first volume.

        Args:
            base_path: Path prefix; ``.part-NNNN`` is appended for each volume.
            volume_size: Maximum size of one volume in bytes (must be > 0).

        Raises:
            ValueError: If ``volume_size`` is not positive.  A non-positive
                size would make :meth:`write` loop forever.
        """
        if volume_size <= 0:
            raise ValueError(
                f"volume_size must be a positive byte count, got {volume_size!r}"
            )
        self.base_path = Path(base_path)
        self.volume_size = volume_size
        self.part_idx = 0            # 1-based index of the volume currently open
        self.cur_fp = None           # file object of the open volume, None once closed
        self.cur_path = None         # path of the open volume
        self.cur_written = 0         # bytes written to the open volume so far
        self.sha256 = hashlib.sha256()  # running digest of the open volume
        self.parts_info = []         # metadata of every finished volume
        self._open_next()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()

    def _finish_current(self):
        """Close the open volume (if any) and record its metadata."""
        if self.cur_fp is None:
            return
        self.cur_fp.flush()
        self.cur_fp.close()
        self.parts_info.append({
            "path": str(self.cur_path),
            "size": self.cur_written,
            "sha256": self.sha256.hexdigest()
        })
        self.cur_fp = None

    def _open_next(self):
        """Finish the current volume and start the next one with a fresh digest."""
        self._finish_current()
        self.part_idx += 1
        self.cur_path = Path(f"{self.base_path}.part-{self.part_idx:04d}")
        self.cur_fp = open(self.cur_path, "wb")
        self.cur_written = 0
        self.sha256 = hashlib.sha256()

    def write(self, data: bytes):
        """Append *data*, rolling over to new volumes as each one fills up.

        Returns:
            The number of bytes consumed (always all of *data*).
        """
        # View the buffer as raw bytes so lengths and slices count bytes, not items.
        mv = memoryview(data).cast("B")
        total = len(mv)
        offset = 0
        while offset < total:
            room = self.volume_size - self.cur_written
            if room == 0:
                self._open_next()
                room = self.volume_size
            chunk = mv[offset:offset + min(total - offset, room)]
            self.cur_fp.write(chunk)
            self.sha256.update(chunk)
            self.cur_written += len(chunk)
            offset += len(chunk)
        return total

    def flush(self):
        """Flush the open volume to the OS, if one is open."""
        if self.cur_fp is not None:
            self.cur_fp.flush()

    def close(self):
        """Finish the last volume.  Calling it again is a no-op."""
        self._finish_current()

def ensure_folder(drive, parent_id, name):
    """Return the id of the folder titled *name* under *parent_id*, creating it if needed.

    The folder is looked up with a ``ListFile`` query restricted to
    non-trashed folders whose parents include *parent_id*.  When several
    match, the first result is used; when none match, a new folder is
    created and uploaded.
    """
    query = f'title="{name}" and mimeType="application/vnd.google-apps.folder" and trashed=false and "{parent_id}" in parents'
    matches = drive.ListFile({'q': query}).GetList()
    if not matches:
        metadata = {
            'title': name,
            'mimeType': 'application/vnd.google-apps.folder',
            'parents': [{'id': parent_id}]
        }
        created = drive.CreateFile(metadata)
        created.Upload()
        return created['id']
    return matches[0]['id']

def upload_files_to_drive(local_paths, drive_folder_path, client_secrets="client_secrets.json"):
    """Upload local files into a (possibly nested) Google Drive folder.

    Credentials are loaded from ``token.json`` in the working directory.  With
    no stored credentials a local-webserver OAuth flow is started; an expired
    token is refreshed; otherwise the stored credentials are authorized.  The
    resulting credentials are saved back to ``token.json``.

    Args:
        local_paths: Iterable of local file paths to upload, in order.
        drive_folder_path: ``/``-separated folder path below the Drive root.
            Each segment is looked up or created via ``ensure_folder``.  Empty
            or whitespace-only segments (leading/trailing/double slashes) are
            ignored; an empty path uploads into the root.
        client_secrets: Path to the OAuth client secrets JSON file.

    Raises:
        RuntimeError: If pydrive2 could not be imported.
    """
    if not _HAS_PYDRIVE2:
        raise RuntimeError("未安裝 pydrive2，請先安裝")
    gauth = GoogleAuth()
    gauth.LoadClientConfigFile(client_secrets)
    gauth.LoadCredentialsFile("token.json")
    if gauth.credentials is None:
        gauth.LocalWebserverAuth()
    elif gauth.access_token_expired:
        gauth.Refresh()
    else:
        gauth.Authorize()
    gauth.SaveCredentialsFile("token.json")

    drive = GoogleDrive(gauth)
    parent = 'root'
    # Skip blank segments so "a//b", "/a/b/" or " a/b " never create a
    # folder with an empty name.
    segments = [seg for seg in drive_folder_path.strip().split("/") if seg.strip()]
    for seg in segments:
        parent = ensure_folder(drive, parent, seg)

    for p in local_paths:
        f = drive.CreateFile({'title': Path(p).name, 'parents': [{'id': parent}]})
        f.SetContentFile(p)
        f.Upload()
        print(f"[GDrive] Uploaded: {p}")

def pack_feature_set(files, pack_base_path: Path, volume_size_str: str, zstd_level=6):
    """Pack *files* into a zstd-compressed tar stream split into volumes.

    The archive is written as ``<pack_base_path>.tar.zst.part-0001``, ... and
    is accompanied by a JSON manifest and a ``SHA256SUMS`` text file in the
    same directory.

    Args:
        files: Paths of the files to pack; entries that are not regular files
            are dropped.
        pack_base_path: Output path prefix (directory + pack name, no suffix).
        volume_size_str: Maximum volume size, parsed by ``parse_size``.
        zstd_level: zstd compression level.

    Returns:
        ``None`` when no input file exists, otherwise a dict with the keys
        ``"parts"`` (list of volume paths), ``"manifest"`` and ``"sums"``.
    """
    files = [Path(f).resolve() for f in files if Path(f).is_file()]
    if not files:
        print(f"[SKIP] No files for {pack_base_path.name}")
        return None

    vol_bytes = parse_size(volume_size_str)
    part_prefix = f"{pack_base_path}.tar.zst"
    out_dir = pack_base_path.parent
    out_dir.mkdir(parents=True, exist_ok=True)

    total_src_bytes = sum(p.stat().st_size for p in files)
    print(f"== 打包 {pack_base_path.name} ==")
    print(f"  檔案數：{len(files)}  原始容量：{human_bytes(total_src_bytes)}")
    print(f"  分卷大小：{volume_size_str}  輸出：{part_prefix}.part-0001 ...")

    splitter = SplitWriter(Path(part_prefix), vol_bytes)
    try:
        zc = zstd.ZstdCompressor(level=zstd_level, threads=-1)
        with zc.stream_writer(splitter) as zfh:
            with tarfile.open(mode="w|", fileobj=zfh, format=tarfile.GNU_FORMAT) as tar:
                # The context manager closes the progress bar even if add() fails.
                with tqdm(total=len(files), desc="Packing", unit="file") as pbar:
                    for src in files:
                        # Store members under their bare file name so no
                        # absolute path ends up inside the archive.
                        tar.add(str(src), arcname=src.name, recursive=False)
                        pbar.update(1)
    finally:
        # Always release the open volume, also when compression setup or
        # packing raised; closing an already-closed splitter does nothing.
        splitter.close()

    # Write the manifest and the checksum list.
    ts = time.strftime("%Y%m%d_%H%M%S")
    parts_info = splitter.parts_info
    manifest = {
        "pack_name": pack_base_path.name,
        "created_at": ts,
        "out_dir": str(out_dir),
        "volume_size": volume_size_str,
        "total_source_files": len(files),
        "total_source_bytes": total_src_bytes,
        "parts": parts_info,
    }
    manifest_path = out_dir / f"{pack_base_path.name}_MANIFEST_{ts}.json"
    sums_path = out_dir / f"{pack_base_path.name}_SHA256SUMS_{ts}.txt"
    with open(manifest_path, "w", encoding="utf-8") as f:
        json.dump(manifest, f, ensure_ascii=False, indent=2)
    with open(sums_path, "w", encoding="utf-8") as f:
        # "<digest>  <file name>" per line.
        for pi in parts_info:
            f.write(f"{pi['sha256']}  {os.path.basename(pi['path'])}\n")

    print("== 完成：")
    # Only the first ten volumes are listed to keep the output short.
    for pi in parts_info[:10]:
        print(f"  {os.path.basename(pi['path'])}  {human_bytes(pi['size'])}  sha256={pi['sha256'][:12]}...")
    if len(parts_info) > 10:
        print(f"  ...（共 {len(parts_info)} 卷）")
    print(f"  MANIFEST：{manifest_path.name}")
    print(f"  SHA256SUMS：{sums_path.name}")

    return {
        "parts": [pi["path"] for pi in parts_info],
        "manifest": str(manifest_path),
        "sums": str(sums_path),
    }

In [7]:
# === 自動偵測各模態的資料夾 ===
from pathlib import Path

# Candidate feature folders per modality and class, in priority order.
# The first folder that exists on disk is the one that gets packed.
CANDIDATES = {
    "clip": {
        "real": [f"{SCRIPT_ROOT}/features_npy/clip_real_npy"],
        "fake": [f"{SCRIPT_ROOT}/features_npy/clip_fake_npy"],
    },
    "ela": {
        "real": [f"{SCRIPT_ROOT}/features_npy/ela_real_128_npz",
                 f"{SCRIPT_ROOT}/features_npy/ela_real_npy"],
        "fake": [f"{SCRIPT_ROOT}/features_npy/ela_fake_128_npz",
                 f"{SCRIPT_ROOT}/features_npy/ela_fake_npy"],
    },
    "prnu": {
        # Same three fallbacks for both classes; the "real" list previously
        # repeated prnu_real_npy instead of naming the i8 folder.
        "real": [f"{SCRIPT_ROOT}/features_i8/prnu_real_i8_npy",
                 f"{SCRIPT_ROOT}/features_npy/prnu_real_i8_npy",
                 f"{SCRIPT_ROOT}/features_npy/prnu_real_npy"],
        "fake": [f"{SCRIPT_ROOT}/features_i8/prnu_fake_i8_npy",
                 f"{SCRIPT_ROOT}/features_npy/prnu_fake_i8_npy",
                 f"{SCRIPT_ROOT}/features_npy/prnu_fake_npy"],
    },
}

# chosen[modality][class] -> selected folder, or None if no candidate exists.
chosen = {}
for mod in PACK_MODALITIES:
    chosen[mod] = {}
    for cls in PACK_CLASSES:
        folder = pick_first_existing(CANDIDATES.get(mod, {}).get(cls, []))
        chosen[mod][cls] = folder

print("== 選用資料夾 ==")
for mod in chosen:
    for cls in chosen[mod]:
        print(f"- {mod}/{cls}:", chosen[mod][cls])

== 選用資料夾 ==
- prnu/real: /home/yaya/ai-detect-proj/Script/features_i8/prnu_real_i8_npy
- prnu/fake: /home/yaya/ai-detect-proj/Script/features_i8/prnu_fake_i8_npy


In [10]:
# === 預覽各組將要打包的數量與容量 ===
def dir_size(paths):
    """Return the combined size in bytes of the files listed in *paths*.

    Files whose size cannot be read (missing, permission denied, ...) are
    skipped, so the result is a best-effort estimate.  Only ``OSError`` is
    suppressed; interrupts such as ``KeyboardInterrupt`` propagate.
    """
    import os
    total = 0
    for p in paths:
        try:
            total += os.path.getsize(p)
        except OSError:
            continue
    return total

# preview[(modality, class)] -> (list of files, total size in bytes);
# groups without any feature file are reported and left out.
preview = {}
for mod in PACK_MODALITIES:
    for cls in PACK_CLASSES:
        folder = chosen.get(mod, {}).get(cls)
        files = list_feature_files(folder)
        if files:
            size = dir_size(files)
            print(f"{mod}/{cls}: files={len(files):>7} size≈{human_bytes(size)} folder={folder}")
            preview[(mod,cls)] = (files, size)
        else:
            print(f"[WARN] 無檔案：{mod}/{cls}")

prnu/real: files= 147511 size≈5.3 GiB folder=/home/yaya/ai-detect-proj/Script/features_i8/prnu_real_i8_npy
prnu/fake: files= 139000 size≈4.2 GiB folder=/home/yaya/ai-detect-proj/Script/features_i8/prnu_fake_i8_npy


In [11]:
# === 逐組打包（模態×類別） ===
# One timestamp is shared by every pack produced in this run.
ts = time.strftime("%Y%m%d_%H%M%S")
if OUT_DIR:
    OUT_BASE = Path(OUT_DIR)
else:
    OUT_BASE = Path(SCRIPT_ROOT) / "saved_models" / "packs"
OUT_BASE.mkdir(parents=True, exist_ok=True)

# all_outputs[(modality, class)] -> result dict returned by pack_feature_set.
all_outputs = {}

for (mod, cls), (files, size) in preview.items():
    pack_name = f"pack_{mod}_{cls}_{ts}"
    pack_base = OUT_BASE / pack_name
    out = pack_feature_set(files, pack_base, VOLUME_SIZE, zstd_level=6)
    if not out:
        # Nothing was packed for this group.
        continue
    all_outputs[(mod,cls)] = out

print("\n== Summary of outputs ==")
for k, out in all_outputs.items():
    mod, cls = k
    manifest_name = Path(out['manifest']).name
    print(f"- {mod}/{cls}: parts={len(out['parts'])} manifest={manifest_name}")

== 打包 pack_prnu_real_20250819_003208 ==
  檔案數：147511  原始容量：5.3 GiB
  分卷大小：1500M  輸出：/home/yaya/ai-detect-proj/Script/saved_models/packs/pack_prnu_real_20250819_003208.tar.zst.part-0001 ...


Packing: 100%|██████████| 147511/147511 [01:46<00:00, 1380.62file/s]


== 完成：
  pack_prnu_real_20250819_003208.tar.zst.part-0001  1.5 GiB  sha256=3d955e53e7a5...
  pack_prnu_real_20250819_003208.tar.zst.part-0002  1.5 GiB  sha256=2784e9999fa1...
  pack_prnu_real_20250819_003208.tar.zst.part-0003  1.5 GiB  sha256=ced38b74aa1a...
  pack_prnu_real_20250819_003208.tar.zst.part-0004  950.2 MiB  sha256=cd6c50f4fac1...
  MANIFEST：pack_prnu_real_20250819_003208_MANIFEST_20250819_003400.json
  SHA256SUMS：pack_prnu_real_20250819_003208_SHA256SUMS_20250819_003400.txt
== 打包 pack_prnu_fake_20250819_003208 ==
  檔案數：139000  原始容量：4.2 GiB
  分卷大小：1500M  輸出：/home/yaya/ai-detect-proj/Script/saved_models/packs/pack_prnu_fake_20250819_003208.tar.zst.part-0001 ...


Packing: 100%|██████████| 139000/139000 [01:55<00:00, 1203.40file/s]

== 完成：
  pack_prnu_fake_20250819_003208.tar.zst.part-0001  1.5 GiB  sha256=532e71ddb85c...
  pack_prnu_fake_20250819_003208.tar.zst.part-0002  1.5 GiB  sha256=bccb2cec1f05...
  pack_prnu_fake_20250819_003208.tar.zst.part-0003  1.2 GiB  sha256=32415e5cad0d...
  MANIFEST：pack_prnu_fake_20250819_003208_MANIFEST_20250819_003604.json
  SHA256SUMS：pack_prnu_fake_20250819_003208_SHA256SUMS_20250819_003604.txt

== Summary of outputs ==
- prnu/real: parts=4 manifest=pack_prnu_real_20250819_003208_MANIFEST_20250819_003400.json
- prnu/fake: parts=3 manifest=pack_prnu_fake_20250819_003208_MANIFEST_20250819_003604.json





In [25]:
...# === （可選）上傳 Google Drive ===
if not UPLOAD:
    print("（UPLOAD=False，略過上傳）")
else:
    # For each pack: its volumes first, then the manifest and checksum file.
    to_upload = []
    for out in all_outputs.values():
        to_upload.extend(out["parts"])
        to_upload.extend([out["manifest"], out["sums"]])
    print(f"== 上傳至 Google Drive: {DRIVE_FOLDER} ==")
    upload_files_to_drive(to_upload, DRIVE_FOLDER, client_secrets=CLIENT_SECRETS)
    print("✅ 上傳完成")

== 上傳至 Google Drive: ai-detect-proj/features ==
Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?client_id=412129929391-c6m34kv6b6hli92ng5psss4gremv2fut.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&access_type=online&response_type=code



gio: https://accounts.google.com/o/oauth2/auth?client_id=412129929391-c6m34kv6b6hli92ng5psss4gremv2fut.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&access_type=online&response_type=code: Operation not supported


Authentication successful.
[GDrive] Uploaded: /home/yaya/ai-detect-proj/Script/saved_models/packs/pack_ela_real_20250816_233151.tar.zst.part-0001
[GDrive] Uploaded: /home/yaya/ai-detect-proj/Script/saved_models/packs/pack_ela_real_20250816_233151.tar.zst.part-0002
[GDrive] Uploaded: /home/yaya/ai-detect-proj/Script/saved_models/packs/pack_ela_real_20250816_233151_MANIFEST_20250816_233331.json
[GDrive] Uploaded: /home/yaya/ai-detect-proj/Script/saved_models/packs/pack_ela_real_20250816_233151_SHA256SUMS_20250816_233331.txt
[GDrive] Uploaded: /home/yaya/ai-detect-proj/Script/saved_models/packs/pack_ela_fake_20250816_233151.tar.zst.part-0001


KeyboardInterrupt: 