# Pack & Upload (Split Tar+Zstd → Google Drive)

這份 notebook 會：

1. 安裝需要的套件（`zstandard`, `tqdm`, `pydrive2`）。
2. 以「**tar 串流 + zstd**」壓縮大量檔案，並**自動分卷**（例：1.5GB/卷）。
3. 產生 **SHA256SUMS** 與 **MANIFEST**（紀錄每一卷的大小與雜湊）。
4. （可選）使用 **PyDrive2** 上傳到 **Google Drive** 指定資料夾。

> 若你的系統啟用了 PEP 668（阻擋 pip 寫入系統 site-packages），
> 下面的安裝步驟會自動使用 `--break-system-packages` 做 fallback。

In [None]:
# === 依賴安裝（自動處理 PEP 668 的限制） ===
import sys, subprocess

def safe_pip_install(pkgs):
    """Install *pkgs* with pip for the interpreter running this notebook.

    A plain ``pip install`` is attempted first. If pip exits with a non-zero
    status, the install is retried once with ``--break-system-packages``; a
    failure of that retry propagates as ``subprocess.CalledProcessError``.
    """
    pip_install = [sys.executable, "-m", "pip", "install"]
    print("[pip] Installing:", pkgs)
    try:
        subprocess.check_call([*pip_install, *pkgs])
    except subprocess.CalledProcessError:
        print("[pip] Normal install failed, retry with --break-system-packages ...")
        subprocess.check_call([*pip_install, "--break-system-packages", *pkgs])

# Install the third-party packages used by the cells below.
safe_pip_install(["zstandard", "tqdm", "pydrive2"])

In [1]:
# === 核心工具（分卷 writer、打包、校驗、上傳） ===
import os, sys, time, json, tarfile, hashlib, argparse
from pathlib import Path
from tqdm import tqdm
import zstandard as zstd

# PyDrive2 is only needed by the upload helper, so a failed import is recorded
# in _HAS_PYDRIVE2 instead of aborting the cell.
try:
    from pydrive2.auth import GoogleAuth
    from pydrive2.drive import GoogleDrive
    _HAS_PYDRIVE2 = True
except Exception:  # any import-time failure simply disables uploading
    _HAS_PYDRIVE2 = False

def parse_size(s: str) -> int:
    """Convert a human-readable size such as ``"1500M"`` into a byte count.

    Units are binary (1 K = 1024 bytes) and case-insensitive. Besides the bare
    ``K`` / ``M`` / ``G`` letters, ``T`` and the common ``B`` / ``iB``
    spellings (``"1.5GB"``, ``"2GiB"``, ``"100B"``) are accepted. Text without
    a unit is read as a plain integer number of bytes.

    Args:
        s: Size text; surrounding whitespace is ignored.

    Returns:
        The size in bytes, truncated to an integer.

    Raises:
        ValueError: If the numeric part cannot be parsed.
    """
    text = s.strip().upper()
    # Drop an optional trailing "IB" / "B" so "GIB", "GB" and "G" all match.
    for tail in ("IB", "B"):
        if text.endswith(tail):
            text = text[: -len(tail)].rstrip()
            break
    multipliers = {"K": 1024, "M": 1024**2, "G": 1024**3, "T": 1024**4}
    unit = text[-1:]
    if unit in multipliers:
        return int(float(text[:-1]) * multipliers[unit])
    return int(text)

def human_bytes(n: int) -> str:
    """Format a byte count with one decimal and a binary unit (B .. PiB).

    The value is divided by 1024 until its magnitude drops below 1024 or the
    largest unit, PiB, is reached.
    """
    value = n
    for unit in ("B", "KiB", "MiB", "GiB", "TiB"):
        if -1024.0 < value < 1024.0:
            return f"{value:.1f} {unit}"
        value = value / 1024.0
    return f"{value:.1f} PiB"

class SplitWriter:
    """Write-only file-like object that splits a byte stream into volumes.

    Bytes passed to :meth:`write` go to ``<base_path>.part-0001``,
    ``<base_path>.part-0002``, ... Each volume holds at most ``volume_size``
    bytes. The next volume is opened lazily, only when more data arrives after
    the current one is full, so no empty trailing volume is produced. A
    SHA-256 digest is computed per volume and recorded in ``parts_info``.
    """

    def __init__(self, base_path: Path, volume_size: int):
        """Open the first volume.

        Args:
            base_path: Path prefix; ``.part-NNNN`` is appended for each volume.
            volume_size: Maximum number of bytes per volume; must be positive.

        Raises:
            ValueError: If ``volume_size`` is not positive.
        """
        if volume_size <= 0:
            # A volume with no capacity can never accept data: write() would
            # keep opening new empty files without making progress.
            raise ValueError(f"volume_size must be positive, got {volume_size!r}")
        self.base_path = Path(base_path)
        self.volume_size = volume_size
        self.part_idx = 0          # 1-based index of the current volume
        self.cur_fp = None         # open file object of the current volume
        self.cur_path = None       # path of the current volume
        self.cur_written = 0       # bytes written to the current volume
        self.sha256 = hashlib.sha256()  # digest of the current volume
        self.parts_info = []       # one {"path", "size", "sha256"} dict per finished volume
        self._open_next()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()
        return False

    def _finish_part(self):
        """Close the current volume and record its path, size and digest."""
        self.cur_fp.flush()
        self.cur_fp.close()
        self.cur_fp = None
        self.parts_info.append({
            "path": str(self.cur_path),
            "size": self.cur_written,
            "sha256": self.sha256.hexdigest()
        })

    def _open_next(self):
        """Finish the current volume, if any, and start the next one."""
        if self.cur_fp:
            self._finish_part()
        self.part_idx += 1
        self.cur_path = Path(f"{self.base_path}.part-{self.part_idx:04d}")
        self.cur_fp = open(self.cur_path, "wb")
        self.cur_written = 0
        self.sha256 = hashlib.sha256()

    def write(self, data: bytes) -> int:
        """Write *data*, rolling over to new volumes as needed.

        Returns:
            The number of bytes consumed (always all of *data*).

        Raises:
            ValueError: If the writer has already been closed.
        """
        if self.cur_fp is None:
            raise ValueError("write to closed SplitWriter")
        view = memoryview(data)
        if view.ndim != 1 or view.itemsize != 1:
            # Count and slice in bytes, not in multi-byte elements.
            view = view.cast("B")
        total = len(view)
        offset = 0
        while offset < total:
            room = self.volume_size - self.cur_written
            if room <= 0:
                self._open_next()
                room = self.volume_size
            chunk = view[offset:offset + min(room, total - offset)]
            self.cur_fp.write(chunk)
            self.sha256.update(chunk)
            self.cur_written += len(chunk)
            offset += len(chunk)
        return total

    def flush(self):
        """Flush the current volume's buffer, if a volume is open."""
        if self.cur_fp:
            self.cur_fp.flush()

    def close(self):
        """Finish the last volume. Calling this more than once is harmless."""
        if self.cur_fp:
            self._finish_part()

    def writable(self):
        return True

def load_filelist(path: Path):
    """Read a newline-separated list of file paths and keep those that exist.

    Blank lines are skipped. Each remaining line is expanded (``~``) and
    resolved to an absolute path. Entries that are not regular files are
    reported on stderr and counted as missing.

    Args:
        path: Text file with one path per line, UTF-8 encoded (a leading BOM
            is tolerated).

    Returns:
        A tuple ``(files, total_bytes, missing)``: the resolved paths in list
        order, the sum of their sizes, and the number of ignored entries.
    """
    files = []
    missing = 0
    total_bytes = 0
    # "utf-8-sig" drops a leading BOM, which would otherwise become part of
    # the first path and make that file look missing.
    with open(path, "r", encoding="utf-8-sig") as f:
        for line in f:
            entry = line.strip()
            if not entry:
                continue
            ap = Path(entry).expanduser().resolve()
            if not ap.is_file():
                print(f"[WARN] 檔案不存在，忽略：{entry}", file=sys.stderr)
                missing += 1
                continue
            files.append(ap)
            try:
                total_bytes += ap.stat().st_size
            except OSError as exc:
                # The total is informational only: keep the file, but report
                # the failed stat instead of hiding it.
                print(f"[WARN] 無法取得檔案大小：{ap}（{exc}）", file=sys.stderr)
    return files, total_bytes, missing

def common_base(files):
    """Return the directory that archive member names are made relative to.

    Normally this is the longest common path of *files*. When every entry is
    the same file, that common path is the file itself; its parent directory
    is returned instead so that a relative member name does not collapse to
    ``"."``.

    Args:
        files: Iterable of paths (``str`` or ``Path``).

    Returns:
        The base directory as a ``Path``.
    """
    paths = [str(p) for p in files]
    try:
        base = Path(os.path.commonpath(paths))
    except ValueError:
        # commonpath rejects empty input, mixed absolute/relative paths and
        # paths on different drives; fall back to the character-wise prefix.
        return Path(os.path.commonprefix(paths)).resolve().parent
    if any(Path(p) == base for p in paths):
        return base.parent
    return base

def build_manifest(pack_name, out_dir: Path, filelist_path: Path,
                   parts_info, total_src_files, total_src_bytes, volume_size):
    """Assemble the manifest dictionary describing one packing run.

    ``created_at`` is the local time of the call, formatted as
    ``YYYYmmdd_HHMMSS``. ``out_dir`` and ``filelist_path`` are stored as
    strings; all other arguments are stored unchanged.
    """
    return dict(
        pack_name=pack_name,
        created_at=time.strftime("%Y%m%d_%H%M%S"),
        filelist=str(filelist_path),
        out_dir=str(out_dir),
        volume_size=volume_size,
        total_source_files=total_src_files,
        total_source_bytes=total_src_bytes,
        parts=parts_info,
    )

def ensure_folder(drive, parent_id, name):
    """Return the id of folder *name* under *parent_id*, creating it if absent.

    Args:
        drive: A PyDrive2 ``GoogleDrive`` instance.
        parent_id: Id of the parent folder (``'root'`` for My Drive).
        name: Title of the folder to look up or create.

    Returns:
        The id of the first matching, non-trashed folder, or of the newly
        created one.
    """
    def _quote(value):
        # Drive query literals are single-quoted; backslashes and quotes in
        # the value must be escaped or the query breaks / changes meaning.
        escaped = str(value).replace("\\", "\\\\").replace("'", "\\'")
        return f"'{escaped}'"

    q = (
        f"title={_quote(name)} "
        "and mimeType='application/vnd.google-apps.folder' "
        f"and trashed=false and {_quote(parent_id)} in parents"
    )
    flist = drive.ListFile({'q': q}).GetList()
    if flist:
        return flist[0]['id']
    folder = drive.CreateFile({
        'title': name,
        'mimeType': 'application/vnd.google-apps.folder',
        'parents': [{'id': parent_id}]
    })
    folder.Upload()
    return folder['id']

def upload_files_to_drive(local_paths, drive_folder_path, client_secrets="client_secrets.json",
                          token_file="token.json"):
    """Upload local files into a Google Drive folder, creating it if needed.

    Credentials cached in *token_file* are reused. When none are stored, a
    browser-based OAuth flow is started; an expired access token is refreshed.
    The credentials are saved back to *token_file* afterwards.

    Args:
        local_paths: Iterable of local file paths (``str`` or ``Path``).
        drive_folder_path: ``/``-separated folder path below My Drive. Empty
            segments and surrounding whitespace are ignored; an empty path
            uploads to the Drive root.
        client_secrets: Path to the OAuth ``client_secrets.json`` file.
        token_file: Path of the credential cache file.

    Raises:
        RuntimeError: If PyDrive2 could not be imported.
    """
    if not _HAS_PYDRIVE2:
        raise RuntimeError("未安裝 PyDrive2，請先安裝 pydrive2 套件")
    gauth = GoogleAuth()
    gauth.LoadClientConfigFile(client_secrets)
    gauth.LoadCredentialsFile(token_file)
    if gauth.credentials is None:
        gauth.LocalWebserverAuth()
    elif gauth.access_token_expired:
        gauth.Refresh()
    else:
        gauth.Authorize()
    gauth.SaveCredentialsFile(token_file)

    drive = GoogleDrive(gauth)
    parent = 'root'
    # Skip empty segments ("a//b", leading/trailing "/") so that no folder
    # with an empty title is ever looked up or created.
    for seg in (drive_folder_path or "").strip().split("/"):
        if seg:
            parent = ensure_folder(drive, parent, seg)

    for p in local_paths:
        local = str(p)  # accept Path objects as well as strings
        f = drive.CreateFile({'title': Path(local).name, 'parents': [{'id': parent}]})
        f.SetContentFile(local)
        f.Upload()
        print(f"[GDrive] Uploaded: {p}")

## 參數設定

- `FILELIST`: 每行一個**絕對路徑**檔案（建議用你的產生清單腳本）  
- `PACK_NAME`: 分卷前綴名（不含副檔名）  
- `VOLUME_SIZE`: 每卷大小，例：`"1500M"`、`"2G"`、`"800M"`（單位以 1024 為底：K/M/G；純數字代表位元組）  
- `OUT_DIR`: 產物輸出目錄（預設在清單旁的 `packs/`）  
- `UPLOAD`: 是否上傳到 Google Drive  
- `DRIVE_FOLDER`: Drive 目的資料夾（於 **我的雲端硬碟** 下的路徑）  
- `CLIENT_SECRETS`: Google OAuth `client_secrets.json` 路徑

In [None]:
# === Edit these settings ===
FILELIST = "/home/yaya/ai-detect-proj/Script/upload_lists/filelist_prnu.txt"
PACK_NAME = "pack_prnu256"
VOLUME_SIZE = "1500M"                 # "1500M" / "2G" / "800M" ...
OUT_DIR = None                        # None -> outputs go to packs/ next to the file list
UPLOAD = False                        # True -> upload to Google Drive
DRIVE_FOLDER = "ai-detect-proj/features"
CLIENT_SECRETS = "./client_secrets.json"  # keep next to the notebook, or give an absolute path

In [None]:
# === Run the packing step (tar + zstd stream -> automatic volume splitting) ===
from pathlib import Path

# Resolve input and output locations.
filelist_path = Path(FILELIST).expanduser().resolve()
if OUT_DIR:
    out_dir = Path(OUT_DIR).expanduser().resolve()
else:
    out_dir = filelist_path.parent / "packs"
out_dir.mkdir(parents=True, exist_ok=True)

vol_bytes = parse_size(VOLUME_SIZE)
pack_base = out_dir / PACK_NAME
part_prefix = f"{pack_base}.tar.zst"

files, total_src_bytes, missing = load_filelist(filelist_path)
if not files:
    raise SystemExit("清單中沒有可用檔案")

print("== 準備打包 ==")
print(f"  檔案清單：{filelist_path}")
print(f"  輸出前綴：{part_prefix}.part-0001 ...")
print(f"  分卷大小：{VOLUME_SIZE} ({vol_bytes} bytes)")
print(f"  檔案數量：{len(files)}（忽略缺檔 {missing}）")
print(f"  原檔總量：{human_bytes(total_src_bytes)}")
print(f"  輸出目錄：{out_dir}")

base_dir = common_base(files)
print(f"  Tar 相對根：{base_dir}")

# Stream: tar -> zstd -> SplitWriter, so no intermediate archive is written.
splitter = SplitWriter(Path(part_prefix), vol_bytes)
zc = zstd.ZstdCompressor(level=6, threads=-1)
try:
    with zc.stream_writer(splitter) as zfh:
        with tarfile.open(mode="w|", fileobj=zfh, format=tarfile.GNU_FORMAT) as tar:
            with tqdm(total=len(files), desc="Packing", unit="file") as pbar:
                for src in files:
                    try:
                        arcname = os.path.relpath(src, base_dir)
                    except ValueError:
                        arcname = Path(src).name
                    if arcname in ("", "."):
                        # base_dir is the file itself (single-entry list);
                        # "." is not a usable member name for a regular file.
                        arcname = Path(src).name
                    tar.add(src, arcname=arcname, recursive=False)
                    pbar.update(1)
finally:
    # Always finish the last volume, even if packing failed part-way.
    splitter.close()
parts_info = splitter.parts_info

# Volumes left over from an earlier, larger run share the same prefix and would
# be concatenated by `cat ...part-*` on restore, corrupting the stream.
current_parts = {os.path.basename(pi["path"]) for pi in parts_info}
stale_prefix = Path(part_prefix).name + ".part-"
for entry in sorted(Path(part_prefix).parent.iterdir()):
    entry_name = entry.name
    if (entry_name.startswith(stale_prefix)
            and entry_name[len(stale_prefix):].isdigit()
            and entry_name not in current_parts
            and entry.is_file()):
        entry.unlink()
        print(f"[WARN] 已移除舊的分卷：{entry_name}", file=sys.stderr)

ts = time.strftime("%Y%m%d_%H%M%S")
manifest_path = out_dir / f"{PACK_NAME}_MANIFEST_{ts}.json"
sums_path = out_dir / f"{PACK_NAME}_SHA256SUMS_{ts}.txt"

manifest = build_manifest(PACK_NAME, out_dir, filelist_path,
                          parts_info, len(files), total_src_bytes, VOLUME_SIZE)

with open(manifest_path, "w", encoding="utf-8") as f:
    json.dump(manifest, f, ensure_ascii=False, indent=2)

# One "<digest>  <file name>" line per volume.
with open(sums_path, "w", encoding="utf-8") as f:
    for pi in parts_info:
        f.write(f"{pi['sha256']}  {os.path.basename(pi['path'])}\n")

print("\n== 產物 ==")
tot_packed = sum(p["size"] for p in parts_info)
for pi in parts_info[:10]:
    print(f"  {os.path.basename(pi['path'])}  {human_bytes(pi['size'])}  sha256={pi['sha256'][:12]}...")
if len(parts_info) > 10:
    print(f"  ...（共 {len(parts_info)} 卷）")
print(f"  合計串流大小：{human_bytes(tot_packed)}")
print(f"  MANIFEST：{manifest_path.name}")
print(f"  SHA256SUMS：{sums_path.name}")

In [None]:
# === (Optional) upload to Google Drive ===
if not UPLOAD:
    print("（UPLOAD=False，略過上傳）")
elif not _HAS_PYDRIVE2:
    raise RuntimeError("未安裝 pydrive2，請先在上面安裝後再重跑。")
else:
    # Every volume first, then the manifest and the checksum list.
    to_upload = [part["path"] for part in parts_info]
    to_upload.extend([str(manifest_path), str(sums_path)])
    print(f"== 上傳至 Google Drive: {DRIVE_FOLDER} ==")
    upload_files_to_drive(to_upload, DRIVE_FOLDER, client_secrets=CLIENT_SECRETS)
    print("✅ 上傳完成")

## 在 Colab / 另一台機器還原分卷

把分卷檔下載到同一個資料夾後：

```bash
cat PACK_NAME.tar.zst.part-* > PACK_NAME.tar.zst
mkdir -p <輸出資料夾>
tar -I zstd -xf PACK_NAME.tar.zst -C <輸出資料夾>
```

或在 Colab：

```python
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

!mkdir -p /content/packs
!cp /content/drive/MyDrive/ai-detect-proj/features/PACK_NAME.tar.zst.part-* /content/packs/

!cat /content/packs/PACK_NAME.tar.zst.part-* > /content/PACK_NAME.tar.zst
!mkdir -p /content/output
!tar -I zstd -xf /content/PACK_NAME.tar.zst -C /content/output
```
