In [None]:
import csv
import os
import re
import sys
import threading
import time
from pathlib import Path

import pandas as pd
from groq import Groq
# Raise the csv module's per-field size cap as high as the platform allows.
# sys.maxsize does not fit in a C long on platforms where long is 32 bits
# (e.g. Windows); csv.field_size_limit then raises OverflowError, so shrink
# the value until it is accepted instead of crashing at import time.
_field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_field_limit)
        break
    except OverflowError:
        _field_limit //= 10

# Progress shared between main() and the background reporter thread.
# Every read or write of `status` happens while holding `_lock`.
status = dict(done=False, total=0, count=0)
_lock = threading.Lock()

def _report():
    """Print a timestamped progress line every five seconds.

    Takes a snapshot of the shared ``status`` dict under ``_lock`` on each
    pass and returns as soon as ``status["done"]`` is set. main() runs this
    in a daemon thread.
    """
    while True:
        with _lock:
            finished = status["done"]
            translated = status["count"]
            expected = status["total"]
        if finished:
            return
        print(f"[{time.strftime('%H:%M:%S')}] 已翻译行: {translated}/{expected}", flush=True)
        time.sleep(5)

# SECURITY: an API key is committed in this file. Supply the key through the
# GROQ_API_KEY environment variable instead; the literal below is kept only
# as a fallback so existing runs keep working, and should be rotated and
# removed from source control.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY") or "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"
# Chat model used for every translation request.
API_MODEL    = "llama-3.1-8b-instant"
# Folder scanned for input *.csv files, and folder receiving the translated copies.
INPUT_DIR    = Path.home() / "Desktop/history of sport"
OUTPUT_DIR   = Path.home() / "Desktop/sports"
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

# Shared API client used by safe_translate().
client = Groq(api_key=GROQ_API_KEY)

def flatten(text: str) -> str:
    """Collapse *text* onto a single line with normalized spacing.

    Runs of line breaks (CR, LF, U+2028, U+2029) become one space, any run of
    two or more whitespace characters becomes one space, and the result is
    stripped. Falsy values (``None``, ``""``) and float NaN yield ``""``.

    Args:
        text: The value to flatten; non-string values are passed through ``str``.

    Returns:
        The flattened single-line string.
    """
    # pandas hands missing cells over as float NaN. NaN is truthy, so without
    # this guard it would be stringified to the literal text "nan" (and then
    # sent off for translation). NaN is the only float that differs from itself.
    if isinstance(text, float) and text != text:
        return ""
    s = str(text or "")
    s = re.sub(r'[\r\n\u2028\u2029]+', ' ', s)
    return re.sub(r'\s{2,}', ' ', s).strip()

# Largest slice of text, in characters, that translate_text() hands to a
# single safe_translate() call.
MAX_CHARS = 20_000

def safe_translate(chunk: str, *, min_split_chars: int = 200) -> str:
    """Translate one chunk to English, degrading gracefully when a request fails.

    The chunk is sent to the chat-completions endpoint in a single request.
    If that raises, the error is logged to stderr, the chunk is cut in two
    (at a space near its midpoint when one exists) and each half is retried
    recursively. Pieces of ``min_split_chars`` characters or fewer are not
    split any further: they are returned untranslated. That floor bounds the
    number of failing requests when the error has nothing to do with request
    size (bad credentials, rate limiting, network outage); without it a
    failing chunk would be halved all the way down to single characters.

    Args:
        chunk: Text to translate. An empty string yields ``""`` without a request.
        min_split_chars: Longest piece that is returned as-is after a failure
            instead of being split again.

    Returns:
        The translation collapsed onto one line, with the original text left
        in place for any piece that could not be translated.
    """
    if not chunk:
        return ""
    prompt = "You are a precise multilingual→English translator. Return ONLY the English translation."
    try:
        rsp = client.chat.completions.create(
            model=API_MODEL,
            messages=[
                {"role": "system", "content": prompt},
                {"role": "user",   "content": chunk},
            ],
            temperature=0.0
        )
        out = rsp.choices[0].message.content.strip()
        out = out.replace("\n", " ")
        return re.sub(r'\s{2,}', ' ', out)
    except Exception as e:
        # Deliberately broad: this is a best-effort path and any failure
        # falls through to the split-and-retry logic below.
        print(f"[Error] 翻译长度={len(chunk)} 的 chunk 失败: {e}", file=sys.stderr)

    if len(chunk) <= max(1, min_split_chars):
        return chunk

    mid = len(chunk) // 2
    # Look for a space in the second quarter of the chunk so the cut lands
    # between words while both halves stay reasonably balanced.
    cut = chunk.rfind(" ", max(1, mid // 2), mid + 1)
    if cut == -1:
        # No usable space (e.g. unspaced scripts): fall back to a plain
        # midpoint cut and rejoin without inserting anything.
        head, tail, glue = chunk[:mid], chunk[mid:], ""
    else:
        # Drop the space at the cut and put exactly one back when rejoining.
        head, tail, glue = chunk[:cut], chunk[cut + 1:], " "
    parts = (
        safe_translate(head, min_split_chars=min_split_chars),
        safe_translate(tail, min_split_chars=min_split_chars),
    )
    return glue.join(part for part in parts if part)

def translate_text(text: str) -> str:
    """Translate *text* of any length by processing it in MAX_CHARS-sized slices.

    Each slice is passed to safe_translate(), followed by a 0.1 s pause
    (presumably to go easy on the API rate limit). The per-slice results are
    joined with a single space: safe_translate() strips its output, so
    joining with an empty string would fuse the last word of one slice with
    the first word of the next.

    Args:
        text: The text to translate; an empty string yields ``""``.

    Returns:
        The concatenated translation of all slices.
    """
    translations = []
    for start in range(0, len(text), MAX_CHARS):
        translations.append(safe_translate(text[start:start + MAX_CHARS]))
        time.sleep(0.1)
    return " ".join(piece for piece in translations if piece)

# Maximum number of characters allowed in one output cell (Excel's per-cell
# limit); ensure_excel_safe_rows() splits longer values across several rows.
EXCEL_CELL_LIMIT = 32_767

def split_into_n_parts(s: str, n: int) -> list:
    """Cut a string into ``n`` consecutive pieces of near-equal length.

    When the length is not divisible by ``n`` the leading pieces are one
    character longer. For ``n <= 1`` the input is returned unchanged inside a
    one-element list; an empty (or ``None``) input with ``n > 1`` yields ``n``
    empty strings.
    """
    if n <= 1:
        return [s]
    if not s:
        return [""] * n
    quotient, extra = divmod(len(s), n)
    pieces = []
    cursor = 0
    for index in range(n):
        # The first `extra` pieces absorb the remainder, one character each.
        stop = cursor + quotient + (1 if index < extra else 0)
        pieces.append(s[cursor:stop])
        cursor = stop
    return pieces

def ensure_excel_safe_rows(fname: str, content: str, translation: str):
    """Split one logical row into several so no cell exceeds EXCEL_CELL_LIMIT.

    The number of output rows is driven by whichever of ``content`` and
    ``translation`` is longer; both are then cut into that many near-equal
    pieces and paired up positionally, with ``fname`` repeated on every row.

    Args:
        fname: Value for the filename column, copied to each output row.
        content: Source text for the row.
        translation: Translated text for the row.

    Returns:
        A list of ``(fname, content_part, translation_part)`` tuples; a
        single-element list when both texts already fit in one cell.
    """
    def parts_needed(text: str) -> int:
        # One cell is enough up to the limit; beyond it, enough parts that
        # every piece stays under the limit.
        if len(text) <= EXCEL_CELL_LIMIT:
            return 1
        return len(text) // EXCEL_CELL_LIMIT + 1

    n = max(parts_needed(content), parts_needed(translation))
    if n <= 1:
        return [(fname, content, translation)]
    # For n > 1 split_into_n_parts returns exactly n pieces, so the two lists
    # always have the same length and can be zipped without padding.
    c_parts = split_into_n_parts(content, n)
    t_parts = split_into_n_parts(translation, n)
    return [(fname, c_part, t_part) for c_part, t_part in zip(c_parts, t_parts)]

def _read_input_csv(path):
    """Load one input CSV with every cell as text and blank cells as ''."""
    # keep_default_na=False stops pandas from turning blanks and strings such
    # as "NA" into NaN (which would later be stringified to "nan"); fillna
    # covers any cell that still comes back missing.
    df = pd.read_csv(path, encoding="utf-8-sig", dtype=str, keep_default_na=False)
    return df.fillna("")


def _write_output_csv(out_df, target):
    """Write out_df as a fully quoted, CRLF-terminated, BOM-prefixed CSV."""
    options = dict(index=False, quoting=csv.QUOTE_ALL, encoding="utf-8-sig")
    try:
        # pandas >= 1.5 spells the keyword `lineterminator`; the old
        # `line_terminator` spelling was removed in pandas 2.0.
        out_df.to_csv(target, lineterminator="\r\n", **options)
    except TypeError:
        # Older pandas only accepts the previous spelling.
        out_df.to_csv(target, line_terminator="\r\n", **options)


def main(start: int = 200000, stop: int = 240000):
    """Translate the ``content`` column of a slice of the input CSV files.

    CSV files in INPUT_DIR are sorted by name and the slice ``[start:stop]``
    is processed. Each row's ``content`` is flattened, translated (identical
    filename/content pairs are translated only once), split into
    Excel-safe rows, and written to ``<stem>_translated.csv`` in OUTPUT_DIR.
    Progress is printed by a background reporter thread.

    Args:
        start: Index of the first file (in name order) to process.
        stop: Index one past the last file to process.
    """
    # Reset shared progress so a repeated call (e.g. re-running a notebook
    # cell) starts from zero and the reporter does not see a stale "done".
    with _lock:
        status.update(done=False, total=0, count=0)
    threading.Thread(target=_report, daemon=True).start()

    try:
        all_files = sorted(INPUT_DIR.glob("*.csv"), key=lambda p: p.name)
        files = all_files[start:stop]

        if not files:
            print("❌ 未找到任何 CSV 文件", file=sys.stderr)
            return

        # First pass only counts rows so the reporter can show "count/total".
        total = sum(len(_read_input_csv(f)) for f in files)
        with _lock:
            status["total"] = total

        # Translation memo keyed by (filename, flattened content).
        cache = {}

        for f in files:
            df = _read_input_csv(f)
            out_rows = []
            for _, row in df.iterrows():
                fname = row.get("filename", "")
                raw = flatten(row.get("content", ""))
                key = (fname, raw)
                tr = cache.get(key)
                if tr is None:
                    tr = translate_text(raw)
                    cache[key] = tr

                with _lock:
                    status["count"] += 1

                for sfname, scontent, strans in ensure_excel_safe_rows(fname, raw, tr):
                    out_rows.append({
                        "filename": sfname,
                        "content":  scontent,
                        "English translation": strans
                    })
            out_df = pd.DataFrame(out_rows, columns=["filename", "content", "English translation"])
            target = OUTPUT_DIR / f"{f.stem}_translated.csv"
            _write_output_csv(out_df, target)
            print(f"✅ 已保存: {target}")
    finally:
        # Always release the reporter thread, including on the early return
        # above and on errors; otherwise it keeps printing for as long as the
        # interpreter (or notebook kernel) stays alive.
        with _lock:
            status["done"] = True

if __name__ == "__main__":
    main()