# 英文评论批量翻译（带断点续传 & GPU 加速）
本 Notebook 使用 `Helsinki-NLP/opus-mt-en-zh` 模型，将 CSV 文件 `text` 列中的英文评论翻译为中文。

- ✅ 支持断点续传，每批翻译后即时保存。
- ✅ 支持 GPU 加速，翻译速度更快。
- ✅ 翻译完成后自动下载 `translated_output.csv`。


In [ ]:
# Install the required dependencies (-q keeps pip's output quiet)
!pip install -q langdetect transformers sentencepiece pandas tqdm

In [ ]:
import pandas as pd
import torch
from tqdm import tqdm
from transformers import MarianMTModel, MarianTokenizer
from langdetect import detect, LangDetectException
from google.colab import files
import os


In [ ]:
# Choose the compute device and load the translation model
model_name = 'Helsinki-NLP/opus-mt-en-zh'

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

tokenizer = MarianTokenizer.from_pretrained(model_name)
# Load the weights first, then move the model onto the selected device
model = MarianMTModel.from_pretrained(model_name)
model = model.to(device)
print("✅ 模型加载完成，使用设备：", device)

In [ ]:
# Upload your CSV file (it must contain a `text` column)
uploaded = files.upload()
if not uploaded:
    # Nothing came back from the upload widget (e.g. the dialog was cancelled).
    # Stop here with a clear message instead of an opaque IndexError below.
    raise ValueError("❌ 未上传任何文件，请重新运行本单元并选择一个 CSV 文件。")
# Only the first uploaded file is processed
input_file = next(iter(uploaded))
print("✅ 上传文件：", input_file)

In [ ]:
# Read the CSV: try UTF-8 first, then GBK; malformed rows are skipped
try:
    df = pd.read_csv(input_file, encoding='utf-8', on_bad_lines='skip')
except UnicodeDecodeError:
    # Only a decoding failure justifies retrying with another codec. Any other
    # error (missing file, empty file, ...) would fail identically under GBK,
    # so it is allowed to surface with its original message.
    df = pd.read_csv(input_file, encoding='gbk', on_bad_lines='skip')
# Fail early with a clear message: the later cells read and write `text`
if 'text' not in df.columns:
    raise KeyError(f"❌ CSV 中缺少 `text` 列，现有列：{list(df.columns)}")
print(f"📄 总共 {len(df)} 条评论待处理，前5条预览：")
df.head()

In [ ]:
# Resume support: reload rows that an earlier, interrupted run already saved
partial_file = 'translated_partial.csv'
done_indices = set()
if os.path.exists(partial_file):
    # The checkpoint keeps the original row labels in its first column, so
    # restore them as the index instead of letting pandas renumber from 0.
    # It is written as utf-8-sig, hence the matching codec here.
    df_partial = pd.read_csv(partial_file, encoding='utf-8-sig', index_col=0)
    # If a row was saved more than once, the most recent copy wins
    df_partial = df_partial[~df_partial.index.duplicated(keep='last')]
    # Ignore saved labels that do not exist in the freshly loaded input
    restored = df_partial.index.intersection(df.index)
    # .loc is given an Index rather than a set: set indexers are rejected by
    # recent pandas versions
    df.loc[restored, 'text'] = df_partial.loc[restored, 'text']
    done_indices = set(restored.tolist())
    print(f"🔄 已加载 {len(done_indices)} 条已翻译内容，继续从中断位置处理。")
else:
    print("🔄 无中断续传记录，从头开始翻译。")

In [ ]:
# Language detection and translation helpers
def is_english(text):
    """Return True when langdetect classifies ``text`` as English.

    Args:
        text: The candidate string. Non-string values and blank strings are
            treated as "not English" without calling the detector.

    Returns:
        bool: True only if ``detect(text)`` returns ``'en'``; False when the
        input is unusable or the detector raises ``LangDetectException``.
    """
    # Guard first so that bad input never reaches the detector
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        return detect(text) == 'en'
    except LangDetectException:
        # Catch only the detector's own error; a bare except would also
        # swallow KeyboardInterrupt and hide unrelated bugs.
        return False

def translate_batch(texts):
    """Translate a list of strings with the module-level model and tokenizer.

    Args:
        texts: List of source strings. An empty list is returned unchanged
            as ``[]`` without calling the tokenizer or the model.

    Returns:
        list[str]: Decoded outputs from ``tokenizer.batch_decode``, with
        special tokens stripped.
    """
    # Nothing to translate: avoid handing an empty batch to the tokenizer
    if len(texts) == 0:
        return []
    # Inputs are padded to a common length and truncated at 512 tokens
    encoded = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
    # Move every input tensor to the same device as the model
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
    generated = model.generate(**encoded, max_length=512)
    return tokenizer.batch_decode(generated, skip_special_tokens=True)

In [ ]:
# Translate in batches and append every finished batch to the checkpoint file
batch_size = 16
total = len(df)
for i in tqdm(range(0, total, batch_size), desc="翻译进度"):
    batch_df = df.iloc[i:i+batch_size]
    batch_idx = batch_df.index.tolist()
    # Skip batches whose rows were all handled already
    if all(idx in done_indices for idx in batch_idx):
        continue
    # Missing values become empty strings so every entry is a str
    texts = batch_df['text'].fillna('').astype(str).tolist()
    mask = [is_english(t) for t in texts]
    to_translate = [t for t, m in zip(texts, mask) if m]
    new_texts = texts
    if to_translate:
        try:
            translated = translate_batch(to_translate)
        except Exception as e:
            # Best effort: report the failure, keep the original text for this
            # batch and carry on with the next one.
            print(f"⚠️ 批次发生错误，跳过：{e}")
            translated = to_translate
        # Put translations back in their original positions; entries that
        # were not detected as English pass through untouched.
        it = iter(translated)
        new_texts = [next(it) if m else t for t, m in zip(texts, mask)]
    df.loc[batch_idx, 'text'] = new_texts
    done_indices.update(batch_idx)
    # Append instead of overwrite, so the checkpoint accumulates every
    # finished batch rather than holding only the most recent one. The header
    # (and the BOM that utf-8-sig emits) is written once, when the file is new.
    is_new_file = not os.path.exists(partial_file) or os.path.getsize(partial_file) == 0
    df.loc[batch_idx, ['text']].to_csv(
        partial_file,
        mode='a',
        index=True,  # keep the original row labels so a resume can match rows
        header=is_new_file,
        encoding='utf-8-sig' if is_new_file else 'utf-8',
    )

In [ ]:
# Translation finished: write the final CSV and hand it to the browser
output_file = 'translated_output.csv'
df.to_csv(
    path_or_buf=output_file,
    encoding='utf-8-sig',
    index=False,  # the row index is not part of the exported file
)
files.download(output_file)
print(f"✅ 翻译完成，已生成文件：{output_file}")