<a href="https://colab.research.google.com/github/BaikaiL/voice/blob/main/translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
# Install Edge-TTS (used for high-quality speech synthesis) and nest_asyncio
!pip install edge-tts nest_asyncio



In [2]:
# 微调模型加载
import torch
import asyncio
import nest_asyncio
import edge_tts
from transformers import pipeline, AutoProcessor, AutoModelForSpeechSeq2Seq # 引入更底层的类
import IPython.display as ipd
import uuid
import gradio as gr

# Work around the event-loop conflict in Jupyter/Colab so that the
# asyncio.run() calls made later in this notebook can execute.
nest_asyncio.apply()

class CantonesToMandarinTranslator:
    """End-to-end Cantonese-speech -> Mandarin-speech pipeline.

    Three stages are chained:

    1. ASR: a fine-tuned Whisper checkpoint turns audio into Cantonese text.
    2. MT: an NLLB checkpoint translates ``yue_Hant`` -> ``zho_Hans``.
    3. TTS: Edge-TTS writes the Mandarin text to an mp3 file.

    ``run`` is the verbose notebook entry point; ``process`` is the quiet
    variant wired to the Gradio UI. Both share the private stage helpers so
    the two code paths cannot drift apart.
    """

    # Edge-TTS voice used for every synthesized Mandarin clip.
    TTS_VOICE = "zh-CN-XiaoxiaoNeural"

    def __init__(
        self,
        asr_model_id="baikai1022/whisper-small-cantonese-v2",
        processor_id="openai/whisper-small",
        mt_model_id="baikai1022/nllb-cantonese-mandarin-v1",
    ):
        """Load the ASR and translation pipelines.

        Args:
            asr_model_id: Hub id of the (fine-tuned) Whisper weights.
            processor_id: Hub id that supplies the tokenizer and feature
                extractor, because the fine-tuned repo does not ship them.
            mt_model_id: Hub id of the NLLB translation model.
        """
        # Pick the GPU automatically when one is available.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

        print(f"正在初始化... (Device: {self.device})")

        # I. Load the ASR model
        print("正在加载 Whisper (ASR)...")

        # 1. Load the fine-tuned model weights from the Hub repo.
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            asr_model_id,
            torch_dtype=self.torch_dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True
        ).to(self.device)

        # 2. Load processor/tokenizer from the original OpenAI repo
        #    (fills in the files missing from the fine-tuned repo).
        processor = AutoProcessor.from_pretrained(processor_id)

        # 3. Assemble the pipeline by hand from the pieces above.
        self.asr_pipe = pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,                  # borrowed from the original repo
            feature_extractor=processor.feature_extractor,  # borrowed from the original repo
            device=self.device,
            torch_dtype=self.torch_dtype,
        )

        # II. Load the NLLB model (translation)
        print("正在加载 NLLB (Translation)...")
        self.mt_pipe = pipeline(
            "translation",
            model=mt_model_id,
            device=self.device,
            torch_dtype=self.torch_dtype,
            src_lang="yue_Hant",
            tgt_lang="zho_Hans"
        )
        print("模型加载完成！")

    @staticmethod
    def _is_blank(text):
        """Return True when ``text`` is None, empty or whitespace only."""
        return not text or not text.strip()

    def _transcribe(self, audio_path):
        """Run ASR on ``audio_path`` and return the recognized text.

        No ``language`` argument is passed to the ASR pipeline.
        NOTE(review): confirm the fine-tuned checkpoint's generation config
        selects the intended language.
        """
        return self.asr_pipe(audio_path)["text"]

    def _translate(self, cantonese_text):
        """Translate Cantonese text and return the Mandarin text."""
        return self.mt_pipe(cantonese_text)[0]['translation_text']

    def _synthesize(self, mandarin_text, output_audio_path):
        """Synthesize ``mandarin_text`` to ``output_audio_path`` with Edge-TTS.

        Raises:
            ValueError: if ``mandarin_text`` is blank (nothing to synthesize).
        """
        if self._is_blank(mandarin_text):
            raise ValueError("没有可合成的文本 (翻译结果为空)")

        voice = self.TTS_VOICE

        async def _generate_tts():
            communicate = edge_tts.Communicate(mandarin_text, voice)
            await communicate.save(output_audio_path)

        asyncio.run(_generate_tts())
        return output_audio_path

    def run(self, audio_path, output_audio_path="output_mandarin.mp3"):
        """Translate one audio file, printing each intermediate result.

        Args:
            audio_path: path of the Cantonese recording.
            output_audio_path: where the synthesized mp3 is written.

        Returns:
            ``(output_audio_path, mandarin_text)``.

        Raises:
            ValueError: if ASR recognizes no text, or the translation is
                blank, so there is nothing to synthesize.
        """
        print("\n" + "="*40)
        print(f"处理音频: {audio_path}")

        # --- Step 1: speech recognition ---
        cantonese_text = self._transcribe(audio_path)
        print(f"[识别结果 (粤语)]: {cantonese_text}")
        if self._is_blank(cantonese_text):
            # Fail early with a clear message instead of translating and
            # synthesizing an empty string.
            raise ValueError("未识别到任何语音内容，无法翻译与合成")

        # --- Step 2: machine translation ---
        mandarin_text = self._translate(cantonese_text)
        print(f"[翻译结果 (国语)]: {mandarin_text}")

        # --- Step 3: speech synthesis ---
        print("正在合成语音...")
        self._synthesize(mandarin_text, output_audio_path)

        print(f"合成完成: {output_audio_path}")
        print("="*40 + "\n")
        return output_audio_path, mandarin_text

    def process(self, audio_path):
        """Handler meant for Gradio.

        Returns:
            ``(cantonese_text, mandarin_text, mandarin_audio_path)``. When
            there is no input, or nothing could be recognized, the first
            element carries a user-facing message and the audio path is
            ``None``. When the translation comes back blank, the texts are
            returned with a ``None`` audio path.
        """
        if not audio_path:
            return "请先录音或上传文件", "", None

        print(f"正在处理: {audio_path}")

        # 1. ASR
        cantonese_text = self._transcribe(audio_path)
        if self._is_blank(cantonese_text):
            # Silence or unintelligible audio: skip MT and TTS entirely.
            return "未识别到语音内容，请重试", "", None

        # 2. MT
        mandarin_text = self._translate(cantonese_text)
        if self._is_blank(mandarin_text):
            # Nothing to synthesize; still show what was recognized.
            return cantonese_text, mandarin_text, None

        # 3. TTS
        # Random file name so concurrent users do not overwrite each other.
        output_filename = f"out_{uuid.uuid4().hex[:8]}.mp3"
        self._synthesize(mandarin_text, output_filename)

        return cantonese_text, mandarin_text, output_filename

# Instantiate the translator; the constructor loads the ASR and
# translation models (see CantonesToMandarinTranslator.__init__).
translator = CantonesToMandarinTranslator()



正在初始化... (Device: cuda)
正在加载 Whisper (ASR)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

preprocessor_config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!
Device set to use cuda


正在加载 NLLB (Translation)...


config.json:   0%|          | 0.00/836 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/194 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/32.2M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cuda


模型加载完成！


In [4]:
# Smoke test for the fine-tuned pipeline.
# Make sure the file below exists in the working directory, or change the name.
import os

input_audio = "common_voice_zh-HK_20110217.wav"

if not os.path.exists(input_audio):
    # Nothing to process yet: ask for the sample recording to be uploaded.
    print(f"请上传一个名为 {input_audio} 的粤语录音文件来测试")
else:
    # Run the full ASR -> MT -> TTS chain and play the synthesized audio.
    output, text = translator.run(input_audio)
    ipd.display(ipd.Audio(output))

请上传一个名为 common_voice_zh-HK_20110217.wav 的粤语录音文件来测试


In [3]:
# 原版翻译模块（MT）加载
from transformers import pipeline

class OriginalNLLBTranslator:
    """Baseline translator built on the stock (non-fine-tuned) NLLB weights.

    Used as the reference point when comparing against the fine-tuned model.
    """

    def __init__(
        self,
        model_id="facebook/nllb-200-distilled-600M",
        src_lang="yue_Hant",
        tgt_lang="zho_Hans",
    ):
        """Load the translation pipeline.

        Args:
            model_id: Hub id of the translation model (defaults to the
                official NLLB distilled 600M weights).
            src_lang: NLLB source-language code.
            tgt_lang: NLLB target-language code.
        """
        # Pick the GPU automatically when one is available.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

        print(f"正在加载原版 NLLB 翻译模型... (Device: {self.device})")

        self.mt_pipe = pipeline(
            "translation",
            model=model_id,
            device=self.device,
            torch_dtype=self.torch_dtype,
            src_lang=src_lang,
            tgt_lang=tgt_lang
        )
        print("原版翻译模型加载完成！")

    def translate(self, cantonese_text):
        """Translate ``cantonese_text`` (translation step only).

        Returns:
            The translated text, or the placeholder ``"无输入文本"`` when the
            input is ``None``, empty or whitespace only.
        """
        # None is treated like empty input instead of crashing on .strip().
        if cantonese_text is None or not cantonese_text.strip():
            return "无输入文本"

        mt_result = self.mt_pipe(cantonese_text)
        return mt_result[0]['translation_text']

# Instantiate the baseline (original, non-fine-tuned) translation module.
base_nllb = OriginalNLLBTranslator()

正在加载原版 NLLB 翻译模型... (Device: cuda)


config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cuda


原版翻译模型加载完成！


In [9]:
# 加载并查看数据集（有标准答案）
from datasets import load_dataset

ds = load_dataset("botisan-ai/cantonese-mandarin-translations")
train_split = ds['train']

# View 1: overall structure (which splits exist, e.g. train/validation/test).
print("数据集的子集：", ds.keys())

# View 2: the first 3 rows of the train split (the most common check).
print("\n=== 查看train子集前3行 ===")
sample_rows = train_split.take(3)  # take(n) returns the first n samples
for row_no, row in enumerate(sample_rows, start=1):
    print(f"第{row_no}行：{row}")

# View 3: as a pandas DataFrame (handy for readers used to pandas).
print("\n=== 转换为DataFrame查看前5行 ===")
df = train_split.to_pandas()
print(df.head())  # head() shows the first 5 rows by default

# View 4: index a single sample directly (useful when debugging).
print("\n=== 直接查看第1个样本 ===")
print(train_split[0])

数据集的子集： dict_keys(['train'])

=== 查看train子集前3行 ===
第1行：{'translation': {'yue': '杞人嘅朋友嘆咗一口氣', 'zh': '杞人的朋友叹了一口气'}}
第2行：{'translation': {'yue': '泥水佬開門口過得人過得自己', 'zh': '泥水佬开门口过得人过得自己'}}
第3行：{'translation': {'yue': '嗰次喺跑馬地協和里見到有雞蛋仔格仔餅賣', 'zh': '那次在跑马地协和里看见有鸡蛋格子饼卖'}}

=== 转换为DataFrame查看前5行 ===
                                         translation
0          {'yue': '杞人嘅朋友嘆咗一口氣', 'zh': '杞人的朋友叹了一口气'}
1    {'yue': '泥水佬開門口過得人過得自己', 'zh': '泥水佬开门口过得人过得自己'}
2  {'yue': '嗰次喺跑馬地協和里見到有雞蛋仔格仔餅賣', 'zh': '那次在跑马地协和...
3  {'yue': '學生話佢知白沙山路係喺大棠，但係唔知有冇小巴經嗰度', 'zh': '学生...
4   {'yue': '有個老人去左坪洲永安橫街食齋', 'zh': '有个老人去坪洲永安小街吃素'}

=== 直接查看第1个样本 ===
{'translation': {'yue': '杞人嘅朋友嘆咗一口氣', 'zh': '杞人的朋友叹了一口气'}}


In [19]:
import torch
import pandas as pd
import plotly.graph_objects as go

# --- 1. Core loss computation (adapted to the newer tokenizer interface) ---
def compute_loss(model, tokenizer, yue_text, zh_text,
                 src_lang="yue_Hant", tgt_lang="zho_Hans"):
    """Return the loss ``model`` reports for ``zh_text`` given ``yue_text``.

    The source text is tokenized as the model input and the reference
    translation as the labels; the model is then called once without
    gradient tracking.

    Args:
        model: seq2seq model; must expose ``.device`` and return an object
            with a ``.loss`` when called with ``labels``.
        tokenizer: tokenizer matching ``model``.
        yue_text: source (Cantonese) sentence.
        zh_text: reference (Mandarin) translation.
        src_lang: language code assigned to ``tokenizer.src_lang``.
        tgt_lang: language code assigned to ``tokenizer.tgt_lang``.

    Returns:
        The loss as a Python float rounded to 4 decimal places.

    Note:
        ``tokenizer.src_lang`` / ``tokenizer.tgt_lang`` are overwritten and
        not restored afterwards.
    """
    device = model.device
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang

    inputs = tokenizer(yue_text, return_tensors="pt").to(device)
    labels = tokenizer(text_target=zh_text, return_tensors="pt").input_ids.to(device)

    # Evaluation only: no gradients are needed for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs, labels=labels)
    return round(outputs.loss.item(), 4)

# --- 2. Batch evaluation ---
num_samples = 20  # number of leading samples evaluated to observe the trend
# NOTE(review): samples come from the 'train' split. If that split was also
# used for fine-tuning, the comparison below is optimistic — confirm.
subset = ds['train'].select(range(num_samples))
detailed_results = []

# Fixed schema, so the DataFrame keeps its columns even when every sample
# fails and `detailed_results` stays empty.
result_columns = [
    "样本ID", "粤语原文", "标准答案", "微调翻译", "原版翻译", "FT_Loss", "Base_Loss",
]

print(f" 正在提取前 {num_samples} 条数据进行深度测评并生成折线图...\n")

for i, entry in enumerate(subset):
    yue = entry['translation']['yue']
    zh_ref = entry['translation']['zh']

    try:
        # Loss of each model against the reference translation.
        loss_ft = compute_loss(translator.mt_pipe.model, translator.mt_pipe.tokenizer, yue, zh_ref)
        loss_base = compute_loss(base_nllb.mt_pipe.model, base_nllb.mt_pipe.tokenizer, yue, zh_ref)

        # Actual translations produced by each model.
        ft_trans = translator.mt_pipe(yue)[0]['translation_text']
        base_trans = base_nllb.translate(yue)

        detailed_results.append({
            "样本ID": i + 1,
            "粤语原文": yue,
            "标准答案": zh_ref,
            "微调翻译": ft_trans,
            "原版翻译": base_trans,
            "FT_Loss": loss_ft,
            "Base_Loss": loss_base
        })
        print(f"进度: {i+1}/{num_samples}")
    except Exception as e:
        # Best effort: report the failing sample and keep evaluating the rest.
        print(f"样本 {i+1} 失败: {e}")

df = pd.DataFrame(detailed_results, columns=result_columns)

# --- 3. Detailed side-by-side text comparison ---
separator = "-" * 60

print("\n" + "="*60)
print(f"{'ID':<4} | {'模型版本':<10} | {'翻译文本与 Loss 对比'}")
print(separator)

for _, row in df.iterrows():
    # One report per sample: source, reference, then each model's output
    # together with its loss.
    report_lines = (
        f"{row['样本ID']:<4} | 原文       | {row['粤语原文']}",
        f"     | 标准答案   | {row['标准答案']}",
        f"     | 微调模型   | {row['微调翻译']} (Loss: {row['FT_Loss']})",
        f"     | 原版模型   | {row['原版翻译']} (Loss: {row['Base_Loss']})",
        separator,
    )
    print("\n".join(report_lines))

# --- 4. Visualization: line chart of per-sample loss ---
# One entry per curve: (loss column, legend label, line colour, hover template).
# The baseline curve is added first, then the fine-tuned one.
trace_specs = [
    ("Base_Loss", '原版 NLLB (Base)', '#FF6B6B', "样本 %{x}<br>原版 Loss: %{y}"),
    ("FT_Loss", '微调版 NLLB (Fine-tuned)', '#4D96FF', "样本 %{x}<br>微调 Loss: %{y}"),
]

fig = go.Figure()

for loss_column, legend_label, line_colour, hover_text in trace_specs:
    fig.add_trace(go.Scatter(
        x=df["样本ID"],
        y=df[loss_column],
        mode='lines+markers',  # line segments plus point markers
        name=legend_label,
        line={"color": line_colour, "width": 3},
        marker={"size": 8},
        hovertemplate=hover_text
    ))

fig.update_layout(
    title="微调前后翻译 Loss 趋势对比图 (数值越低越贴近标准答案)",
    xaxis={"title": "样本编号", "tickmode": 'linear'},
    yaxis={"title": "Loss (交叉熵损失)"},
    template="plotly_white",
    hovermode="x unified",  # hovering shows the values of both curves at once
    legend={"orientation": "h", "yanchor": "bottom", "y": 1.02, "xanchor": "right", "x": 1}
)

fig.show()

# 5. Final conclusion — derived from the measured mean losses instead of
# being asserted unconditionally.
base_mean_loss = df["Base_Loss"].mean()
ft_mean_loss = df["FT_Loss"].mean()
improvement = (base_mean_loss - ft_mean_loss) / base_mean_loss * 100

print("\n 测评结论：")
if improvement > 0:
    print("微调版模型的 Loss 表现整体优于原版。")
elif improvement < 0:
    print("微调版模型的 Loss 表现整体不如原版。")
elif improvement == 0:
    print("微调版模型与原版的 Loss 表现持平。")
else:
    # NaN: there were no valid samples (or the baseline mean was zero).
    print("没有有效样本，无法得出结论。")
print(f"平均 Loss 改善率: {improvement:.2f}%")

 正在提取前 20 条数据进行深度测评并生成折线图...

进度: 1/20
进度: 2/20
进度: 3/20
进度: 4/20
进度: 5/20
进度: 6/20
进度: 7/20
进度: 8/20
进度: 9/20
进度: 10/20
进度: 11/20
进度: 12/20
进度: 13/20
进度: 14/20
进度: 15/20
进度: 16/20
进度: 17/20
进度: 18/20
进度: 19/20
进度: 20/20

ID   | 模型版本       | 翻译文本与 Loss 对比
------------------------------------------------------------
1    | 原文       | 杞人嘅朋友嘆咗一口氣
     | 标准答案   | 杞人的朋友叹了一口气
     | 微调模型   | 人的朋友叹了一口气 (Loss: 0.0376)
     | 原版模型   | 一个的朋友了一口气. (Loss: 2.5977)
------------------------------------------------------------
2    | 原文       | 泥水佬開門口過得人過得自己
     | 标准答案   | 泥水佬开门口过得人过得自己
     | 微调模型   | 泥水开门口过得人过得自己 (Loss: 0.0262)
     | 原版模型   | 泥的水开了门,让人们走过自己. (Loss: 2.7676)
------------------------------------------------------------
3    | 原文       | 嗰次喺跑馬地協和里見到有雞蛋仔格仔餅賣
     | 标准答案   | 那次在跑马地协和里看见有鸡蛋格子饼卖
     | 微调模型   | 那次在跑马地协和里看到有蛋格子饼卖 (Loss: 0.1506)
     | 原版模型   | 在马会上,我看到一家蛋蛋蛋糕. (Loss: 3.3027)
------------------------------------------------------------
4    | 原文       | 學生話佢知白沙山路係喺大棠，但係唔知有冇小


 测评结论：
微调版模型的 Loss 表现整体优于原版。
平均 Loss 改善率: 94.08%


In [None]:
pip install pyecharts

Collecting pyecharts
  Downloading pyecharts-2.0.9-py3-none-any.whl.metadata (1.6 kB)
Downloading pyecharts-2.0.9-py3-none-any.whl (153 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/153.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.9/153.9 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyecharts
Successfully installed pyecharts-2.0.9


In [None]:
pip install plotly



In [None]:
!pip install --upgrade gradio plotly pandas
# If Colab shows "Restart Session" after this runs, click the button to restart; otherwise later code will raise errors.

Collecting gradio
  Downloading gradio-6.3.0-py3-none-any.whl.metadata (16 kB)
Collecting plotly
  Downloading plotly-6.5.1-py3-none-any.whl.metadata (8.5 kB)
Collecting pandas
  Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Collecting gradio-client==2.0.3 (from gradio)
  Downloading gradio_client-2.0.3-py3-none-any.whl.metadata (7.1 kB)
Downloading gradio-6.3.0-py3-none-any.whl (23.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.0/23.0 MB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading gradio_client-2.0.3-py3-none-any.whl (55 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.7/55.7 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading plotly-6.5.1-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0

In [None]:
# 性能可视化UI交互界面（第三版，修正时区）
import gradio as gr
import psutil
import torch
import pandas as pd
import time
from datetime import datetime, timedelta, timezone
import plotly.express as px

# --- 1. Monitoring data and plotting logic ---
def get_perf_figure(history_df, y_max=8):
    """Build the Plotly line chart shown in the performance monitor.

    Args:
        history_df: DataFrame with ``Time``, ``Value`` and ``Type`` columns;
            one line is drawn per distinct ``Type``.
        y_max: upper bound of the y axis (same unit as ``Value``) used while
            every sample fits below it.

    Returns:
        A Plotly figure. An empty ``history_df`` yields a placeholder chart
        that only carries a "starting up" title.
    """
    if history_df.empty:
        fig = px.line(title="性能监控启动中...")
    else:
        # Keep the steady [0, y_max] window while the data fits, but grow it
        # (with 10% headroom) once a sample exceeds it, so the curve is never
        # drawn off-chart. A NaN peak falls back to y_max.
        peak = float(history_df["Value"].max())
        upper = peak * 1.1 if peak > y_max else y_max
        fig = px.line(
            history_df, x="Time", y="Value", color="Type",
            title="实时性能监控 (秒级更新 - 北京时间)",
            range_y=[0, upper],
            template="plotly_white"
        )
    fig.update_layout(height=300, margin=dict(l=10, r=10, t=40, b=10))
    return fig

# Monitoring history. It is created only when the name does not exist yet,
# so re-running this cell keeps the samples collected so far.
try:
    perf_history
except NameError:
    perf_history = pd.DataFrame(columns=["Time", "Value", "Type"])

def update_monitor_plot():
    """Sample current memory usage, append it to the history and redraw.

    Reads system RAM in use via ``psutil`` and, when CUDA is available, the
    value reported by ``torch.cuda.memory_allocated()``; both are converted
    to GiB. The module-level ``perf_history`` is replaced by the updated
    history, trimmed to its last 60 rows (two rows are added per call).

    Returns:
        The figure produced by ``get_perf_figure`` for the updated history.
    """
    global perf_history

    # Timestamps are rendered explicitly in Beijing time (UTC+8).
    beijing_tz = timezone(timedelta(hours=8))
    now = datetime.now(beijing_tz).strftime("%H:%M:%S")

    # Collect the samples.
    ram = psutil.virtual_memory().used / (1024**3)
    vram = 0
    if torch.cuda.is_available():
        vram = torch.cuda.memory_allocated() / (1024**3)

    new_data = pd.DataFrame([
        {"Time": now, "Value": ram, "Type": "系统内存 (GB)"},
        {"Time": now, "Value": vram, "Type": "显存 (GB)"}
    ])

    # Concatenating with an empty frame is deprecated in recent pandas, so
    # the first sample simply becomes the history.
    if perf_history.empty:
        updated_history = new_data
    else:
        updated_history = pd.concat([perf_history, new_data], ignore_index=True)
    perf_history = updated_history.tail(60)
    return get_perf_figure(perf_history)

# --- 2. Translation endpoint ---
def gradio_interface(audio):
    """Gradio click handler: hand ``audio`` to the shared translator.

    Returns whatever ``translator.process`` returns for ``audio``.
    """
    outputs = translator.process(audio)
    return outputs

# --- 3. UI layout ---
with gr.Blocks(title="粤语转普通话 AI") as demo:
    gr.Markdown("# 🇭🇰 粤语 -> 🇨🇳 普通话 (性能实时监控版)")

    with gr.Row():
        # Left column: input, trigger button and the performance monitor.
        with gr.Column():
            input_audio = gr.Audio(sources=["microphone", "upload"], type="filepath", label="请说粤语")
            btn = gr.Button("开始转换", variant="primary")
            perf_plot = gr.Plot(value=get_perf_figure(perf_history), label="性能趋势")
            hidden_btn = gr.Button("手动刷新监控", elem_id="refresh_trigger", visible=True)

        # Right column: recognized text, translated text and synthesized audio.
        with gr.Column():
            txt_cantonese = gr.Textbox(label="识别结果 (粤语)")
            txt_mandarin = gr.Textbox(label="翻译结果 (普通话)")
            out_audio = gr.Audio(label="合成语音 (普通话)", autoplay=True)

    # Translation: one click runs ASR -> MT -> TTS and fills the right column.
    btn.click(fn=gradio_interface, inputs=input_audio, outputs=[txt_cantonese, txt_mandarin, out_audio])

    # Monitoring: manual refresh button.
    hidden_btn.click(fn=update_monitor_plot, inputs=None, outputs=perf_plot)

    # Monitoring: automatic refresh once per second.
    # A <script> embedded in gr.HTML markup is not a reliable way to start a
    # timer (injected markup is not guaranteed to execute scripts), so the
    # refresh is driven by Gradio's own Timer component instead of a
    # JavaScript auto-clicker on the button above.
    monitor_timer = gr.Timer(1)
    monitor_timer.tick(fn=update_monitor_plot, inputs=None, outputs=perf_plot)

# Launch the app. share=True requests a public gradio.live URL; debug=True
# keeps this cell running so that errors and logs stay visible.
demo.launch(share=True, debug=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://950d1b7083822651c5.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


正在处理: /tmp/gradio/7a6be2ec584a65a19db63e4d0d38be1823409f4a93302b82ddca3eda40130475/audio.wav
正在处理: /tmp/gradio/8af6a863366c7359157cc2f9f3b4e9676bbea589e8c384936ccc189aafe96837/audio.wav
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://950d1b7083822651c5.gradio.live


