In [1]:
# Install the third-party packages this notebook needs into the kernel's own environment.
# The explicit `%pip` magic is used instead of bare `pip` so the cell does not depend on
# IPython automagic (which is only applied to single-line cells).
# Versions are pinned to the ones the original install log resolved, for reproducibility.
%pip install transformers==4.46.0 datasets==3.0.2 jiwer==3.0.4

Collecting transformers
  Downloading transformers-4.46.0-py3-none-any.whl.metadata (44 kB)
Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting jiwer
  Downloading jiwer-3.0.4-py3-none-any.whl.metadata (2.6 kB)
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.26.1-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.9.11-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp38-cp38-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3

In [2]:
# Install ipywidgets into the kernel's environment (presumably so download progress
# bars can render as notebook widgets; confirm whether it is still needed).
# `%pip` is used instead of bare `pip` so the cell does not rely on IPython automagic;
# the version is pinned to the one the original install log resolved.
%pip install ipywidgets==8.1.5

Collecting ipywidgets
  Downloading ipywidgets-8.1.5-py3-none-any.whl.metadata (2.3 kB)
Collecting widgetsnbextension~=4.0.12 (from ipywidgets)
  Downloading widgetsnbextension-4.0.13-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab-widgets~=3.0.12 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.13-py3-none-any.whl.metadata (4.1 kB)
Downloading ipywidgets-8.1.5-py3-none-any.whl (139 kB)
Downloading jupyterlab_widgets-3.0.13-py3-none-any.whl (214 kB)
Downloading widgetsnbextension-4.0.13-py3-none-any.whl (2.3 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m50.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: widgetsnbextension, jupyterlab-widgets, ipywidgets
Successfully installed ipywidgets-8.1.5 jupyterlab-widgets-3.0.13 widgetsnbextension-4.0.13
Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import jiwer

In [2]:
# Load the pretrained Wav2Vec 2.0 ASR model and its processor.
# Both are created from one checkpoint id so the processor and the model cannot
# accidentally end up pointing at different checkpoints.
ASR_CHECKPOINT = "facebook/wav2vec2-large-960h"
processor = Wav2Vec2Processor.from_pretrained(ASR_CHECKPOINT)
model = Wav2Vec2ForCTC.from_pretrained(ASR_CHECKPOINT)

preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Audio loading helper
def load_wav(wavfile, sr=16000):
    """
    Read an audio file and return its waveform.

    The file is decoded with ``librosa.load`` using the requested sample rate
    ``sr`` and ``mono=True``. ``librosa.load`` also returns the sample rate;
    that second value is discarded and only the signal is returned.
    """
    loaded = librosa.load(wavfile, sr=sr, mono=True)
    return loaded[0]

In [4]:
# Speech-to-text with the ASR model
def transcribe_audio(wavfile, sr=16000):
    """
    Transcribe an audio file to lower-case text with the ASR model.

    Args:
        wavfile: Path of the audio file to transcribe.
        sr: Sample rate in Hz. The same value is used to load the audio and to
            describe it to the processor, so the two cannot disagree.
            Defaults to 16000, the rate previously hard-coded here.

    Returns:
        The decoded transcription, lower-cased.
    """
    # Load the waveform at the rate that is also reported to the processor below.
    wav = load_wav(wavfile, sr=sr)

    # Turn the waveform into model inputs (PyTorch tensors).
    input_values = processor(wav, return_tensors="pt", sampling_rate=sr).input_values

    # Keep the inputs on the same device as the model so inference still works
    # after the model has been moved (e.g. model.to("cuda")).
    input_values = input_values.to(model.device)

    # Inference only: gradients are not needed.
    with torch.no_grad():
        logits = model(input_values).logits

    # Pick the highest-scoring token id at every position.
    predicted_ids = torch.argmax(logits, dim=-1)

    # A single utterance was fed in, so decode the first row of the batch.
    transcription = processor.decode(predicted_ids[0])

    return transcription.lower()

In [5]:
# WER / CER computation
def calculate_wer_cer(reference_text, hypothesis_text):
    """
    Return the word error rate and character error rate as a ``(wer, cer)`` tuple.

    Both rates are computed by ``jiwer`` with ``reference_text`` as the
    reference and ``hypothesis_text`` as the hypothesis.
    """
    return (
        jiwer.wer(reference_text, hypothesis_text),
        jiwer.cer(reference_text, hypothesis_text),
    )

In [6]:
# Main evaluation routine
def process_wer_cer(mapping_file_path, original_base_path, generated_base_path):
    """
    Compute WER and CER between source and converted speech listed in a mapping file.

    Every non-blank line of the mapping file must contain exactly three
    ``|``-separated fields: the generated wav name, the source (content) wav
    name, and a third field that is ignored. Both wavs are transcribed with
    ``transcribe_audio``; the source transcription is used as the reference
    and the generated transcription as the hypothesis.

    Args:
        mapping_file_path: Path of the ``|``-separated mapping file.
        original_base_path: Directory containing the source wavs.
        generated_base_path: Directory containing the generated wavs.

    Returns:
        ``(avg_wer, avg_cer)``: the unweighted means of the per-file rates
        (not corpus-level rates). Both are ``float('inf')`` when no line
        could be scored. The per-file and average values are also printed.
    """
    total_wer = 0
    total_cer = 0
    count = 0

    # Read the mapping file with an explicit encoding so parsing does not
    # depend on the machine's locale.
    with open(mapping_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing newline) are not malformed entries.
            if not line.strip():
                continue

            # Strip every field so "a.wav | b.wav | x" does not produce file
            # names with stray spaces.
            parts = [field.strip() for field in line.strip().split('|')]

            # Each entry must have exactly three fields.
            if len(parts) != 3:
                print(f"Invalid line format: {line}")
                continue

            generated_wav, content_wav, _ = parts

            # Build the audio file paths.
            original_wav_path = os.path.join(original_base_path, content_wav)
            generated_wav_path = os.path.join(generated_base_path, generated_wav)

            # Transcribe the source audio (reference) and the converted audio (hypothesis).
            reference_text = transcribe_audio(original_wav_path)
            hypothesis_text = transcribe_audio(generated_wav_path)

            # jiwer raises ValueError for an empty reference, which would abort
            # the whole run; report the file and move on instead.
            if not reference_text.strip():
                print(f"Empty reference transcription for {content_wav}; skipping {generated_wav}")
                continue

            # Score this pair.
            wer, cer = calculate_wer_cer(reference_text, hypothesis_text)
            print(f"WER for {generated_wav}: {wer}, CER: {cer}")

            total_wer += wer
            total_cer += cer
            count += 1

    # Mean of the per-file rates; inf signals that nothing was scored.
    avg_wer = total_wer / count if count > 0 else float('inf')
    avg_cer = total_cer / count if count > 0 else float('inf')

    print(f"\nAverage WER: {avg_wer}, Average CER: {avg_cer}")

    # Hand the averages back so callers can use them programmatically.
    return avg_wer, avg_cer

In [11]:
# Example usage: inputs for the evaluation call in the next cell.
# NOTE(review): these are absolute paths on one machine; adjust them before running elsewhere.
mapping_file_path = '/home/sun/FreeVC/convert_S.txt'  # updated conversion (mapping) file path
original_base_path = '/home/sun/FreeVC/outputs/wav_o'  # directory of the original audio
generated_base_path = '/home/sun/FreeVC/outputs/starganvc_test'  # directory of the generated audio

In [12]:
# Run the evaluation with the paths configured above; results are printed per file.
process_wer_cer(
    mapping_file_path=mapping_file_path,
    original_base_path=original_base_path,
    generated_base_path=generated_base_path,
)

WER for WTM1_S.wav: 0.6666666666666666, CER: 0.2777777777777778
WER for WTM2_S.wav: 0.09090909090909091, CER: 0.03773584905660377
WER for WTM3_S.wav: 0.5263157894736842, CER: 0.19387755102040816
WER for WTM4_S.wav: 1.0, CER: 0.5
WER for WTM5_S.wav: 0.0, CER: 0.0
WER for WTM6_S.wav: 0.4, CER: 0.1414141414141414
WER for WTM7_S.wav: 1.0, CER: 0.6666666666666666
WER for WTM8_S.wav: 0.0, CER: 0.0
WER for WTM9_S.wav: 0.7, CER: 0.36
WER for WTM10_S.wav: 1.5, CER: 0.29411764705882354
WER for WTM11_S.wav: 0.09090909090909091, CER: 0.05660377358490566
WER for WTM12_S.wav: 0.631578947368421, CER: 0.36363636363636365
WER for MTW1_S.wav: 1.0, CER: 1.0
WER for MTW2_S.wav: 0.09090909090909091, CER: 0.03773584905660377
WER for MTW3_S.wav: 0.15789473684210525, CER: 0.030303030303030304
WER for MTW4_S.wav: 0.06666666666666667, CER: 0.015384615384615385
WER for MTW5_S.wav: 1.0, CER: 0.8888888888888888
WER for MTW6_S.wav: 0.0, CER: 0.0
WER for MTW7_S.wav: 0.15, CER: 0.04081632653061224
WER for MTW8_S.wav: