In [24]:
import os
import numpy as np
import librosa
import pyworld

In [25]:
# 加载音频文件
def load_wav(wavfile, sr=22050):
    """
    加载音频文件并返回音频信号
    """
    wav, _ = librosa.load(wavfile, sr=sr, mono=True)
    return wav

In [26]:
# 提取F0特征
def extract_f0(wavfile, sr=22050, frame_period=5.0):
    """
    提取音频的基频(F0)特征
    """
    wav = load_wav(wavfile, sr)
    f0, timeaxis = pyworld.harvest(wav.astype(np.float64), sr, frame_period=frame_period)
    return f0

In [27]:
# 计算RMSE
def calculate_rmse(f0_true, f0_pred):
    """
    计算F0的均方根误差 (RMSE)，确保两个F0特征序列的长度一致
    """
    # 去除0值的帧，并确保两个序列长度对齐
    non_zero_idx_true = np.where(f0_true > 0)[0]
    non_zero_idx_pred = np.where(f0_pred > 0)[0]

    # 确保两个非零F0序列的长度一致，截取较短的部分
    min_length = min(len(non_zero_idx_true), len(non_zero_idx_pred))
    f0_true = f0_true[non_zero_idx_true[:min_length]]
    f0_pred = f0_pred[non_zero_idx_pred[:min_length]]

    # 计算RMSE
    rmse = np.sqrt(np.mean((f0_true - f0_pred) ** 2))
    return rmse


In [28]:
def process_f0_rmse(mapping_file_path, original_base_path, generated_base_path):
    """
    主流程：从映射文件中提取原始音频和生成音频的F0特征，计算F0 RMSE
    """
    total_rmse = 0
    count = 0

    # 读取映射文件
    with open(mapping_file_path, 'r') as f:
        for line in f:
            parts = line.strip().split('|')

            # 检查是否有足够的字段
            if len(parts) != 3:
                print(f"Invalid line format: {line}")
                continue

            generated_wav, content_wav, _ = parts

            # 构建音频文件路径
            original_wav_path = os.path.join(original_base_path, content_wav)
            generated_wav_path = os.path.join(generated_base_path, generated_wav)

            # 提取F0特征
            f0_true = extract_f0(original_wav_path)
            f0_pred = extract_f0(generated_wav_path)

            # 计算当前文件的F0 RMSE
            rmse = calculate_rmse(f0_true, f0_pred)
            print(f"RMSE for {generated_wav}: {rmse} Hz")

            total_rmse += rmse
            count += 1

    # 计算平均RMSE
    avg_rmse = total_rmse / count if count > 0 else float('inf')
    print(f"\nAverage F0 RMSE: {avg_rmse} Hz")


In [31]:
# 示例用法
mapping_file_path = '/home/sun/FreeVC/convert_S.txt' 
original_base_path = '/home/sun/FreeVC/outputs/wav_o'  
generated_base_path = '/home/sun/FreeVC/outputs/starganvc_test'  

In [32]:
process_f0_rmse(mapping_file_path, original_base_path, generated_base_path)

RMSE for WTM1_S.wav: 76.4315257744199 Hz
RMSE for WTM2_S.wav: 60.78550845224539 Hz
RMSE for WTM3_S.wav: 92.02460336705748 Hz
RMSE for WTM4_S.wav: 97.53156392204973 Hz
RMSE for WTM5_S.wav: 74.3373702343773 Hz
RMSE for WTM6_S.wav: 110.22289816941901 Hz
RMSE for WTM7_S.wav: 72.0628311665202 Hz
RMSE for WTM8_S.wav: 78.86170759608765 Hz
RMSE for WTM9_S.wav: 73.60588646850526 Hz
RMSE for WTM10_S.wav: 92.90537425767195 Hz
RMSE for WTM11_S.wav: 74.26556080141985 Hz
RMSE for WTM12_S.wav: 93.3660489867943 Hz
RMSE for MTW1_S.wav: 101.01525133545819 Hz
RMSE for MTW2_S.wav: 101.61324187079661 Hz
RMSE for MTW3_S.wav: 91.45408488328425 Hz
RMSE for MTW4_S.wav: 97.88933908049367 Hz
RMSE for MTW5_S.wav: 92.37019431596147 Hz
RMSE for MTW6_S.wav: 97.5600961762234 Hz
RMSE for MTW7_S.wav: 84.73739517180806 Hz
RMSE for MTW8_S.wav: 73.66994835442111 Hz
RMSE for MTW9_S.wav: 102.21112428096139 Hz
RMSE for MTW10_S.wav: 91.65260286157485 Hz
RMSE for MTW11_S.wav: 88.06985719265988 Hz
RMSE for MTW12_S.wav: 82.21688