In [1]:
import os
import numpy as np
import librosa
import pyworld

In [14]:
# 加载音频文件
def load_wav(wavfile, sr=16000):
    """
    加载音频文件并返回音频信号
    """
    wav, _ = librosa.load(wavfile, sr=sr, mono=True)
    return wav

In [3]:
# 提取F0特征
def extract_f0(wavfile, sr=22050, frame_period=5.0):
    """
    提取音频的基频(F0)特征
    """
    wav = load_wav(wavfile, sr)
    f0, timeaxis = pyworld.harvest(wav.astype(np.float64), sr, frame_period=frame_period)
    return f0

In [8]:
# 计算F0的皮尔逊相关系数 (PCC)
def calculate_f0_pcc(f0_true, f0_pred):
    """
    计算基频F0的皮尔逊相关系数 (PCC)
    """
    # 去除0值的帧，并确保两个序列长度对齐
    non_zero_idx_true = np.where(f0_true > 0)[0]
    non_zero_idx_pred = np.where(f0_pred > 0)[0]

    # 确保两个非零F0序列的长度一致，截取较短的部分
    min_length = min(len(non_zero_idx_true), len(non_zero_idx_pred))
    if min_length == 0:
        return float('nan')  # 如果没有有效的非零F0帧，返回nan

    f0_true_aligned = f0_true[non_zero_idx_true[:min_length]]
    f0_pred_aligned = f0_pred[non_zero_idx_pred[:min_length]]

    # 计算皮尔逊相关系数
    if len(f0_true_aligned) > 1 and len(f0_pred_aligned) > 1:
        pcc = np.corrcoef(f0_true_aligned, f0_pred_aligned)[0, 1]
    else:
        pcc = float('nan')  # 如果F0序列太短，返回nan

    return pcc


In [5]:
# 主流程函数
def process_f0_pcc(mapping_file_path, original_base_path, generated_base_path):
    """
    从映射文件中提取源语音和转换语音，计算F0的PCC
    """
    total_pcc = 0
    count = 0

    # 读取映射文件
    with open(mapping_file_path, 'r') as f:
        for line in f:
            parts = line.strip().split('|')

            # 检查是否有足够的字段
            if len(parts) != 3:
                print(f"Invalid line format: {line}")
                continue

            generated_wav, content_wav, _ = parts

            # 构建音频文件路径
            original_wav_path = os.path.join(original_base_path, content_wav)
            generated_wav_path = os.path.join(generated_base_path, generated_wav)

            # 提取F0特征
            f0_true = extract_f0(original_wav_path)
            f0_pred = extract_f0(generated_wav_path)

            # 计算当前文件的F0 PCC
            pcc = calculate_f0_pcc(f0_true, f0_pred)
            print(f"PCC for {generated_wav}: {pcc}")

            if not np.isnan(pcc):
                total_pcc += pcc
                count += 1

    # 计算平均PCC
    avg_pcc = total_pcc / count if count > 0 else float('nan')
    print(f"\nAverage F0 PCC: {avg_pcc}")

In [12]:
# 示例用法
mapping_file_path = '/home/sun/FreeVC/convert_S.txt'  # 更新后的转换文件路径
original_base_path = '/home/sun/FreeVC/outputs/wav_o'  # 原始音频目录
generated_base_path = '/home/sun/FreeVC/outputs/starganvc_test'  # 生成音频目录

In [15]:
process_f0_pcc(mapping_file_path, original_base_path, generated_base_path)

PCC for WTM1_S.wav: 0.5327657307598203
PCC for WTM2_S.wav: 0.1722772494817672
PCC for WTM3_S.wav: 0.04528173770289166
PCC for WTM4_S.wav: 0.28992184635396856
PCC for WTM5_S.wav: 0.20629718188910542
PCC for WTM6_S.wav: -0.07141008899422654
PCC for WTM7_S.wav: 0.09289963602599842
PCC for WTM8_S.wav: 0.11303759519652687
PCC for WTM9_S.wav: 0.32508123385736065
PCC for WTM10_S.wav: -0.2774398921528925
PCC for WTM11_S.wav: 0.3392709772880627
PCC for WTM12_S.wav: 0.15000551403027526
PCC for MTW1_S.wav: -0.010623376329028256
PCC for MTW2_S.wav: 0.36273682671078783
PCC for MTW3_S.wav: -0.1799338145113961
PCC for MTW4_S.wav: 0.0677391708721042
PCC for MTW5_S.wav: -0.003450363208532004
PCC for MTW6_S.wav: -0.1062303820001645
PCC for MTW7_S.wav: -0.07900669691861933
PCC for MTW8_S.wav: 0.10440242273844742
PCC for MTW9_S.wav: -0.030626192785299353
PCC for MTW10_S.wav: 0.23090431616461843
PCC for MTW11_S.wav: 0.2822790956375842
PCC for MTW12_S.wav: 0.013869097876842427
PCC for WTW1_S.wav: 0.68394518