In [1]:
import os
import numpy as np
import librosa

In [2]:
# Load an audio file
def load_wav(wavfile, sr=16000):
    """
    Read an audio file through librosa and return only the waveform.

    Args:
        wavfile: Path of the audio file, forwarded to ``librosa.load``.
        sr: Sampling rate forwarded to ``librosa.load`` (default 16000).

    Returns:
        The first element of the tuple returned by ``librosa.load`` (the
        audio samples); the sampling rate it also returns is discarded.
        ``mono=True`` is always requested.
    """
    loaded = librosa.load(wavfile, sr=sr, mono=True)
    return loaded[0]

In [3]:
# Extract the log power spectrum
def extract_log_spectrum(wavfile, sr=16000, n_fft=1024, hop_length=512):
    """
    Compute the log power spectrogram (10 * log10 of |STFT|^2) of an audio file.

    Args:
        wavfile: Path of the audio file; loaded via ``load_wav``.
        sr: Sampling rate passed to ``load_wav`` (default 16000).
        n_fft: FFT size passed to ``librosa.stft`` (default 1024).
        hop_length: Hop size passed to ``librosa.stft`` (default 512).

    Returns:
        Array holding ``10 * log10(max(|STFT|^2, 1e-10))`` for every STFT
        entry, i.e. the power spectrogram in dB with a floor of -100 dB.
    """
    signal = load_wav(wavfile, sr)
    # Magnitude of the short-time Fourier transform (STFT)
    magnitude = np.abs(librosa.stft(signal, n_fft=n_fft, hop_length=hop_length))
    # Power spectrum, floored at 1e-10 so that log10 never receives zero
    floored_power = np.maximum(magnitude ** 2, 1e-10)
    # Convert power to decibels
    return 10 * np.log10(floored_power)

In [4]:
# Compute the LSD
def calculate_lsd(log_spec_ref, log_spec_pred):
    """
    Compute the log-spectral distance (LSD) between two log spectrograms.

    Both inputs are 2-D arrays that are indexed as (rows, frames): the
    second axis is truncated to the shorter of the two, the root-mean-square
    difference is taken over the first axis for every frame, and those
    per-frame values are averaged.

    Args:
        log_spec_ref: Log spectrum of the source (reference) audio.
        log_spec_pred: Log spectrum of the generated audio.

    Returns:
        The mean per-frame RMS difference, in the same units as the inputs.
    """
    # Only the frames present in both spectrograms can be compared
    n_frames = min(log_spec_ref.shape[1], log_spec_pred.shape[1])
    difference = log_spec_ref[:, :n_frames] - log_spec_pred[:, :n_frames]

    # RMS over axis 0 yields one distance value per frame
    per_frame_lsd = np.sqrt(np.mean(difference ** 2, axis=0))

    # Average over frames
    return np.mean(per_frame_lsd)


In [5]:
# Main pipeline
def process_lsd(mapping_file_path, original_base_path, generated_base_path, sr=16000):
    """
    Compute the LSD for every source/converted pair listed in a mapping file.

    Each non-blank line of the mapping file must contain exactly three
    '|'-separated fields: the generated wav name, the source (content) wav
    name, and a third field that is not used here. Whitespace around each
    field is ignored. Blank lines are skipped silently; any other line that
    does not have three fields is reported and skipped.

    The per-file LSD and the overall average are printed.

    Args:
        mapping_file_path: Path of the mapping file (read as UTF-8).
        original_base_path: Directory containing the source wav files.
        generated_base_path: Directory containing the generated wav files.
        sr: Sampling rate passed to ``extract_log_spectrum`` for both files
            (default 16000, the value that was previously hard-coded).

    Returns:
        The average LSD over all processed pairs as a built-in float, or
        ``float('inf')`` when no valid pair was processed.
    """
    total_lsd = 0.0
    count = 0

    # Explicit encoding so the result does not depend on the locale
    with open(mapping_file_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()

            # Blank lines (e.g. a trailing newline) are not format errors
            if not line:
                continue

            parts = [field.strip() for field in line.split('|')]

            # Require exactly three fields
            if len(parts) != 3:
                print(f"Invalid line format: {line}")
                continue

            generated_wav, content_wav, _ = parts

            # Build the audio file paths
            original_wav_path = os.path.join(original_base_path, content_wav)
            generated_wav_path = os.path.join(generated_base_path, generated_wav)

            # Extract the log spectra at the same sampling rate
            log_spec_ref = extract_log_spectrum(original_wav_path, sr=sr)
            log_spec_pred = extract_log_spectrum(generated_wav_path, sr=sr)

            # LSD of the current pair
            lsd = calculate_lsd(log_spec_ref, log_spec_pred)
            print(f"LSD for {generated_wav}: {lsd} dB")

            # Accumulate as a built-in float so the average has one
            # consistent type regardless of the spectrogram dtype
            total_lsd += float(lsd)
            count += 1

    # Average LSD; infinity signals that nothing was processed
    avg_lsd = total_lsd / count if count > 0 else float('inf')
    print(f"\nAverage LSD: {avg_lsd} dB")
    return avg_lsd

In [10]:
# Example usage
mapping_file_path = '/home/sun/FreeVC/convert_S.txt'  # path of the (updated) conversion mapping file
original_base_path = '/home/sun/FreeVC/outputs/wav_o'  # directory of the original audio
generated_base_path = '/home/sun/FreeVC/outputs/starganvc_test'  # directory of the generated audio

In [11]:
# Run the evaluation on the paths configured above; prints the per-file LSD and the average.
process_lsd(mapping_file_path, original_base_path, generated_base_path)

LSD for WTM1_S.wav: 22.67380142211914 dB
LSD for WTM2_S.wav: 28.03598976135254 dB
LSD for WTM3_S.wav: 24.063777923583984 dB
LSD for WTM4_S.wav: 31.461599349975586 dB
LSD for WTM5_S.wav: 30.492700576782227 dB
LSD for WTM6_S.wav: 23.574342727661133 dB
LSD for WTM7_S.wav: 37.937381744384766 dB
LSD for WTM8_S.wav: 27.961679458618164 dB
LSD for WTM9_S.wav: 26.390592575073242 dB
LSD for WTM10_S.wav: 32.9006462097168 dB
LSD for WTM11_S.wav: 25.551027297973633 dB
LSD for WTM12_S.wav: 25.804893493652344 dB
LSD for MTW1_S.wav: 28.983755111694336 dB
LSD for MTW2_S.wav: 19.2847900390625 dB
LSD for MTW3_S.wav: 22.541080474853516 dB
LSD for MTW4_S.wav: 24.247896194458008 dB
LSD for MTW5_S.wav: 35.6830940246582 dB
LSD for MTW6_S.wav: 25.76902961730957 dB
LSD for MTW7_S.wav: 25.161949157714844 dB
LSD for MTW8_S.wav: 22.414928436279297 dB
LSD for MTW9_S.wav: 40.7855224609375 dB
LSD for MTW10_S.wav: 28.969688415527344 dB
LSD for MTW11_S.wav: 27.508275985717773 dB
LSD for MTW12_S.wav: 23.376008987426758 