In [1]:
import torch
import torchaudio
import bigvgan
import librosa
import soundfile as sf
from vocos import Vocos
import speechbrain as sb
from speechbrain.utils.fetching import fetch
from speechbrain.utils.data_utils import split_path
from speechbrain.lobes.models.FastSpeech2 import mel_spectogram
from speechbrain.inference.vocoders import HIFIGAN
from speechbrain.inference.vocoders import DiffWaveVocoder
from transformers import UnivNetFeatureExtractor, UnivNetModel
import librosa
import librosa.display
import numpy as np
from pathlib import Path
from scipy.io.wavfile import write
import IPython.display as ipd
from tqdm import tqdm
import os
import time
from torch import nn
import pandas as pd

  if ismodule(module) and hasattr(module, '__file__'):


In [2]:
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Przetworzenie plików za pomocą modeli

In [3]:
def get_mel_from_file(file_path, n_mels, sr=22050, compression=True):
    signal, rate = librosa.load(file_path, sr=sr, mono=True)
    signal = torch.FloatTensor(signal)
    spectrogram, _ = mel_spectogram(
        audio=signal,
        sample_rate=rate,
        hop_length=256,
        win_length=1024,
        n_mels=n_mels,
        n_fft=1024,
        f_min=0.0,
        f_max=8000.0,
        power=1,
        normalized=False,
        min_max_energy_norm=True,
        norm="slaney",
        mel_scale="slaney",
        compression=compression
    )

    return spectrogram

In [4]:
DATASET = 'cremad'

In [5]:
files_to_process = [str(file) for file in Path(f'{DATASET}/noised_data').rglob('*') if file.is_file()]

In [61]:
len(files_to_process)

1439

### HiFi-GAN

In [6]:
hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-libritts-22050Hz", savedir="pretrained_models/tts-hifigan-libritts-22050Hz")

times = []
for file_path in tqdm(files_to_process):
    spectrogram = get_mel_from_file(file_path, 80)
    
    start = time.time()
    waveforms = hifi_gan.decode_batch(spectrogram)
    end = time.time()
    times.append(end-start)

    if not os.path.exists(f'{DATASET}/noised_generated/hifigan'):
        os.makedirs(f'{DATASET}/noised_generated/hifigan')
    torchaudio.save(f'{DATASET}/noised_generated\\hifigan\\' + file_path[18:], waveforms.squeeze(1), 22050)
print('Średni czas przetwarzania:', np.mean(times))

  WeightNorm.apply(module, name, dim)
  state_dict = torch.load(path, map_location=device)
100%|██████████| 744/744 [13:04<00:00,  1.05s/it]

Średni czas przetwarzania: 1.0233350021223868





### Vocos

In [7]:
class FeatureExtractor(nn.Module):
    """Base class for feature extractors."""

    def forward(self, audio: torch.Tensor, **kwargs) -> torch.Tensor:
        """
        Extract features from the given audio.

        Args:
            audio (Tensor): Input audio waveform.

        Returns:
            Tensor: Extracted features of shape (B, C, L), where B is the batch size,
                    C denotes output features, and L is the sequence length.
        """
        raise NotImplementedError("Subclasses must implement the forward method.")


class MelSpectrogramFeatures(FeatureExtractor):
    def __init__(self, sample_rate=24000, n_fft=1024, hop_length=256, n_mels=100, padding="center"):
        super().__init__()
        if padding not in ["center", "same"]:
            raise ValueError("Padding must be 'center' or 'same'.")
        self.padding = padding
        self.mel_spec = torchaudio.transforms.MelSpectrogram(
            sample_rate=sample_rate,
            n_fft=n_fft,
            hop_length=hop_length,
            n_mels=n_mels,
            center=padding == "center",
            power=1,
        )

    def forward(self, audio, **kwargs):
        if self.padding == "same":
            pad = self.mel_spec.win_length - self.mel_spec.hop_length
            audio = torch.nn.functional.pad(audio, (pad // 2, pad // 2), mode="reflect")
        mel = self.mel_spec(audio)
        features = safe_log(mel)
        return features

def safe_log(x: torch.Tensor, clip_val: float = 1e-7) -> torch.Tensor:
    """
    Computes the element-wise logarithm of the input tensor with clipping to avoid near-zero values.

    Args:
        x (Tensor): Input tensor.
        clip_val (float, optional): Minimum value to clip the input tensor. Defaults to 1e-7.

    Returns:
        Tensor: Element-wise logarithm of the input tensor with clipping applied.
    """
    return torch.log(torch.clip(x, min=clip_val))

In [52]:
vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz").to(DEVICE)
feature_extractor = MelSpectrogramFeatures(
    sample_rate=24000,
    n_fft=1024,
    hop_length=256,
    n_mels=100,
    padding="center"
)

times = []
for file_path in tqdm(files_to_process):
    resample_transform = torchaudio.transforms.Resample(orig_freq=16000, new_freq=24000)
    waveform, sr = torchaudio.load(file_path)
    if sr != 24000:
        waveform = resample_transform(waveform)
    spectrogram = feature_extractor(waveform).to(DEVICE)

    start = time.time()
    
    waveforms = vocos.decode(spectrogram)

    end = time.time()
    times.append(end-start)
    
    if not os.path.exists(f'{DATASET}/noised_generated/vocos'):
        os.makedirs(f'{DATASET}/noised_generated/vocos')
    torchaudio.save(f'{DATASET}/noised_generated\\vocos\\' + file_path[18:], waveforms.cpu().squeeze(1), 24000)

print('Średni czas przetwarzania:', np.mean(times))

100%|██████████| 744/744 [00:06<00:00, 107.23it/s]

Średni czas przetwarzania: 0.004448821788193077





### BigV-GAN

In [9]:
model = bigvgan.BigVGAN.from_pretrained('nvidia/bigvgan_v2_24khz_100band_256x', use_cuda_kernel=False)
model.remove_weight_norm()
model = model.eval().to(DEVICE)

times = []
for file_path in tqdm(files_to_process):
    spectrogram = get_mel_from_file(file_path, 100, sr=model.h.sampling_rate)

    start = time.time()
    
    with torch.inference_mode():
        waveforms = model(spectrogram.unsqueeze(0).to(DEVICE)).squeeze(0).cpu()

    end = time.time()
    times.append(end-start)
    
    if not os.path.exists(f'{DATASET}/noised_generated/bigvgan'):
        os.makedirs(f'{DATASET}/noised_generated/bigvgan')
    torchaudio.save(f'{DATASET}/noised_generated\\bigvgan\\' + file_path[18:], waveforms.squeeze(1), model.h.sampling_rate)

print('Średni czas przetwarzania:', np.mean(times))

  WeightNorm.apply(module, name, dim)


Loading weights from nvidia/bigvgan_v2_24khz_100band_256x


  checkpoint_dict = torch.load(model_file, map_location=map_location)


Removing weight norm...


100%|██████████| 744/744 [02:18<00:00,  5.38it/s]

Średni czas przetwarzania: 0.18014444266596147





### DiffWave

In [13]:
diffwave = DiffWaveVocoder.from_hparams(source="speechbrain/tts-diffwave-ljspeech", savedir="pretrained_models/tts-diffwave-ljspeech")

times = []
for file_path in tqdm(files_to_process):
    spectrogram = get_mel_from_file(file_path, 80)
    
    start = time.time()
    
    waveforms = diffwave.decode_batch(
        spectrogram,
        hop_len=256,
        fast_sampling=True,
        fast_sampling_noise_schedule=[0.0001, 0.001, 0.01, 0.05, 0.2, 0.5],
    )

    end = time.time()
    times.append(end-start)
    
    if not os.path.exists(f'{DATASET}/noised_generated/diffwave'):
        os.makedirs(f'{DATASET}/noised_generated/diffwave')
    torchaudio.save(f'{DATASET}/noised_generated\\diffwave\\' + file_path[18:], waveforms.squeeze(1), 22050)

print('Średni czas przetwarzania:', np.mean(times))

  state_dict = torch.load(path, map_location=device)
100%|██████████| 744/744 [1:15:42<00:00,  6.10s/it]

Średni czas przetwarzania: 6.098959868954074





### UnivNet

In [10]:
model_id_or_path = "dg845/univnet-dev"
model = UnivNetModel.from_pretrained(model_id_or_path)
feature_extractor = UnivNetFeatureExtractor.from_pretrained(model_id_or_path)

times = []
for file_path in tqdm(files_to_process):
    input_audio, sampling_rate = librosa.load(file_path, sr=feature_extractor.sampling_rate)
    inputs = feature_extractor(
        input_audio, 
        sampling_rate=sampling_rate, 
        pad_end=True, 
        return_tensors="pt"
    )

    start = time.time()
    
    with torch.no_grad():
        waveforms = model(**inputs)
    waveforms = torch.Tensor(feature_extractor.batch_decode(**waveforms))
    
    end = time.time()
    times.append(end-start)
    
    if not os.path.exists(f'{DATASET}/noised_generated/univnet/'):
        os.makedirs(f'{DATASET}/noised_generated/univnet/')
    torchaudio.save(f'{DATASET}/noised_generated\\univnet\\' + file_path[18:], waveforms.squeeze(1), feature_extractor.sampling_rate)

print('Średni czas przetwarzania:', np.mean(times))

  waveforms = torch.Tensor(feature_extractor.batch_decode(**waveforms))
100%|██████████| 744/744 [02:15<00:00,  5.51it/s]

Średni czas przetwarzania: 0.17294928591738465





### WaveGlow

In [12]:
waveglow = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_waveglow', model_math='fp32')
waveglow = waveglow.to(DEVICE)

times = []
for file_path in tqdm(files_to_process):
    y, sr = librosa.load(file_path, sr=22050, mono=True)
    spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=1024, hop_length=256, win_length=1024, n_mels=80, fmin=0, fmax=8000)
    log_mel_spectrogram = np.log(spectrogram + 1e-6)
    mel_input = torch.tensor(log_mel_spectrogram).unsqueeze(0).float().to(DEVICE)

    start = time.time()
    
    with torch.no_grad():
        waveforms = waveglow.infer(mel_input)

    end = time.time()
    times.append(end-start)
    
    if not os.path.exists(f'{DATASET}/noised_generated/waveglow'):
        os.makedirs(f'{DATASET}/noised_generated/waveglow')
    torchaudio.save(f'{DATASET}/noised_generated\\waveglow\\' + file_path[18:], waveforms.squeeze(1).cpu(), 22050)

print('Średni czas przetwarzania:', np.mean(times))

Using cache found in C:\Users\wikto/.cache\torch\hub\NVIDIA_DeepLearningExamples_torchhub
  ckpt = torch.load(ckpt_file)
100%|██████████| 744/744 [06:59<00:00,  1.77it/s]

Średni czas przetwarzania: 0.22682068585067666





### Griffin-Lim

In [11]:
times = []
for file_path in tqdm(files_to_process):
    spectrogram = get_mel_from_file(file_path, 80, 24000, compression=False)
    
    start = time.time()
    
    spectrogram = spectrogram.numpy()
    spectrogram = np.maximum(spectrogram, 1e-10)

    mel_basis = librosa.filters.mel(sr=24000, n_fft=1024, n_mels=spectrogram.shape[0], fmin=0, fmax=8000)
    inv_mel = np.linalg.pinv(mel_basis)
    linear_spectrogram = np.dot(inv_mel, spectrogram)

    linear_spectrogram = np.maximum(linear_spectrogram, 1e-10)
    linear_spectrogram = librosa.db_to_amplitude(librosa.power_to_db(linear_spectrogram))

    waveforms = librosa.griffinlim(
        linear_spectrogram, 
        hop_length=256, 
        win_length=1024, 
        n_iter=32
    )

    end = time.time()
    times.append(end-start)
    
    if not os.path.exists(f'{DATASET}/noised_generated/griffinlim'):
        os.makedirs(f'{DATASET}/noised_generated/griffinlim')
    torchaudio.save(f'{DATASET}/noised_generated\\griffinlim\\' + file_path[18:], torch.Tensor(waveforms).unsqueeze(0), 24000)

print('Średni czas przetwarzania:', np.mean(times))

100%|██████████| 744/744 [02:08<00:00,  5.80it/s]

Średni czas przetwarzania: 0.16712379968294533





# Liczenie metryk

In [14]:
with open(f"{DATASET}/noises.txt", "r") as file:
    noise_list = [line.strip() for line in file]

In [15]:
noises = ['white', 'blue', 'violet', 'brownian', 'pink', 'impulse', 'frequency', 'clipping']

In [16]:
noise_indexes = {noise: [] for noise in noises}

for i, noise in enumerate(noise_list):
    noise_indexes[noise].append(i)

In [17]:
for noise, indexes in noise_indexes.items():
    print(f'{noise}: {len(indexes)}')

white: 84
blue: 89
violet: 88
brownian: 75
pink: 91
impulse: 97
frequency: 97
clipping: 123


In [18]:
len(noise_list)

744

In [19]:
from scripts.metrics import calculate_metrics_for_all_data

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
import numpy as np
import librosa
import torch
import torchaudio
from scipy.linalg import sqrtm
from transformers import Wav2Vec2Processor, Wav2Vec2Model
from sklearn.metrics.pairwise import rbf_kernel, polynomial_kernel
from pymcd.mcd import Calculate_MCD
from torch.nn.utils.rnn import pad_sequence
from torchmetrics.audio.pesq import PerceptualEvaluationSpeechQuality
from tqdm import tqdm
import random


processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
mcd_toolbox = Calculate_MCD(MCD_mode="dtw")
pesq = PerceptualEvaluationSpeechQuality(16000, 'wb')


def load_audio(file_path):
    """Wczytuje plik WAV i resampluje do target_sr."""
    waveform, sr = torchaudio.load(file_path)
    waveform = waveform.mean(dim=0)  # Konwersja do mono
    return waveform


def calculate_single_sdr(src_wav, gen_wav):
    src_spec = librosa.stft(src_wav.cpu().numpy())
    gen_spec = librosa.stft(gen_wav.cpu().numpy())

    src_log = librosa.amplitude_to_db(np.abs(src_spec), ref=np.max)
    gen_log = librosa.amplitude_to_db(np.abs(gen_spec), ref=np.max)

    min_len = min(src_log.shape[1], gen_log.shape[1])
    src_log = src_log[:, :min_len]
    gen_log = gen_log[:, :min_len]

    noise = src_log - gen_log
    sdr = 10 * np.log10(np.sum(np.abs(src_log) ** 2) / np.sum(np.abs(noise) ** 2))
    return sdr


def calculate_sdr(src_wavs, gen_wavs):
    """
    all wavs should be in the same sample rate
    """
    sdrs = []
    for src_wav, gen_wav in zip(src_wavs, gen_wavs):
        sdrs.append(calculate_single_sdr(src_wav, gen_wav))
    return np.mean(sdrs)


def extract_embeddings(processor, model, waveforms):
    """Ekstrakcja embeddingów za pomocą pretrenowanego modelu Wav2Vec 2.0."""
    resampler = torchaudio.transforms.Resample(48000, 16000)
    inputs = processor(pad_sequence([resampler(wav) for wav in waveforms], batch_first=True), sampling_rate=16000, return_tensors="pt", padding=True)
    with torch.no_grad():
        outputs = model(input_values=inputs['input_values'].squeeze(0))
    # Średnia po czasie dla globalnych embeddingów
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.numpy()


def calculate_fad(embeddings1, embeddings2):
    """Oblicza Frechet Audio Distance (FAD) między dwoma zbiorami embeddingów."""
    embeddings1 = np.atleast_2d(embeddings1)
    embeddings2 = np.atleast_2d(embeddings2)

    mean1, mean2 = np.mean(embeddings1, axis=0), np.mean(embeddings2, axis=0)
    cov1, cov2 = np.cov(embeddings1, rowvar=False), np.cov(embeddings2, rowvar=False)
    
    if cov1.ndim == 0 or cov2.ndim == 0:  # Jeśli covariancje są skalarami
        cov1 = np.array([[cov1]])
        cov2 = np.array([[cov2]])
    
    cov_sqrt = sqrtm(cov1 @ cov2)
    if np.iscomplexobj(cov_sqrt):
        cov_sqrt = cov_sqrt.real
    fad = np.sum((mean1 - mean2) ** 2) + np.trace(cov1 + cov2 - 2 * cov_sqrt)
    return fad


def calculate_kid(embeddings1, embeddings2, kernel='rbf', gamma=None):
    """Oblicza Kernel Inception Distance (KID) między dwoma zbiorami embeddingów."""
    
    def kernel_matrix(X, Y=None):
        if kernel == 'rbf':
            return rbf_kernel(X, Y, gamma=gamma)
        elif kernel == 'polynomial':
            return polynomial_kernel(X, Y, degree=3, coef0=1)
        else:
            raise ValueError(f"Nieobsługiwany kernel: {kernel}")
    
    K_XX = kernel_matrix(embeddings1)
    K_YY = kernel_matrix(embeddings2)
    K_XY = kernel_matrix(embeddings1, embeddings2)
    
    m, n = embeddings1.shape[0], embeddings2.shape[0]
    kid = (np.sum(K_XX) / (m * m) + np.sum(K_YY) / (n * n) - 2 * np.sum(K_XY) / (m * n))
    return kid


def calculate_mcd(src_files, gen_files):
    results = []
    for src, gen in zip(src_files, gen_files):
        results.append(mcd_toolbox.calculate_mcd(src, gen))
    return np.mean(results)


def calculate_pesq(src_wavs, gen_wavs):
    results = []
    for src, gen in zip(src_wavs, gen_wavs):
        min_len = min(len(src), len(gen))
        src = src[:min_len]
        gen = gen[:min_len]
        results.append(pesq(src, gen))
    return np.mean(results)


def calculate_metrics(src_wavs, gen_wavs, src_embeddings=None, gen_embeddings=None):
    '''
    Jeżeli zamiast ścieżki do wczytania dostarczane są wcześniej wczytane wav, powinny mieć sr=48000
    '''
    results = {}

    src_wavs_str, gen_wavs_str = [], []
    for src, gen in zip(src_wavs, gen_wavs):
        if isinstance(src, str) and isinstance(gen, str):
            src_wavs_str.append(src)
            gen_wavs_str.append(gen)
    if src_wavs_str:
        results['mcd'] = calculate_mcd(src_wavs_str, gen_wavs_str)

    src_wavs = [load_audio(wav) if isinstance(wav, str) else wav.torch() if isinstance(wav, np.ndarray) else wav for wav in src_wavs]
    gen_wavs = [load_audio(wav) if isinstance(wav, str) else wav.torch() if isinstance(wav, np.ndarray) else wav for wav in gen_wavs]

    if src_embeddings is None:
        src_embeddings = extract_embeddings(processor, model, pad_sequence(src_wavs, batch_first=True))
    if gen_embeddings is None:
        gen_embeddings = extract_embeddings(processor, model, pad_sequence(gen_wavs, batch_first=True))

    results['pesq'] = calculate_pesq(src_wavs, gen_wavs)
    results['sdr'] = calculate_sdr(src_wavs, gen_wavs)
    results['fad'] = calculate_fad(src_embeddings, gen_embeddings)
    results['kid'] = calculate_kid(src_embeddings, gen_embeddings)

    return results


def calculate_metrics_for_all_data(src_paths, gen_paths_list, model_names, n_split=10):
    '''
    funkcja zoptymalizowana pod wyliczanie wszystkich metryk na raz dla każdego modelu
    '''
    results =  {model_name: {'sdr': [], 'fad': [], 'kid': [], 'mcd': [], 'pesq': []} for model_name in model_names}

    random.seed(2137)
    indexes = list(range(len(src_paths)))[:]
    random.shuffle(indexes)
    section_range = len(indexes) // n_split
    rest = len(indexes) % n_split
    sections = []
    start = 0
    for i in range(n_split):
        dodatkowy = 1 if i < rest else 0
        end = start + section_range + dodatkowy
        sections.append(set(indexes[start:end]))
        start = end

    for section in tqdm(sections):
        section_src_paths = [path for i, path in enumerate(src_paths) if i in section]
        src_wavs = [load_audio(path) for path in section_src_paths]
        src_log_specs = [librosa.amplitude_to_db(np.abs(librosa.stft(wav.cpu().numpy())), ref=np.max) for wav in src_wavs]
        src_embeddings = extract_embeddings(processor, model, src_wavs)
        src_mean = np.mean(src_embeddings, axis=0)
        src_cov = np.cov(src_embeddings, rowvar=False)
        src_K = rbf_kernel(src_embeddings)
        # src_wavs_int = [(wav.numpy() * 32767).astype(np.int16) for wav in src_wavs]

        for gen_paths, model_name in tqdm(zip(gen_paths_list, model_names)):
            result = results[model_name]
            section_gen_paths = [path for i, path in enumerate(gen_paths) if i in section]
            gen_wavs = [load_audio(path) for path in section_gen_paths]
            gen_log_specs = [librosa.amplitude_to_db(np.abs(librosa.stft(wav.cpu().numpy())), ref=np.max) for wav in gen_wavs]
            gen_embeddings = extract_embeddings(processor, model, gen_wavs)
            gen_mean = np.mean(gen_embeddings, axis=0)
            gen_cov = np.cov(gen_embeddings, rowvar=False)
            gen_K = rbf_kernel(gen_embeddings)
            # gen_wavs_int = [(wav.numpy() * 32767).astype(np.int16) for wav in gen_wavs]

            # SDR
            sdrs = []
            for src, gen in zip(src_log_specs, gen_log_specs):
                min_len = min(src.shape[1], gen.shape[1])
                src = src[:, :min_len]
                gen = gen[:, :min_len]
                noise = src - gen
                sdr = 10 * np.log10(np.sum(np.abs(src) ** 2) / np.sum(np.abs(noise) ** 2))
                sdrs.append(sdr)
            result['sdr'].append(np.mean(sdrs))

            # FAD
            cov_sqrt = sqrtm(src_cov @ gen_cov)
            if np.iscomplexobj(cov_sqrt):
                cov_sqrt = cov_sqrt.real
            fad = np.sum((src_mean - gen_mean) ** 2) + np.trace(src_cov + gen_cov - 2 * cov_sqrt)
            result['fad'].append(fad)

            # KID
            src_gen_K = rbf_kernel(src_embeddings, gen_embeddings)
            m, n = src_embeddings.shape[0], gen_embeddings.shape[0]
            kid = (np.sum(src_K) / (m * m) + np.sum(gen_K) / (n * n) - 2 * np.sum(src_gen_K) / (m * n))
            result['kid'].append(kid)

            # MCD
            mcds = []
            for src, gen in zip(section_src_paths, section_gen_paths):
                mcds.append(mcd_toolbox.calculate_mcd(src, gen))
            result['mcd'].append(np.mean(mcds))

            # pesq
            pesqs = []
            for src, gen in zip(src_wavs, gen_wavs):
                min_len = min(len(src), len(gen))
                src = src[:min_len]
                gen = gen[:min_len]
                try:
                    pesqs.append(pesq(src, gen))
                except:
                    pesqs.append(0)
            result['pesq'].append(np.mean(pesqs))

    for model_name in model_names:
        result = results[model_name]
        result['sdr'] = np.mean(result['sdr'])
        result['fad'] = np.mean(result['fad'])
        result['kid'] = np.mean(result['kid'])
        result['mcd'] = np.mean(result['mcd'])
        result['pesq'] = np.mean(result['pesq'])
    
    return results

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [53]:
models = ['bigvgan', 'diffwave', 'hifigan', 'univnet', 'vocos', 'waveglow', 'griffinlim']
src_paths = [str(file) for file in Path(f'{DATASET}/data').rglob('*') if file.is_file()]
gen_paths = [[str(file) for file in Path(f'{DATASET}/noised_generated_preprocessed/{model}').rglob('*') if file.is_file()] for model in models]

In [54]:
src_paths = {noise: [path for i, path in enumerate(src_paths) if i in noise_indexes[noise]] for noise in noises}
gen_paths = {noise: [[path for i, path in enumerate(model_paths) if i in noise_indexes[noise]] for model_paths in gen_paths] for noise in noises}

# Crema-d

In [55]:
metrics = {}

In [56]:
noise = 'white'
metrics[noise] = calculate_metrics_for_all_data(src_paths[noise], gen_paths[noise], models, n_split=1)
pd.DataFrame(metrics[noise])

7it [02:44, 23.50s/it]00:00<?, ?it/s]
100%|██████████| 1/1 [02:49<00:00, 169.68s/it]


Unnamed: 0,bigvgan,diffwave,hifigan,univnet,vocos,waveglow,griffinlim
sdr,14.710278,15.700282,14.463532,15.78193,16.030545,11.014895,6.345046
fad,0.298864,0.382149,0.46952,0.270859,0.274996,0.32102,1.038096
kid,0.000161,0.000247,0.000512,7.4e-05,7.3e-05,0.000122,0.001012
mcd,3.546919,3.403494,2.973307,2.925608,2.865903,4.851431,9.633583
pesq,1.164755,1.863689,2.056666,2.103376,2.809994,1.51282,1.378965


In [57]:
noise = 'blue'
metrics[noise] = calculate_metrics_for_all_data(src_paths[noise], gen_paths[noise], models, n_split=1)
pd.DataFrame(metrics[noise])

7it [02:54, 24.97s/it]00:00<?, ?it/s]
100%|██████████| 1/1 [03:00<00:00, 180.10s/it]


Unnamed: 0,bigvgan,diffwave,hifigan,univnet,vocos,waveglow,griffinlim
sdr,14.779282,15.677527,14.530092,15.813178,16.116681,10.992123,6.319459
fad,0.390751,0.378301,0.493203,0.453488,0.264494,0.381147,1.305695
kid,0.000294,0.000173,0.000546,0.000344,0.000125,0.000123,0.001636
mcd,3.689097,3.413738,3.05341,3.027095,2.90168,4.991428,9.921579
pesq,1.157833,1.883429,2.118692,2.117866,2.855734,1.504514,1.39049


In [58]:
noise = 'violet'
metrics[noise] = calculate_metrics_for_all_data(src_paths[noise], gen_paths[noise], models, n_split=1)
pd.DataFrame(metrics[noise])

7it [03:13, 27.67s/it]00:00<?, ?it/s]
100%|██████████| 1/1 [03:20<00:00, 200.94s/it]


Unnamed: 0,bigvgan,diffwave,hifigan,univnet,vocos,waveglow,griffinlim
sdr,15.348896,16.197726,14.922936,16.411562,16.730936,11.777324,6.441457
fad,0.43105,0.551304,1.081836,0.731261,0.302915,0.464991,1.639657
kid,0.000218,0.000448,0.001647,0.000773,5.6e-05,0.000204,0.002629
mcd,3.94999,3.461143,3.154618,3.077798,2.99402,5.304953,10.919041
pesq,1.149559,1.976402,2.221533,2.248818,2.996748,1.532104,1.404407


In [59]:
noise = 'brownian'
metrics[noise] = calculate_metrics_for_all_data(src_paths[noise], gen_paths[noise], models, n_split=1)
pd.DataFrame(metrics[noise])

7it [02:33, 21.90s/it]00:00<?, ?it/s]
100%|██████████| 1/1 [02:38<00:00, 158.12s/it]


Unnamed: 0,bigvgan,diffwave,hifigan,univnet,vocos,waveglow,griffinlim
sdr,15.047985,16.111345,15.55711,16.927159,17.12738,11.545086,7.020418
fad,0.605003,0.305447,0.456892,0.684614,0.638682,0.395658,1.089501
kid,0.000484,9.3e-05,0.000444,0.000727,0.000729,0.000173,0.001121
mcd,3.745123,3.431261,3.281733,2.995698,2.901706,4.717051,10.632577
pesq,1.147314,1.940117,2.123114,2.247753,2.925152,1.527555,1.371748


In [60]:
noise = 'pink'
metrics[noise] = calculate_metrics_for_all_data(src_paths[noise], gen_paths[noise], models, n_split=1)
pd.DataFrame(metrics[noise])

7it [03:03, 26.28s/it]00:00<?, ?it/s]
100%|██████████| 1/1 [03:09<00:00, 189.66s/it]


Unnamed: 0,bigvgan,diffwave,hifigan,univnet,vocos,waveglow,griffinlim
sdr,15.237957,15.744464,15.176133,16.908812,17.189197,10.951904,6.732994
fad,0.439548,0.491,0.430547,0.303781,0.274234,0.393731,1.081984
kid,0.00035,0.000231,0.000308,9.7e-05,0.000163,0.000192,0.001367
mcd,3.621054,3.308848,3.015154,2.948006,2.8372,4.684987,9.492137
pesq,1.165782,1.839339,2.109602,2.179093,2.873146,1.504009,1.393654


In [61]:
noise = 'impulse'
metrics[noise] = calculate_metrics_for_all_data(src_paths[noise], gen_paths[noise], models, n_split=1)
pd.DataFrame(metrics[noise])

7it [03:09, 27.00s/it]00:00<?, ?it/s]
100%|██████████| 1/1 [03:15<00:00, 195.77s/it]


Unnamed: 0,bigvgan,diffwave,hifigan,univnet,vocos,waveglow,griffinlim
sdr,15.314609,15.688169,15.168771,17.003338,17.343099,11.06395,6.969339
fad,0.337413,0.515877,0.968683,0.400702,0.32522,0.451884,1.610599
kid,0.000122,0.000598,0.001675,0.00031,0.00018,0.00012,0.002378
mcd,3.783154,3.46547,3.161218,3.124068,3.000512,4.991116,9.650809
pesq,1.151384,1.790493,2.012572,2.068172,2.740159,1.491962,1.437746


In [62]:
noise = 'frequency'
metrics[noise] = calculate_metrics_for_all_data(src_paths[noise], gen_paths[noise], models, n_split=1)
pd.DataFrame(metrics[noise])

7it [03:20, 28.67s/it]00:00<?, ?it/s]
100%|██████████| 1/1 [03:26<00:00, 206.94s/it]


Unnamed: 0,bigvgan,diffwave,hifigan,univnet,vocos,waveglow,griffinlim
sdr,15.461406,16.166354,15.444771,17.395754,17.715236,11.677875,6.9435
fad,0.306898,0.47414,0.695099,0.342645,0.344235,0.406012,1.096858
kid,0.000111,0.000533,0.00108,0.000168,0.000264,0.000119,0.00116
mcd,3.878486,3.522528,3.118144,3.106722,3.012748,5.259798,11.003176
pesq,1.153153,1.920978,2.197336,2.290078,2.99289,1.521322,1.440297


In [63]:
noise = 'clipping'
metrics[noise] = calculate_metrics_for_all_data(src_paths[noise], gen_paths[noise], models, n_split=1)
pd.DataFrame(metrics[noise])

7it [03:59, 34.25s/it]00:00<?, ?it/s]
100%|██████████| 1/1 [04:07<00:00, 247.73s/it]


Unnamed: 0,bigvgan,diffwave,hifigan,univnet,vocos,waveglow,griffinlim
sdr,14.086351,16.232087,13.953372,15.369884,15.375968,12.132042,6.272971
fad,0.377875,0.450205,0.486409,0.341774,0.364073,0.303416,1.204356
kid,0.000348,0.000415,0.000622,0.000204,0.000352,0.000145,0.001842
mcd,4.885911,5.425338,5.010144,5.412266,4.732115,5.907625,6.143158
pesq,1.139771,1.557963,1.676961,1.754187,2.093227,1.436994,1.293112


In [64]:
from collections import defaultdict

weights = [len(indexes) for indexes in noise_indexes.values()]
weights = [x / np.sum(weights) for x in weights]

metrics_combined = defaultdict(lambda: defaultdict(float))

for metric_dict, weight in zip(metrics.values(), weights):
    for key, val in metric_dict.items():
        for val_key, val_val in val.items():
            metrics_combined[key][val_key] += val_val * weight

pd.DataFrame(metrics_combined)

Unnamed: 0,bigvgan,diffwave,hifigan,univnet,vocos,waveglow,griffinlim
sdr,14.966223,15.950344,14.858767,16.414945,16.658529,11.425615,6.616423
fad,0.392694,0.447958,0.636019,0.430909,0.343912,0.386891,1.261598
kid,0.000259,0.000355,0.000862,0.000322,0.000239,0.000149,0.001667
mcd,3.936043,3.760617,3.420662,3.426142,3.230657,5.133982,9.507244
pesq,1.15311,1.831491,2.046577,2.10815,2.75313,1.500368,1.385889


# ravdes

In [16]:
metrics = {}

In [19]:
noise = 'white'
metrics[noise] = calculate_metrics_for_all_data(src_paths[noise], gen_paths[noise], models, n_split=1)
pd.DataFrame(metrics[noise])

6it [06:47, 67.86s/it]00:00<?, ?it/s]
100%|██████████| 1/1 [06:58<00:00, 418.74s/it]


Unnamed: 0,bigvgan,diffwave,hifigan,univnet,vocos,waveglow
sdr,12.666799,12.764529,12.134394,13.423373,13.532763,14.9216
fad,0.229492,0.417543,0.415012,0.289746,0.227734,0.523781
kid,0.000197,0.000534,0.000506,0.000262,0.000176,0.000711
mcd,1.087203,1.359759,1.018927,0.862296,0.831931,1.654153
pesq,1.224608,1.105884,1.634995,1.620241,1.770851,1.416104


In [20]:
noise = 'blue'
metrics[noise] = calculate_metrics_for_all_data(src_paths[noise], gen_paths[noise], models, n_split=1)
pd.DataFrame(metrics[noise])

6it [07:55, 79.25s/it]00:00<?, ?it/s]
100%|██████████| 1/1 [08:10<00:00, 490.05s/it]


Unnamed: 0,bigvgan,diffwave,hifigan,univnet,vocos,waveglow
sdr,14.527699,12.97124,13.584446,15.493808,15.631273,14.676639
fad,0.175926,0.355196,0.286329,0.258766,0.168246,0.515507
kid,0.000112,0.000473,0.000326,0.000262,0.00011,0.000799
mcd,1.088675,1.335598,1.058553,0.940282,0.871237,1.635619
pesq,1.343012,1.123424,2.066953,2.018098,2.246245,1.398659


In [21]:
noise = 'violet'
metrics[noise] = calculate_metrics_for_all_data(src_paths[noise], gen_paths[noise], models, n_split=1)
pd.DataFrame(metrics[noise])

6it [07:10, 71.78s/it]00:00<?, ?it/s]
100%|██████████| 1/1 [07:24<00:00, 444.00s/it]


Unnamed: 0,bigvgan,diffwave,hifigan,univnet,vocos,waveglow
sdr,16.437715,13.08299,15.113727,17.940753,17.990759,14.777986
fad,0.151171,0.321684,0.2359,0.181847,0.12185,0.454863
kid,9e-05,0.000395,0.000238,0.000149,4.6e-05,0.00058
mcd,1.130001,1.360166,1.07167,0.960302,0.897904,1.718502
pesq,1.47613,1.123129,2.483549,2.423101,2.742554,1.391993


In [22]:
noise = 'brownian'
metrics[noise] = calculate_metrics_for_all_data(src_paths[noise], gen_paths[noise], models, n_split=1)
pd.DataFrame(metrics[noise])

6it [06:15, 62.57s/it]00:00<?, ?it/s]
100%|██████████| 1/1 [06:27<00:00, 387.52s/it]


Unnamed: 0,bigvgan,diffwave,hifigan,univnet,vocos,waveglow
sdr,15.702746,12.798841,13.724134,16.810352,16.93017,14.791112
fad,0.518765,0.363401,0.561601,0.463774,0.411559,0.407946
kid,0.000694,0.000442,0.000806,0.000629,0.000399,0.00044
mcd,1.497861,1.62255,1.759005,1.394798,1.231075,1.774109
pesq,1.218747,1.121741,1.766301,1.755063,1.955735,1.570882


In [23]:
noise = 'pink'
metrics[noise] = calculate_metrics_for_all_data(src_paths[noise], gen_paths[noise], models, n_split=1)
pd.DataFrame(metrics[noise])

6it [07:11, 71.92s/it]00:00<?, ?it/s]
100%|██████████| 1/1 [07:26<00:00, 446.00s/it]


Unnamed: 0,bigvgan,diffwave,hifigan,univnet,vocos,waveglow
sdr,13.064777,12.7775,12.773971,14.1628,14.234732,14.848816
fad,0.343379,0.422832,0.609483,0.28438,0.270146,0.366062
kid,0.000365,0.000506,0.000827,0.00025,0.000227,0.000396
mcd,1.083383,1.328782,1.00678,0.8994,0.837484,1.570127
pesq,1.219697,1.113488,1.686013,1.679113,1.853789,1.493199


In [24]:
noise = 'impulse'
metrics[noise] = calculate_metrics_for_all_data(src_paths[noise], gen_paths[noise], models, n_split=1)
pd.DataFrame(metrics[noise])

6it [07:33, 75.57s/it]00:00<?, ?it/s]
100%|██████████| 1/1 [07:47<00:00, 467.91s/it]


Unnamed: 0,bigvgan,diffwave,hifigan,univnet,vocos,waveglow
sdr,17.392569,12.493594,16.152797,19.729815,20.362212,14.794832
fad,0.135557,0.390498,0.177979,0.134508,0.11071,0.362281
kid,5e-05,0.000553,0.000134,4.7e-05,4e-05,0.000452
mcd,1.083844,1.474574,1.03529,0.925455,0.817709,1.764559
pesq,1.424931,1.114139,2.411099,2.364512,2.849087,1.432012


In [25]:
noise = 'frequency'
metrics[noise] = calculate_metrics_for_all_data(src_paths[noise], gen_paths[noise], models, n_split=1)
pd.DataFrame(metrics[noise])

6it [07:28, 74.77s/it]00:00<?, ?it/s]
100%|██████████| 1/1 [07:44<00:00, 464.18s/it]


Unnamed: 0,bigvgan,diffwave,hifigan,univnet,vocos,waveglow
sdr,16.950001,12.382161,16.2897,19.826035,20.370008,14.941112
fad,0.22494,0.647163,0.412036,0.35203,0.17334,0.310488
kid,0.000198,0.000856,0.000515,0.000462,0.00011,0.000327
mcd,1.152487,1.451093,1.062587,0.980384,0.853966,1.667443
pesq,1.438661,1.109134,2.669058,2.500183,2.973989,1.425568


In [26]:
noise = 'clipping'
metrics[noise] = calculate_metrics_for_all_data(src_paths[noise], gen_paths[noise], models, n_split=1)
pd.DataFrame(metrics[noise])

6it [07:28, 74.73s/it]00:00<?, ?it/s]
100%|██████████| 1/1 [07:42<00:00, 462.60s/it]


Unnamed: 0,bigvgan,diffwave,hifigan,univnet,vocos,waveglow
sdr,17.088828,11.210489,15.831983,18.960994,19.305565,14.866349
fad,0.177343,0.449454,0.217992,0.156165,0.123663,0.349664
kid,0.000153,0.000617,0.000224,0.000111,4.5e-05,0.000451
mcd,1.956818,2.26103,2.00323,2.029122,1.915847,2.245823
pesq,1.40838,1.092401,2.559392,2.307099,2.596597,1.303842


In [70]:
metrics2 = {}

In [71]:
noise = 'white'
metrics2[noise] = calculate_metrics_for_all_data(src_paths[noise], gen_paths[noise], models, n_split=1)
pd.DataFrame(metrics2[noise])

1it [01:24, 84.35s/it]00:00<?, ?it/s]
100%|██████████| 1/1 [01:39<00:00, 99.04s/it]


Unnamed: 0,griffinlim
sdr,5.372539
fad,0.650177
kid,0.001001
mcd,4.559053
pesq,1.075698


In [72]:
noise = 'blue'
metrics2[noise] = calculate_metrics_for_all_data(src_paths[noise], gen_paths[noise], models, n_split=1)
pd.DataFrame(metrics2[noise])

1it [01:40, 100.62s/it]0:00<?, ?it/s]
100%|██████████| 1/1 [01:57<00:00, 117.61s/it]


Unnamed: 0,griffinlim
sdr,5.890288
fad,0.469884
kid,0.000733
mcd,4.009059
pesq,1.106088


In [73]:
noise = 'violet'
metrics2[noise] = calculate_metrics_for_all_data(src_paths[noise], gen_paths[noise], models, n_split=1)
pd.DataFrame(metrics2[noise])

1it [01:14, 74.17s/it]00:00<?, ?it/s]
100%|██████████| 1/1 [01:28<00:00, 88.13s/it]


Unnamed: 0,griffinlim
sdr,6.418331
fad,0.364467
kid,0.000465
mcd,3.91552
pesq,1.155186


In [74]:
noise = 'brownian'
metrics2[noise] = calculate_metrics_for_all_data(src_paths[noise], gen_paths[noise], models, n_split=1)
pd.DataFrame(metrics2[noise])

1it [01:03, 63.67s/it]00:00<?, ?it/s]
100%|██████████| 1/1 [01:16<00:00, 76.78s/it]


Unnamed: 0,griffinlim
sdr,6.414024
fad,1.889185
kid,0.002992
mcd,7.494676
pesq,1.066047


In [75]:
noise = 'pink'
metrics2[noise] = calculate_metrics_for_all_data(src_paths[noise], gen_paths[noise], models, n_split=1)
pd.DataFrame(metrics2[noise])

1it [01:12, 72.06s/it]00:00<?, ?it/s]
100%|██████████| 1/1 [01:27<00:00, 87.56s/it]


Unnamed: 0,griffinlim
sdr,5.591129
fad,0.852751
kid,0.001186
mcd,5.40517
pesq,1.065832


In [76]:
noise = 'impulse'
metrics2[noise] = calculate_metrics_for_all_data(src_paths[noise], gen_paths[noise], models, n_split=1)
pd.DataFrame(metrics2[noise])

1it [01:17, 77.62s/it]00:00<?, ?it/s]
100%|██████████| 1/1 [01:31<00:00, 91.92s/it]


Unnamed: 0,griffinlim
sdr,7.70487
fad,0.276627
kid,0.000295
mcd,3.706451
pesq,1.281522


In [77]:
noise = 'frequency'
metrics2[noise] = calculate_metrics_for_all_data(src_paths[noise], gen_paths[noise], models, n_split=1)
pd.DataFrame(metrics2[noise])

1it [01:17, 77.85s/it]00:00<?, ?it/s]
100%|██████████| 1/1 [01:33<00:00, 93.33s/it]


Unnamed: 0,griffinlim
sdr,7.703996
fad,0.398066
kid,0.000463
mcd,4.048707
pesq,1.186681


In [78]:
noise = 'clipping'
metrics2[noise] = calculate_metrics_for_all_data(src_paths[noise], gen_paths[noise], models, n_split=1)
pd.DataFrame(metrics2[noise])

1it [01:15, 75.58s/it]00:00<?, ?it/s]
100%|██████████| 1/1 [01:29<00:00, 89.25s/it]


Unnamed: 0,griffinlim
sdr,7.751448
fad,0.377069
kid,0.000579
mcd,2.517888
pesq,1.245013


In [54]:
from collections import defaultdict

weights = [len(indexes) for indexes in noise_indexes.values()]
weights = [x / np.sum(weights) for x in weights]

metrics_combined = defaultdict(lambda: defaultdict(float))

for metric_dict, weight in zip(metrics.values(), weights):
    for key, val in metric_dict.items():
        for val_key, val_val in val.items():
            metrics_combined[key][val_key] += val_val * weight

pd.DataFrame(metrics_combined)

Unnamed: 0,bigvgan,diffwave,hifigan,univnet,vocos,waveglow
sdr,15.513368,12.549481,14.493248,17.094889,17.352395,14.825854
fad,0.239111,0.421804,0.358607,0.261009,0.196443,0.410645
kid,0.000223,0.000549,0.000437,0.000265,0.000139,0.000521
mcd,1.259295,1.526445,1.247841,1.124447,1.032927,1.756023
pesq,1.347837,1.112815,2.172971,2.094533,2.388186,1.426174


In [79]:
from collections import defaultdict

weights = [len(indexes) for indexes in noise_indexes.values()]
weights = [x / np.sum(weights) for x in weights]

metrics_combined = defaultdict(lambda: defaultdict(float))

for metric_dict, weight in zip(metrics2.values(), weights):
    for key, val in metric_dict.items():
        for val_key, val_val in val.items():
            metrics_combined[key][val_key] += val_val * weight

pd.DataFrame(metrics_combined)

Unnamed: 0,griffinlim
sdr,6.62869
fad,0.636558
kid,0.000927
mcd,4.392737
pesq,1.150655


# old

In [80]:
noise = 'white'
metrics[noise] = calculate_metrics_for_all_data(src_paths[noise], gen_paths[noise], models, n_split=1)
pd.DataFrame(metrics[noise])

  0%|          | 0/1 [00:03<?, ?it/s]


KeyboardInterrupt: 

In [48]:
noise = 'blue'
metrics[noise] = calculate_metrics_for_all_data(src_paths[noise], gen_paths[noise], models, n_split=1)
pd.DataFrame(metrics[noise])

6it [10:29, 104.91s/it]0:00<?, ?it/s]
100%|██████████| 1/1 [11:03<00:00, 663.08s/it]


Unnamed: 0,bigvgan,diffwave,hifigan,univnet,vocos,waveglow
sdr,14.456181,15.503722,13.4996,15.409393,12.348787,14.619673
fad,0.537625,1.819387,1.513435,0.574283,0.482688,1.967888
kid,0.000651,0.00346,0.00303,0.000757,0.000403,0.003539
mcd,1.13251,1.155999,1.027429,0.886433,1.186398,1.841959
pesq,1.350388,1.790651,2.177208,2.166097,1.143143,1.637768


In [49]:
noise = 'violet'
metrics[noise] = calculate_metrics_for_all_data(src_paths[noise], gen_paths[noise], models, n_split=1)
pd.DataFrame(metrics[noise])

6it [09:32, 95.49s/it]00:00<?, ?it/s]
100%|██████████| 1/1 [10:02<00:00, 602.44s/it]


Unnamed: 0,bigvgan,diffwave,hifigan,univnet,vocos,waveglow
sdr,16.376031,15.616842,15.045745,17.843352,13.414205,14.729125
fad,0.45801,1.553583,1.022472,0.514811,0.459869,1.816452
kid,0.000448,0.002749,0.001859,0.000555,0.0003,0.003132
mcd,1.176862,1.158742,1.034903,0.903476,1.224144,1.918358
pesq,1.411807,1.868,2.467795,2.431051,1.169786,1.67689


In [52]:
noise = 'brownian'
metrics[noise] = calculate_metrics_for_all_data(src_paths[noise], gen_paths[noise], models, n_split=1)
pd.DataFrame(metrics[noise])

6it [08:07, 81.27s/it]00:00<?, ?it/s]
100%|██████████| 1/1 [08:34<00:00, 514.04s/it]


Unnamed: 0,bigvgan,diffwave,hifigan,univnet,vocos,waveglow
sdr,15.670327,15.148358,13.683388,16.760787,13.685299,14.740176
fad,0.761005,1.818211,1.423664,1.001515,0.582892,1.279015
kid,0.00098,0.003415,0.002458,0.001376,0.000431,0.001955
mcd,1.628801,1.209577,1.381834,1.303839,1.476189,1.913904
pesq,1.288763,1.863187,1.988722,2.024979,1.161602,1.745651


In [53]:
noise = 'pink'
metrics[noise] = calculate_metrics_for_all_data(src_paths[noise], gen_paths[noise], models, n_split=1)
pd.DataFrame(metrics[noise])

6it [10:03, 100.61s/it]0:00<?, ?it/s]
100%|██████████| 1/1 [10:37<00:00, 637.31s/it]


Unnamed: 0,bigvgan,diffwave,hifigan,univnet,vocos,waveglow
sdr,13.031595,14.952741,12.726209,14.126584,11.999052,14.791411
fad,0.687526,2.10842,1.690639,0.655152,0.63207,1.265121
kid,0.000829,0.004023,0.003284,0.00087,0.000594,0.001896
mcd,1.122418,1.14486,0.972299,0.851637,1.164861,1.742019
pesq,1.31241,1.816816,2.068492,2.095142,1.150338,1.733092


In [54]:
noise = 'impulse'
metrics[noise] = calculate_metrics_for_all_data(src_paths[noise], gen_paths[noise], models, n_split=1)
pd.DataFrame(metrics[noise])

6it [10:08, 101.42s/it]0:00<?, ?it/s]
100%|██████████| 1/1 [10:43<00:00, 643.74s/it]


Unnamed: 0,bigvgan,diffwave,hifigan,univnet,vocos,waveglow
sdr,17.360322,14.734819,16.09251,19.676852,14.383334,14.740222
fad,0.378871,1.361215,0.522413,0.372376,0.289804,1.458787
kid,0.000383,0.002411,0.000684,0.000439,0.00017,0.002429
mcd,1.131264,1.267158,1.001441,0.866644,1.152792,1.970012
pesq,1.375501,1.846398,2.568631,2.516571,1.153271,1.654429


In [56]:
noise = 'frequency'
metrics[noise] = calculate_metrics_for_all_data(src_paths[noise], gen_paths[noise], models, n_split=1)
pd.DataFrame(metrics[noise])

6it [10:03, 100.61s/it]0:00<?, ?it/s]
100%|██████████| 1/1 [10:38<00:00, 638.09s/it]


Unnamed: 0,bigvgan,diffwave,hifigan,univnet,vocos,waveglow
sdr,16.912408,14.674378,16.22338,19.76108,14.363568,14.891606
fad,0.453427,1.871612,0.840862,0.342586,0.499885,1.655477
kid,0.000347,0.00346,0.001207,0.000236,0.000316,0.002903
mcd,1.180524,1.234167,1.035792,0.911474,1.183596,1.865474
pesq,1.387142,1.839585,2.626029,2.615797,1.166429,1.645079


In [55]:
noise = 'clipping'
metrics[noise] = calculate_metrics_for_all_data(src_paths[noise], gen_paths[noise], models, n_split=1)
pd.DataFrame(metrics[noise])

6it [09:47, 97.97s/it]00:00<?, ?it/s]
100%|██████████| 1/1 [10:18<00:00, 618.82s/it]


Unnamed: 0,bigvgan,diffwave,hifigan,univnet,vocos,waveglow
sdr,17.019349,12.57884,15.758585,18.894605,14.022302,14.813301
fad,0.574112,2.101117,0.598597,0.426622,0.364716,2.622468
kid,0.000704,0.003957,0.00064,0.000393,0.000237,0.004742
mcd,1.288319,1.583418,1.300928,1.252822,1.346874,1.891013
pesq,1.302638,1.488502,1.782564,1.778797,1.143865,1.583204


In [64]:
no_noise_metrics = {
    "bigvgan": {
        "sdr": 18.742272,
        "fad": 0.346752,
        "kid": 0.000165,
        "mcd": 1.088343,
        "pesq": 1.483380,
    },
    "diffwave": {
        "sdr": 14.422735,
        "fad": 1.728151,
        "kid": 0.003033,
        "mcd": 1.202381,
        "pesq": 1.841032,
    },
    "hifigan": {
        "sdr": 17.105282,
        "fad": 0.377490,
        "kid": 0.000230,
        "mcd": 0.951637,
        "pesq": 2.991883,
    },
    "univnet": {
        "sdr": 22.180890,
        "fad": 0.273113,
        "kid": 0.000093,
        "mcd": 0.834505,
        "pesq": 2.977830,
    },
    "vocos": {
        "sdr": 14.784264,
        "fad": 0.304175,
        "kid": 0.000112,
        "mcd": 1.099907,
        "pesq": 1.184869,
    },
    "waveglow": {
        "sdr": 14.769862,
        "fad": 1.863754,
        "kid": 0.003188,
        "mcd": 1.867460,
        "pesq": 1.664157,
    },
}


In [65]:
def get_metrics_decrease(dict1, dict2):
    result = {}

    # Iteracja przez klucze główne (modele)
    for key in dict1:
        result[key] = {}
        # Iteracja przez metryki
        for sub_key in dict1[key]:
            # Iloraz odpowiednich wartości
            if dict2[key][sub_key] != 0:  # Zapobieganie dzieleniu przez 0
                result[key][sub_key] = dict1[key][sub_key] / dict2[key][sub_key]
            else:
                result[key][sub_key] = None  # Wartość None dla dzielenia przez 0

    return result


In [66]:
pd.DataFrame(get_metrics_decrease(metrics['white'], no_noise_metrics))

Unnamed: 0,bigvgan,diffwave,hifigan,univnet,vocos,waveglow
sdr,0.672457,1.029282,0.704872,0.601974,0.778115,1.006754
fad,2.241614,1.185899,4.468542,2.675995,2.499696,0.779367
kid,5.939394,1.297725,13.878261,10.354839,7.857143,0.733689
mcd,1.02619,0.957854,1.022462,0.973238,1.08216,0.976328
pesq,0.889152,0.974742,0.651838,0.656045,0.962523,1.044989


In [67]:
pd.DataFrame(get_metrics_decrease(metrics['blue'], no_noise_metrics))

Unnamed: 0,bigvgan,diffwave,hifigan,univnet,vocos,waveglow
sdr,0.771314,1.07495,0.789207,0.694715,0.835266,0.989831
fad,1.55046,1.052794,4.009206,2.10273,1.586876,1.055873
kid,3.945455,1.140785,13.173913,8.139785,3.598214,1.1101
mcd,1.040582,0.961425,1.079644,1.062226,1.078635,0.986345
pesq,0.910345,0.972634,0.727705,0.727408,0.964784,0.984143


In [68]:
pd.DataFrame(get_metrics_decrease(metrics['violet'], no_noise_metrics))

Unnamed: 0,bigvgan,diffwave,hifigan,univnet,vocos,waveglow
sdr,0.873748,1.082793,0.879596,0.804447,0.90733,0.997242
fad,1.320858,0.898986,2.708607,1.884974,1.511857,0.97462
kid,2.715152,0.906363,8.082609,5.967742,2.678571,0.982434
mcd,1.081334,0.963706,1.087498,1.082649,1.112952,1.027255
pesq,0.95175,1.014648,0.82483,0.816383,0.98727,1.007651


In [69]:
pd.DataFrame(get_metrics_decrease(metrics['brownian'], no_noise_metrics))

Unnamed: 0,bigvgan,diffwave,hifigan,univnet,vocos,waveglow
sdr,0.836095,1.050311,0.799951,0.755641,0.925667,0.99799
fad,2.194666,1.052114,3.771396,3.667036,1.916304,0.686257
kid,5.941218,1.126021,10.684801,14.801025,3.849438,0.613367
mcd,1.496588,1.005985,1.45206,1.56241,1.342103,1.02487
pesq,0.868802,1.012034,0.664706,0.680018,0.980363,1.04897


In [70]:
pd.DataFrame(get_metrics_decrease(metrics['pink'], no_noise_metrics))

Unnamed: 0,bigvgan,diffwave,hifigan,univnet,vocos,waveglow
sdr,0.695305,1.036748,0.743993,0.636881,0.81161,1.001459
fad,1.982761,1.220044,4.478632,2.398831,2.077982,0.678803
kid,5.025658,1.326309,14.277261,9.351312,5.305778,0.594589
mcd,1.031309,0.952161,1.021712,1.020529,1.059054,0.932828
pesq,0.884743,0.986846,0.691368,0.70358,0.970857,1.041424


In [73]:
pd.DataFrame(get_metrics_decrease(metrics['impulse'], no_noise_metrics))

Unnamed: 0,bigvgan,diffwave,hifigan,univnet,vocos,waveglow
sdr,0.926266,1.021638,0.940792,0.887108,0.972881,0.997993
fad,1.092629,0.787671,1.383912,1.363451,0.952754,0.782714
kid,2.319551,0.794833,2.973793,4.725012,1.514889,0.761789
mcd,1.039437,1.053874,1.052335,1.038512,1.048082,1.054915
pesq,0.927275,1.002915,0.858533,0.845102,0.973332,0.994154


In [72]:
pd.DataFrame(get_metrics_decrease(metrics['frequency'], no_noise_metrics))

Unnamed: 0,bigvgan,diffwave,hifigan,univnet,vocos,waveglow
sdr,0.902367,1.017448,0.948443,0.890906,0.971544,1.008243
fad,1.307641,1.083014,2.227507,1.254373,1.643412,0.888249
kid,2.105225,1.140892,5.246389,2.539802,2.825463,0.910494
mcd,1.084699,1.026436,1.088432,1.092233,1.076088,0.998936
pesq,0.935123,0.999214,0.877718,0.878424,0.984437,0.988536


In [71]:
pd.DataFrame(get_metrics_decrease(metrics['clipping'], no_noise_metrics))

Unnamed: 0,bigvgan,diffwave,hifigan,univnet,vocos,waveglow
sdr,0.908073,0.872154,0.92127,0.851842,0.948461,1.002941
fad,1.655684,1.215818,1.585728,1.562071,1.199034,1.407089
kid,4.266776,1.304642,2.784165,4.225938,2.115685,1.487386
mcd,1.183743,1.316902,1.367042,1.501276,1.224535,1.012612
pesq,0.878155,0.808515,0.5958,0.597347,0.965394,0.951355
