In [1]:
import torch
import torchaudio
import bigvgan
import librosa
import soundfile as sf
from vocos import Vocos
import speechbrain as sb
from speechbrain.utils.fetching import fetch
from speechbrain.utils.data_utils import split_path
from speechbrain.lobes.models.FastSpeech2 import mel_spectogram
from speechbrain.inference.vocoders import HIFIGAN
from speechbrain.inference.vocoders import DiffWaveVocoder
from transformers import UnivNetFeatureExtractor, UnivNetModel
import librosa
import librosa.display
import numpy as np
from pathlib import Path
from scipy.io.wavfile import write
import IPython.display as ipd
from tqdm import tqdm
import os
import time
from torch import nn

  if ismodule(module) and hasattr(module, '__file__'):


In [2]:
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Przetworzenie plików za pomocą modeli

In [11]:
def get_mel_from_file(file_path, n_mels, sr=22050):
    signal, rate = librosa.load(file_path, sr=sr, mono=True)
    signal = torch.FloatTensor(signal)
    spectrogram, _ = mel_spectogram(
        audio=signal,
        sample_rate=22050,
        hop_length=256,
        win_length=1024,
        n_mels=n_mels,
        n_fft=1024,
        f_min=0.0,
        f_max=8000.0,
        power=1,
        normalized=False,
        min_max_energy_norm=True,
        norm="slaney",
        mel_scale="slaney",
        compression=True
    )

    return spectrogram

In [14]:
files_to_process = [str(file) for file in Path('data').rglob('*') if file.is_file()]

In [15]:
len(files_to_process)

1439

### HiFi-GAN

In [17]:
hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-libritts-22050Hz", savedir="pretrained_models/tts-hifigan-libritts-22050Hz")

times = []
for file_path in tqdm(files_to_process):
    spectrogram = get_mel_from_file(file_path, 80)
    
    start = time.time()
    waveforms = hifi_gan.decode_batch(spectrogram)
    end = time.time()
    times.append(end-start)

    if not os.path.exists('generated/hifigan/' + file_path[5:13]):
        os.makedirs('generated/hifigan/' + file_path[5:13])
    torchaudio.save('generated\\hifigan\\' + file_path[5:], waveforms.squeeze(1), 22050)
print('Średni czas przetwarzania:', np.mean(times))

  WeightNorm.apply(module, name, dim)
  state_dict = torch.load(path, map_location=device)
100%|██████████| 1439/1439 [19:17<00:00,  1.24it/s]

Średni czas przetwarzania: 0.7942535048810204





### Vocos

In [66]:
class FeatureExtractor(nn.Module):
    """Base class for feature extractors."""

    def forward(self, audio: torch.Tensor, **kwargs) -> torch.Tensor:
        """
        Extract features from the given audio.

        Args:
            audio (Tensor): Input audio waveform.

        Returns:
            Tensor: Extracted features of shape (B, C, L), where B is the batch size,
                    C denotes output features, and L is the sequence length.
        """
        raise NotImplementedError("Subclasses must implement the forward method.")


class MelSpectrogramFeatures(FeatureExtractor):
    def __init__(self, sample_rate=24000, n_fft=1024, hop_length=256, n_mels=100, padding="center"):
        super().__init__()
        if padding not in ["center", "same"]:
            raise ValueError("Padding must be 'center' or 'same'.")
        self.padding = padding
        self.mel_spec = torchaudio.transforms.MelSpectrogram(
            sample_rate=sample_rate,
            n_fft=n_fft,
            hop_length=hop_length,
            n_mels=n_mels,
            center=padding == "center",
            power=1,
        )

    def forward(self, audio, **kwargs):
        if self.padding == "same":
            pad = self.mel_spec.win_length - self.mel_spec.hop_length
            audio = torch.nn.functional.pad(audio, (pad // 2, pad // 2), mode="reflect")
        mel = self.mel_spec(audio)
        features = safe_log(mel)
        return features

def safe_log(x: torch.Tensor, clip_val: float = 1e-7) -> torch.Tensor:
    """
    Computes the element-wise logarithm of the input tensor with clipping to avoid near-zero values.

    Args:
        x (Tensor): Input tensor.
        clip_val (float, optional): Minimum value to clip the input tensor. Defaults to 1e-7.

    Returns:
        Tensor: Element-wise logarithm of the input tensor with clipping applied.
    """
    return torch.log(torch.clip(x, min=clip_val))

In [69]:
vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz").to(DEVICE)
feature_extractor = MelSpectrogramFeatures(
    sample_rate=24000,
    n_fft=1024,
    hop_length=256,
    n_mels=100,
    padding="center"
)

times = []
for file_path in tqdm(files_to_process):
    resample_transform = torchaudio.transforms.Resample(orig_freq=48000, new_freq=24000)
    waveform, sr = torchaudio.load(file_path)
    if sr != 24000:
        waveform = resample_transform(waveform)
    spectrogram = feature_extractor(waveform).to(DEVICE)

    start = time.time()
    
    waveforms = vocos.decode(spectrogram)

    end = time.time()
    times.append(end-start)
    
    if not os.path.exists('generated/vocos/' + file_path[5:13]):
        os.makedirs('generated/vocos/' + file_path[5:13])
    torchaudio.save('generated\\vocos\\' + file_path[5:], waveforms.cpu().squeeze(1), 22050)

print('Średni czas przetwarzania:', np.mean(times))

  state_dict = torch.load(model_path, map_location="cpu")
100%|██████████| 1439/1439 [01:51<00:00, 12.86it/s]

Średni czas przetwarzania: 0.07181362976540784





### BigV-GAN

In [17]:
model = bigvgan.BigVGAN.from_pretrained('nvidia/bigvgan_v2_24khz_100band_256x', use_cuda_kernel=False)
model.remove_weight_norm()
model = model.eval().to(DEVICE)

times = []
for file_path in tqdm(files_to_process):
    spectrogram = get_mel_from_file(file_path, 100, sr=model.h.sampling_rate)

    start = time.time()
    
    with torch.inference_mode():
        waveforms = model(spectrogram.unsqueeze(0).to(DEVICE)).squeeze(0).cpu()

    end = time.time()
    times.append(end-start)
    
    if not os.path.exists('generated/bigvgan/' + file_path[5:13]):
        os.makedirs('generated/bigvgan/' + file_path[5:13])
    torchaudio.save('generated\\bigvgan\\' + file_path[5:], waveforms.squeeze(1), model.h.sampling_rate)

print('Średni czas przetwarzania:', np.mean(times))

Loading weights from nvidia/bigvgan_v2_24khz_100band_256x
Removing weight norm...


100%|██████████| 1439/1439 [06:19<00:00,  3.80it/s]

Średni czas przetwarzania: 0.2562854376163973





### DiffWave

In [61]:
device = DEVICE

In [64]:
diffwave = DiffWaveVocoder.from_hparams(source="speechbrain/tts-diffwave-ljspeech", savedir="pretrained_models/tts-diffwave-ljspeech")

times = []
for file_path in tqdm(files_to_process):
    spectrogram = get_mel_from_file(file_path, 80)
    
    start = time.time()
    
    waveforms = diffwave.decode_batch(
        spectrogram,
        hop_len=256,
        fast_sampling=True,
        fast_sampling_noise_schedule=[0.0001, 0.001, 0.01, 0.05, 0.2, 0.5],
    )

    end = time.time()
    times.append(end-start)
    
    if not os.path.exists('generated/diffwave/' + file_path[5:13]):
        os.makedirs('generated/diffwave/' + file_path[5:13])
    torchaudio.save('generated\\diffwave\\' + file_path[5:], waveforms.squeeze(1), 22050)

print('Średni czas przetwarzania:', np.mean(times))

100%|██████████| 1439/1439 [3:12:38<00:00,  8.03s/it]  

Średni czas przetwarzania: 8.018279543015757





### UnivNet

In [44]:
model_id_or_path = "dg845/univnet-dev"
model = UnivNetModel.from_pretrained(model_id_or_path)
feature_extractor = UnivNetFeatureExtractor.from_pretrained(model_id_or_path)

times = []
for file_path in tqdm(files_to_process):
    input_audio, sampling_rate = librosa.load(file_path, sr=feature_extractor.sampling_rate)
    inputs = feature_extractor(
        input_audio, 
        sampling_rate=sampling_rate, 
        pad_end=True, 
        return_tensors="pt"
    )

    start = time.time()
    
    with torch.no_grad():
        waveforms = model(**inputs)
    waveforms = torch.Tensor(feature_extractor.batch_decode(**waveforms))
    
    end = time.time()
    times.append(end-start)
    
    if not os.path.exists('generated/univnet/' + file_path[5:13]):
        os.makedirs('generated/univnet/' + file_path[5:13])
    torchaudio.save('generated\\univnet\\' + file_path[5:], waveforms.squeeze(1), feature_extractor.sampling_rate)

print('Średni czas przetwarzania:', np.mean(times))

100%|██████████| 1439/1439 [06:52<00:00,  3.49it/s]

Średni czas przetwarzania: 0.25812796456189185





### WaveGlow

In [58]:
waveglow = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_waveglow', model_math='fp32')
waveglow = waveglow.to(DEVICE)

times = []
for file_path in tqdm(files_to_process):
    y, sr = librosa.load(file_path, sr=22050, mono=True)
    spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=1024, hop_length=256, win_length=1024, n_mels=80, fmin=0, fmax=8000)
    log_mel_spectrogram = np.log(spectrogram + 1e-6)
    mel_input = torch.tensor(log_mel_spectrogram).unsqueeze(0).float().to(DEVICE)

    start = time.time()
    
    with torch.no_grad():
        waveforms = waveglow.infer(mel_input)

    end = time.time()
    times.append(end-start)
    
    if not os.path.exists('generated/waveglow/' + file_path[5:13]):
        os.makedirs('generated/waveglow/' + file_path[5:13])
    torchaudio.save('generated\\waveglow\\' + file_path[5:], waveforms.squeeze(1).cpu(), 22050)

print('Średni czas przetwarzania:', np.mean(times))

Using cache found in C:\Users\wikto/.cache\torch\hub\NVIDIA_DeepLearningExamples_torchhub
100%|██████████| 1439/1439 [3:26:19<00:00,  8.60s/it]  

Średni czas przetwarzania: 7.269831891354793





# Liczenie metryk

In [3]:
from scripts.metrics import calculate_metrics_for_all_data

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
models = ['bigvgan', 'diffwave', 'hifigan', 'univnet', 'vocos', 'waveglow']
src_paths = [str(file) for file in Path('data').rglob('*') if file.is_file()]
gen_paths = [[str(file) for file in Path(f'generated/{model}').rglob('*') if file.is_file()] for model in models]

In [5]:
metrics = calculate_metrics_for_all_data(src_paths, gen_paths, models)

6it [07:30, 75.02s/it][00:00<?, ?it/s]
6it [07:16, 72.67s/it][07:55<1:11:18, 475.42s/it]
6it [07:35, 75.83s/it][15:34<1:02:05, 465.68s/it]
6it [07:20, 73.42s/it][23:34<55:05, 472.26s/it]  
6it [07:38, 76.34s/it][31:18<46:54, 469.03s/it]
6it [07:26, 74.40s/it][39:22<39:31, 474.39s/it]
6it [07:23, 73.90s/it][47:12<31:32, 473.06s/it]
6it [07:47, 77.85s/it][55:00<23:34, 471.36s/it]
6it [07:44, 77.37s/it][1:03:13<15:56, 478.06s/it]
6it [07:32, 75.48s/it][1:11:23<08:01, 481.78s/it]
100%|██████████| 10/10 [1:19:20<00:00, 476.06s/it]


In [7]:
import pandas as pd
pd.DataFrame(metrics)

Unnamed: 0,bigvgan,diffwave,hifigan,univnet,vocos,waveglow
sdr,18.742272,14.422735,17.105282,22.18089,14.784264,14.769862
fad,0.346752,1.728151,0.37749,0.273113,0.304175,1.863754
kid,0.000165,0.003033,0.00023,9.3e-05,0.000112,0.003188
mcd,1.088343,1.202381,0.951637,0.834505,1.099907,1.86746
pesq,1.48338,1.841032,2.991883,2.97783,1.184869,1.664157
