In [2]:
import torch
import torchaudio
import bigvgan
import librosa
import soundfile as sf
from vocos import Vocos
import speechbrain as sb
from speechbrain.utils.fetching import fetch
from speechbrain.utils.data_utils import split_path
from speechbrain.lobes.models.FastSpeech2 import mel_spectogram
from speechbrain.inference.vocoders import HIFIGAN
from speechbrain.inference.vocoders import DiffWaveVocoder
from transformers import UnivNetFeatureExtractor, UnivNetModel
import librosa
import librosa.display
import numpy as np
from pathlib import Path
from scipy.io.wavfile import write
import IPython.display as ipd
from tqdm import tqdm
import os
import time
from torch import nn
import pandas as pd

  if ismodule(module) and hasattr(module, '__file__'):


In [3]:
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Przetworzenie plików za pomocą modeli

In [10]:
def get_mel_from_file(file_path, n_mels, sr=22050, compression=True):
    signal, rate = librosa.load(file_path, sr=sr, mono=True)
    signal = torch.FloatTensor(signal)
    spectrogram, _ = mel_spectogram(
        audio=signal,
        sample_rate=rate,
        hop_length=256,
        win_length=1024,
        n_mels=n_mels,
        n_fft=1024,
        f_min=0.0,
        f_max=8000.0,
        power=1,
        normalized=False,
        min_max_energy_norm=True,
        norm="slaney",
        mel_scale="slaney",
        compression=compression
    )

    return spectrogram

In [6]:
files_to_process = [str(file) for file in Path('data').rglob('*') if file.is_file()]

In [15]:
len(files_to_process)

1439

### HiFi-GAN

In [24]:
hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-libritts-22050Hz", savedir="pretrained_models/tts-hifigan-libritts-22050Hz")

times = []
for file_path in tqdm(files_to_process):
    spectrogram = get_mel_from_file(file_path, 80)
    
    start = time.time()
    waveforms = hifi_gan.decode_batch(spectrogram)
    end = time.time()
    times.append(end-start)

    if not os.path.exists('generated/hifigan/' + file_path[5:13]):
        os.makedirs('generated/hifigan/' + file_path[5:13])
    torchaudio.save('generated\\hifigan\\' + file_path[5:], waveforms.squeeze(1), 22050)
print('Średni czas przetwarzania:', np.mean(times))

100%|██████████| 1439/1439 [18:48<00:00,  1.28it/s]

Średni czas przetwarzania: 0.7668960763813307





### Vocos

In [66]:
class FeatureExtractor(nn.Module):
    """Base class for feature extractors."""

    def forward(self, audio: torch.Tensor, **kwargs) -> torch.Tensor:
        """
        Extract features from the given audio.

        Args:
            audio (Tensor): Input audio waveform.

        Returns:
            Tensor: Extracted features of shape (B, C, L), where B is the batch size,
                    C denotes output features, and L is the sequence length.
        """
        raise NotImplementedError("Subclasses must implement the forward method.")


class MelSpectrogramFeatures(FeatureExtractor):
    def __init__(self, sample_rate=24000, n_fft=1024, hop_length=256, n_mels=100, padding="center"):
        super().__init__()
        if padding not in ["center", "same"]:
            raise ValueError("Padding must be 'center' or 'same'.")
        self.padding = padding
        self.mel_spec = torchaudio.transforms.MelSpectrogram(
            sample_rate=sample_rate,
            n_fft=n_fft,
            hop_length=hop_length,
            n_mels=n_mels,
            center=padding == "center",
            power=1,
        )

    def forward(self, audio, **kwargs):
        if self.padding == "same":
            pad = self.mel_spec.win_length - self.mel_spec.hop_length
            audio = torch.nn.functional.pad(audio, (pad // 2, pad // 2), mode="reflect")
        mel = self.mel_spec(audio)
        features = safe_log(mel)
        return features

def safe_log(x: torch.Tensor, clip_val: float = 1e-7) -> torch.Tensor:
    """
    Computes the element-wise logarithm of the input tensor with clipping to avoid near-zero values.

    Args:
        x (Tensor): Input tensor.
        clip_val (float, optional): Minimum value to clip the input tensor. Defaults to 1e-7.

    Returns:
        Tensor: Element-wise logarithm of the input tensor with clipping applied.
    """
    return torch.log(torch.clip(x, min=clip_val))

In [69]:
vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz").to(DEVICE)
feature_extractor = MelSpectrogramFeatures(
    sample_rate=24000,
    n_fft=1024,
    hop_length=256,
    n_mels=100,
    padding="center"
)

times = []
for file_path in tqdm(files_to_process):
    resample_transform = torchaudio.transforms.Resample(orig_freq=48000, new_freq=24000)
    waveform, sr = torchaudio.load(file_path)
    if sr != 24000:
        waveform = resample_transform(waveform)
    spectrogram = feature_extractor(waveform).to(DEVICE)

    start = time.time()
    
    waveforms = vocos.decode(spectrogram)

    end = time.time()
    times.append(end-start)
    
    if not os.path.exists('generated/vocos/' + file_path[5:13]):
        os.makedirs('generated/vocos/' + file_path[5:13])
    torchaudio.save('generated\\vocos\\' + file_path[5:], waveforms.cpu().squeeze(1), 22050)

print('Średni czas przetwarzania:', np.mean(times))

  state_dict = torch.load(model_path, map_location="cpu")
100%|██████████| 1439/1439 [01:51<00:00, 12.86it/s]

Średni czas przetwarzania: 0.07181362976540784





### BigV-GAN

In [35]:
model = bigvgan.BigVGAN.from_pretrained('nvidia/bigvgan_v2_24khz_100band_256x', use_cuda_kernel=False)
model.remove_weight_norm()
model = model.eval().to(DEVICE)

times = []
for file_path in tqdm(files_to_process):
    spectrogram = get_mel_from_file(file_path, 100, sr=model.h.sampling_rate)

    start = time.time()
    
    with torch.inference_mode():
        waveforms = model(spectrogram.unsqueeze(0).to(DEVICE)).squeeze(0).cpu()

    end = time.time()
    times.append(end-start)
    
    if not os.path.exists('generated/bigvgan/' + file_path[5:13]):
        os.makedirs('generated/bigvgan/' + file_path[5:13])
    torchaudio.save('generated\\bigvgan\\' + file_path[5:], waveforms.squeeze(1), model.h.sampling_rate)

print('Średni czas przetwarzania:', np.mean(times))

Loading weights from nvidia/bigvgan_v2_24khz_100band_256x
Removing weight norm...


100%|██████████| 1439/1439 [07:59<00:00,  3.00it/s]

Średni czas przetwarzania: 0.3193237050860354





### DiffWave

In [64]:
diffwave = DiffWaveVocoder.from_hparams(source="speechbrain/tts-diffwave-ljspeech", savedir="pretrained_models/tts-diffwave-ljspeech")

times = []
for file_path in tqdm(files_to_process):
    spectrogram = get_mel_from_file(file_path, 80)
    
    start = time.time()
    
    waveforms = diffwave.decode_batch(
        spectrogram,
        hop_len=256,
        fast_sampling=True,
        fast_sampling_noise_schedule=[0.0001, 0.001, 0.01, 0.05, 0.2, 0.5],
    )

    end = time.time()
    times.append(end-start)
    
    if not os.path.exists('generated/diffwave/' + file_path[5:13]):
        os.makedirs('generated/diffwave/' + file_path[5:13])
    torchaudio.save('generated\\diffwave\\' + file_path[5:], waveforms.squeeze(1), 22050)

print('Średni czas przetwarzania:', np.mean(times))

100%|██████████| 1439/1439 [3:12:38<00:00,  8.03s/it]  

Średni czas przetwarzania: 8.018279543015757





### UnivNet

In [44]:
model_id_or_path = "dg845/univnet-dev"
model = UnivNetModel.from_pretrained(model_id_or_path)
feature_extractor = UnivNetFeatureExtractor.from_pretrained(model_id_or_path)

times = []
for file_path in tqdm(files_to_process):
    input_audio, sampling_rate = librosa.load(file_path, sr=feature_extractor.sampling_rate)
    inputs = feature_extractor(
        input_audio, 
        sampling_rate=sampling_rate, 
        pad_end=True, 
        return_tensors="pt"
    )

    start = time.time()
    
    with torch.no_grad():
        waveforms = model(**inputs)
    waveforms = torch.Tensor(feature_extractor.batch_decode(**waveforms))
    
    end = time.time()
    times.append(end-start)
    
    if not os.path.exists('generated/univnet/' + file_path[5:13]):
        os.makedirs('generated/univnet/' + file_path[5:13])
    torchaudio.save('generated\\univnet\\' + file_path[5:], waveforms.squeeze(1), feature_extractor.sampling_rate)

print('Średni czas przetwarzania:', np.mean(times))

100%|██████████| 1439/1439 [06:52<00:00,  3.49it/s]

Średni czas przetwarzania: 0.25812796456189185





### WaveGlow

In [58]:
waveglow = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_waveglow', model_math='fp32')
waveglow = waveglow.to(DEVICE)

times = []
for file_path in tqdm(files_to_process):
    y, sr = librosa.load(file_path, sr=22050, mono=True)
    spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=1024, hop_length=256, win_length=1024, n_mels=80, fmin=0, fmax=8000)
    log_mel_spectrogram = np.log(spectrogram + 1e-6)
    mel_input = torch.tensor(log_mel_spectrogram).unsqueeze(0).float().to(DEVICE)

    start = time.time()
    
    with torch.no_grad():
        waveforms = waveglow.infer(mel_input)

    end = time.time()
    times.append(end-start)
    
    if not os.path.exists('generated/waveglow/' + file_path[5:13]):
        os.makedirs('generated/waveglow/' + file_path[5:13])
    torchaudio.save('generated\\waveglow\\' + file_path[5:], waveforms.squeeze(1).cpu(), 22050)

print('Średni czas przetwarzania:', np.mean(times))

Using cache found in C:\Users\wikto/.cache\torch\hub\NVIDIA_DeepLearningExamples_torchhub
100%|██████████| 1439/1439 [3:26:19<00:00,  8.60s/it]  

Średni czas przetwarzania: 7.269831891354793





### Griffin-Lim

In [11]:
times = []
for file_path in tqdm(files_to_process):
    spectrogram = get_mel_from_file(file_path, 80, 24000, compression=False)
    
    start = time.time()
    
    spectrogram = spectrogram.numpy()
    spectrogram = np.maximum(spectrogram, 1e-10)

    mel_basis = librosa.filters.mel(sr=24000, n_fft=1024, n_mels=spectrogram.shape[0], fmin=0, fmax=8000)
    inv_mel = np.linalg.pinv(mel_basis)
    linear_spectrogram = np.dot(inv_mel, spectrogram)

    linear_spectrogram = np.maximum(linear_spectrogram, 1e-10)
    linear_spectrogram = librosa.db_to_amplitude(librosa.power_to_db(linear_spectrogram))

    waveforms = librosa.griffinlim(
        linear_spectrogram, 
        hop_length=256, 
        win_length=1024, 
        n_iter=32
    )

    end = time.time()
    times.append(end-start)
    
    if not os.path.exists('generated/griffinlim/' + file_path[5:13]):
        os.makedirs('generated/griffinlim/' + file_path[5:13])
    torchaudio.save('generated\\griffinlim\\' + file_path[5:], torch.Tensor(waveforms).unsqueeze(0), 24000)

print('Średni czas przetwarzania:', np.mean(times))

100%|██████████| 1439/1439 [05:45<00:00,  4.16it/s]

Średni czas przetwarzania: 0.2287171286290681





# Liczenie metryk

In [None]:
from scripts.metrics import calculate_metrics_for_all_data

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
models = ['bigvgan', 'diffwave', 'hifigan', 'univnet', 'vocos', 'waveglow', 'griffinlim'][-1:]
src_paths = [str(file) for file in Path('data_preprocessed').rglob('*') if file.is_file()]
gen_paths = [[str(file) for file in Path(f'generated_preprocessed/{model}').rglob('*') if file.is_file()] for model in models]

In [14]:
metrics_gl = calculate_metrics_for_all_data(src_paths, gen_paths, models, 10)
pd.DataFrame(metrics_gl)

1it [01:18, 78.95s/it][00:00<?, ?it/s]
1it [01:16, 76.37s/it][01:33<13:58, 93.18s/it]
1it [01:15, 75.53s/it][03:01<12:04, 90.60s/it]
1it [00:59, 59.72s/it][04:31<10:29, 89.91s/it]
1it [01:02, 62.47s/it][05:41<08:13, 82.24s/it]
1it [01:00, 60.71s/it][06:55<06:37, 79.43s/it]
1it [01:01, 61.88s/it][08:08<05:07, 76.91s/it]
1it [01:17, 77.58s/it][09:21<03:47, 75.70s/it]
1it [01:24, 84.85s/it][10:50<02:40, 80.17s/it]
1it [01:08, 68.72s/it][12:27<01:25, 85.20s/it]
100%|██████████| 10/10 [13:47<00:00, 82.76s/it]


Unnamed: 0,griffinlim
sdr,17.310499
fad,0.201146
kid,9.5e-05
mcd,1.134115
pesq,2.439989


In [39]:
metrics = calculate_metrics_for_all_data(src_paths, gen_paths, models, 10)
pd.DataFrame(metrics)

6it [06:48, 68.13s/it][00:00<?, ?it/s]
6it [06:35, 65.86s/it][07:02<1:03:22, 422.49s/it]
6it [06:48, 68.11s/it][13:49<55:07, 413.44s/it]  
6it [06:35, 65.99s/it][20:51<48:40, 417.18s/it]
6it [06:45, 67.64s/it][27:39<41:21, 413.54s/it]
6it [06:42, 67.09s/it][34:38<34:37, 415.58s/it]
6it [06:41, 66.89s/it][41:33<27:41, 415.43s/it]
6it [06:40, 66.77s/it][48:26<20:44, 414.78s/it]
6it [06:42, 67.08s/it][55:20<13:48, 414.32s/it]
6it [05:57, 59.58s/it][1:02:14<06:54, 414.42s/it]
100%|██████████| 10/10 [1:08:24<00:00, 410.42s/it]


Unnamed: 0,bigvgan,diffwave,hifigan,univnet,vocos,waveglow
sdr,17.878102,12.2489,17.183513,22.273618,23.081748,14.820803
fad,0.202845,0.522125,0.232035,0.155997,0.120987,0.419708
kid,0.000111,0.000706,0.000154,4.8e-05,1.5e-05,0.000478
mcd,1.55418,1.399038,0.976735,0.886405,0.781028,1.674078
pesq,1.115202,1.109581,3.35906,3.126985,3.718057,1.415296


In [29]:
metrics = calculate_metrics_for_all_data(src_paths, gen_paths, models, 10)
pd.DataFrame(metrics)

7it [6:35:46, 3392.32s/it]00<?, ?it/s]
7it [06:51, 58.84s/it][6:35:57<59:23:39, 23757.68s/it]
7it [06:48, 58.32s/it][6:42:59<22:17:23, 10030.46s/it]
7it [06:14, 53.45s/it][6:49:58<10:58:12, 5641.77s/it] 
7it [06:23, 54.79s/it][6:56:20<5:56:33, 3565.51s/it] 
7it [06:43, 57.68s/it][7:02:53<3:21:46, 2421.35s/it]
7it [06:39, 57.07s/it][7:09:46<1:55:53, 1738.42s/it]
7it [06:33, 56.14s/it][7:16:35<1:05:11, 1303.74s/it]
7it [06:39, 57.04s/it][7:23:17<33:53, 1016.81s/it]  
7it [06:28, 55.44s/it][7:30:06<13:46, 826.65s/it] 
100%|██████████| 10/10 [7:36:43<00:00, 2740.32s/it]


Unnamed: 0,bigvgan,diffwave,hifigan,univnet,vocos,waveglow,griffinlim
sdr,18.789853,12.2489,17.183513,22.273618,23.081748,14.820803,23.223327
fad,0.168095,0.522125,0.232035,0.155997,0.120987,0.419708,0.13122
kid,6.7e-05,0.000706,0.000154,4.8e-05,1.5e-05,0.000478,2.2e-05
mcd,1.037949,1.399038,0.976735,0.886405,0.781028,1.674078,0.78936
pesq,1.497719,1.109581,3.35906,3.126985,3.718057,1.415296,3.851959


In [17]:
metrics = calculate_metrics_for_all_data(src_paths, gen_paths, models, 20)
pd.DataFrame(metrics)

7it [03:35, 30.80s/it]00:00<?, ?it/s]
100%|██████████| 1/1 [03:40<00:00, 220.67s/it]


Unnamed: 0,bigvgan,diffwave,hifigan,univnet,vocos,waveglow,griffinlim
sdr,18.812045,13.483749,19.276484,19.37932,21.84897,14.860216,21.792383
fad,0.278577,0.633623,0.448475,0.347725,0.280164,0.527119,0.324665
kid,0.000103,0.00057,0.000326,0.000179,0.000151,0.000393,0.000167
mcd,1.009076,2.157805,1.783313,1.687125,1.368798,1.581355,1.34864
pesq,1.461782,1.126676,1.440835,1.463529,1.95465,1.740679,2.058305


In [15]:
metrics = calculate_metrics_for_all_data(src_paths, gen_paths, models, 20)
pd.DataFrame(metrics)

7it [06:39, 57.00s/it]00:00<?, ?it/s]
100%|██████████| 1/1 [06:55<00:00, 415.72s/it]


Unnamed: 0,bigvgan,diffwave,hifigan,univnet,vocos,waveglow,griffinlim
sdr,19.718965,15.746161,19.322787,18.70624,21.263581,16.972439,21.452104
fad,0.526801,2.540538,1.034007,0.903713,0.505785,1.997657,0.564923
kid,0.000221,0.004241,0.001102,0.000738,0.000267,0.003155,0.000218
mcd,1.029601,1.74764,1.415042,1.370992,1.101304,1.590749,1.035264
pesq,1.41867,1.194872,1.543204,1.574431,1.8418,1.221364,1.874869
