In [None]:
import matplotlib as plt
import numpy as np 
import matplotlib.pyplot as plt
import librosa, librosa.display
import IPython.display as ipd
import pathlib
import warnings
import random
import os

!pip install wavenet_vocoder

In [None]:
# Hide FutureWarning messages so they do not clutter the cell outputs.
warnings.filterwarnings("ignore", category=FutureWarning)

# One seed shared by every random number source used in this notebook.
seed = 42

# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it
# here is expected to affect only child processes — confirm this is intended.
os.environ['PYTHONHASHSEED'] = str(seed)

# Seed Python's `random` module first, then NumPy's global generator.
for seed_rng in (random.seed, np.random.seed):
    seed_rng(seed)


In [None]:
# How many clips from the dataset are used in the experiments below.
num_of_test_files = 10

# Directory holding the wav files (path is relative to the notebook's working directory).
dataset_path = pathlib.Path('../input/ljspeech11/LJSpeech-1.1/wavs')

In [None]:
# Pick the clips to load. Sorting makes the selection deterministic
# (`iterdir()`/`glob()` order is filesystem dependent), and slicing keeps
# exactly `num_of_test_files` clips (the previous `index > n` check loaded n + 1).
wav_paths = sorted(dataset_path.glob('*.wav'))[:num_of_test_files]
if not wav_paths:
    raise FileNotFoundError(f"No .wav files found in {dataset_path}")

# Decode every selected clip; `sr` is the sample rate returned by librosa.load
# for the last clip and is reused by the cells below.
wavs = []
for wav_path in wav_paths:
    file_wav, sr = librosa.load(wav_path)
    wavs.append(file_wav)

# audio sample (last loaded clip)
ipd.Audio(file_wav, rate=sr)

In [None]:
# Compute the linear (STFT magnitude) spectrogram and the mel spectrogram of each clip.
spectrograms = [np.abs(librosa.stft(wav)) for wav in wavs]

# Extra options forwarded to librosa.feature.melspectrogram.
kwargs = {"n_mels": 80}

# The audio is passed as `y=` because recent librosa releases only accept it as a
# keyword argument; `sr` is passed explicitly so the mel filter bank matches the
# sample rate the clips were loaded with.
mel_spectrograms = [
    librosa.feature.melspectrogram(y=wav, sr=sr, **kwargs) for wav in wavs
]

In [None]:
# Invert every magnitude spectrogram back to audio in two ways:
#   * Griffin-Lim (librosa.griffinlim)
#   * a plain inverse STFT applied directly to the magnitudes
inv_gl = [librosa.griffinlim(magnitude) for magnitude in spectrograms]
inv_no_phase = [librosa.istft(magnitude) for magnitude in spectrograms]

In [None]:
# Stack the waveform of the last clip and of its two reconstructions on shared axes.
fig, ax = plt.subplots(nrows=3, ncols=1, sharex=True, sharey=True, figsize=(30,15))

# (signal, title, colour) for each row, top to bottom. The third row now shows
# `inv_no_phase` (it previously re-plotted the Griffin-Lim output under the
# "Magnitude-only istft" title).
panels = [
    (wavs[-1], 'Original', 'b'),
    (inv_gl[-1], 'Griffin-Lim reconstruction', 'g'),
    (inv_no_phase[-1], 'Magnitude-only istft reconstruction', 'r'),
]

for axis, (signal, title, color) in zip(ax, panels):
    librosa.display.waveshow(signal, sr=sr, color=color, ax=axis)
    axis.set_title(title)
    if axis is not ax[-1]:
        # Only the bottom panel keeps its x label and tick labels.
        axis.set(xlabel=None)
        axis.label_outer()


In [None]:
import torch

# Load the MelGAN vocoder through torch.hub: the first value names the hub
# repository, the second the entry point to call in it.
melgan_repo = 'descriptinc/melgan-neurips'
melgan_entrypoint = 'load_melgan'
vocoder = torch.hub.load(melgan_repo, melgan_entrypoint)


In [None]:
# Turn the last mel spectrogram into a torch tensor and prepend a leading axis
# of size 1; the cell output shows the resulting shape.
tensor = torch.Tensor(mel_spectrograms[-1]).unsqueeze(0)
tensor.shape

In [None]:
# Feed the last clip (with a leading axis of size 1) to the vocoder.
# NOTE(review): calling the vocoder on a waveform presumably returns its mel
# spectrogram, as the variable name suggests — confirm against the hub model.
audio_batch = torch.from_numpy(wavs[-1]).unsqueeze(0)
mel = vocoder(audio_batch)
print(mel.shape) # TODO check this to properly compute the mel spectrogram

# Map the vocoder output back to a waveform and convert it to a NumPy array on the CPU.
inv_melgan = vocoder.inverse(mel).squeeze().cpu().numpy()

In [None]:
# Audio players, in order: original clip, Griffin-Lim reconstruction,
# magnitude-only istft reconstruction, MelGAN reconstruction.
for clip in (wavs[-1], inv_gl[-1], inv_no_phase[-1], inv_melgan):
    ipd.display(ipd.Audio(clip, rate=sr))