In [None]:
import numpy as np
import torch

import matplotlib.pyplot as plt

import soundfile as sf
import librosa
import IPython.display as ipd

import os
import sys

# Root of the DACSynthformer checkout. The default is the original author's
# location; set the DACSYNTHFORMER_HOME environment variable to run the
# notebook from a different machine or directory without editing this cell.
homedir = os.environ.get("DACSYNTHFORMER_HOME", "/home/lonce/working_local/DACSynthformer")

# `bigvgan` and `meldataset` are imported right after this block from the
# BigVGAN subdirectory of the checkout, so that directory must be on sys.path.
subdir_path = os.path.abspath(os.path.join(homedir, "BigVGAN"))
# Guard against adding a duplicate entry every time the cell is re-run.
if subdir_path not in sys.path:
    sys.path.append(subdir_path)
import bigvgan
from meldataset import get_mel_spectrogram

In [None]:
# Load the pretrained BigVGAN vocoder from the checkpoint directory inside the
# local checkout (pure PyTorch path: the fused CUDA kernel is disabled).
bvgmodel = bigvgan.BigVGAN.from_pretrained(homedir+'/BigVGAN/bigvgan_v2_44khz_128band_512x', use_cuda_kernel=False)
# The model is only used for inference here: switch to eval mode, then remove
# weight norm from the model.
bvgmodel = bvgmodel.eval()
bvgmodel.remove_weight_norm()

# Flip to False to silence the configuration dump below.
SHOW_MODEL_CONFIG = True
if SHOW_MODEL_CONFIG:
    # bvgmodel.h is the model's hyper-parameter object (sampling_rate,
    # hop_size, num_mels, ... are read from it in later cells).
    print(f'bvgmodel Parameters : {bvgmodel.h}')

In [None]:
def wav2mel(wav_data):
    """
    Compute the mel spectrogram of a waveform with BigVGAN's own front end.

    Args:
        wav_data: the audio samples; anything torch.FloatTensor can be built
            from (the notebook passes the 1-D array returned by librosa.load).

    Returns:
        The result of get_mel_spectrogram for a batch of one waveform, using
        the settings stored in the global ``bvgmodel.h``
        (expected shape [1, C_mel, T_frame]).
    """
    samples = torch.FloatTensor(wav_data)
    # Add the leading batch axis: [T_time] -> [1, T_time].
    batched = torch.unsqueeze(samples, 0)
    return get_mel_spectrogram(batched, bvgmodel.h)

In [None]:
# Input recording, and the directory name used to build the output path below.
wav_path = homedir + "/testdata/Lala_data/lala_wav/04.fa.wav"
destination_dir = "deleteme"

# Load as mono at the sampling rate given in the model configuration.
wav_data, sample_rate = librosa.load(wav_path, mono=True, sr=bvgmodel.h.sampling_rate)

# Base name of the wav file with its last extension dropped
# (".../04.fa.wav" -> "04.fa"), reused for the .npy path.
filename, _wav_ext = os.path.splitext(os.path.basename(wav_path))
mel_path = os.path.join(destination_dir, f"{filename}.npy")

In [None]:
# Encode the loaded waveform with the BigVGAN mel front end, then report
# which file was processed.
bvg_mel_spec_encode = wav2mel(wav_data=wav_data)
print(f'wav2mel {wav_path}')

In [None]:
# plot the BigVGAN encoded spectrogram
%matplotlib inline
bvg_mel_spec = bvg_mel_spec_encode.detach().cpu().numpy()
print(f'shape is : {bvg_mel_spec.shape}')
if bvg_mel_spec.shape[0] == 1:
    bvg_mel_spec=np.squeeze(bvg_mel_spec, axis=0) 
    
plt.figure(figsize=(10, 4))
librosa.display.specshow(bvg_mel_spec, sr=bvgmodel.h.sampling_rate, hop_length=bvgmodel.h.hop_size, x_axis="time", y_axis="mel")
plt.colorbar(label="Amplitude (dB)")
plt.title("BigVGAN encoded Mel Spectrogram")
plt.xlabel("Time (s)")
plt.ylabel("Mel Frequency")
plt.show()

In [None]:
# Vocode the BigVGAN mel spectrogram back to audio.

# inference_mode disables autograd tracking for the forward pass.
with torch.inference_mode():
    wav_gen = bvgmodel(bvg_mel_spec_encode) # wav_gen is FloatTensor with shape [B(1), 1, T_time] and values in [-1, 1]
wav_gen_float = wav_gen.squeeze(0).cpu() # wav_gen_float is FloatTensor with shape [1, T_time]
# Remove the remaining singleton axis and hand a plain 1-D numpy array to librosa / IPython.
adata = wav_gen_float.squeeze().numpy()

# Use the model's own sampling rate everywhere instead of a literal 44100, so
# the time axis and playback speed stay right if another checkpoint is loaded.
plt.figure(figsize=(10, 4))
librosa.display.waveshow(adata, sr=bvgmodel.h.sampling_rate)
plt.xlabel("Time (seconds)")
plt.ylabel("Amplitude")
plt.title("BVG mel->Waveform")
plt.show()

# Last expression of the cell: renders the audio player.
ipd.Audio(adata, rate=bvgmodel.h.sampling_rate)

In [None]:
#now lets generate the spectrogramfrom the audio using Librosa

plt.figure(figsize=(10, 4))
mel_spec = librosa.feature.melspectrogram(y=adata,sr=bvgmodel.h.sampling_rate, n_mels=bvgmodel.h.num_mels)
mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
librosa.display.specshow(mel_spec_db, sr=bvgmodel.h.sampling_rate, hop_length=bvgmodel.h.hop_size, x_axis="time", y_axis="mel")
plt.colorbar(label="Amplitude (dB)")
plt.title("Librosa Mel Spectrogram fro bvg audio")
plt.xlabel("Time (s)")
plt.ylabel("Mel Frequency")
plt.show()


In [None]:
# Convert mel spectrogram back to linear STFT spectrogram
#not the DB one!!!!
mel_to_stft = librosa.feature.inverse.mel_to_stft(mel_spec, sr=bvgmodel.h.sampling_rate)

# Use Griffin-Lim to reconstruct audio from STFT
y_reconstructed = librosa.griffinlim(mel_to_stft, n_iter=64) #weirdly, the n_iter seems to have no effect

# Listen to the output
ipd.Audio(y_reconstructed, rate=bvgmodel.h.sampling_rate)

In [None]:
# Can librosa invert the spectrogram encoded using BigVGAN?
#
# Feeding the BigVGAN mel through librosa.db_to_power gives nonsense because
# the BigVGAN front end does not produce decibels.
# NOTE(review): BigVGAN's meldataset returns log(clamp(mel_magnitude, 1e-5)),
# i.e. the natural log of a *magnitude* (power=1) mel spectrogram -- confirm
# against the BigVGAN checkout in use. The matching inverse is therefore
# exp(), followed by mel_to_stft with power=1.0 and the model's own
# STFT / mel-filter settings (assumed present on bvgmodel.h as in config.json).
bvg_log_mel = bvg_mel_spec_encode.detach().cpu().numpy()
# Drop the batch axis so the reconstructed audio is a plain 1-D signal.
if bvg_log_mel.ndim == 3 and bvg_log_mel.shape[0] == 1:
    bvg_log_mel = np.squeeze(bvg_log_mel, axis=0)
bvg_mel_magnitude = np.exp(bvg_log_mel)

mel_to_stft = librosa.feature.inverse.mel_to_stft(
    bvg_mel_magnitude,
    sr=bvgmodel.h.sampling_rate,
    n_fft=bvgmodel.h.n_fft,
    power=1.0,
    fmin=bvgmodel.h.fmin,
    fmax=bvgmodel.h.fmax,
)

# Use Griffin-Lim to estimate the phase and reconstruct audio from the STFT
# magnitude, with the same hop/window the mel spectrogram was computed with.
# (Author's observation: n_iter seemed to have no audible effect.)
y_reconstructed = librosa.griffinlim(
    mel_to_stft,
    n_iter=64,
    hop_length=bvgmodel.h.hop_size,
    win_length=bvgmodel.h.win_size,
)

# Listen to the output
ipd.Audio(y_reconstructed, rate=bvgmodel.h.sampling_rate)
