In [None]:
import os

# Select the GPU BEFORE torch / NeMo are imported. CUDA reads these variables
# once, when it is first initialised; if any import below touches CUDA, setting
# them afterwards would silently have no effect.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="1"

import logging, sys
# Disable every logging call at or below the maximum level, i.e. all of them.
logging.disable(sys.maxsize)

from nemo.collections.tts.models.base import SpectrogramGenerator, Vocoder, TextToWaveform
from nemo.collections.tts.models import FastPitchModel
from nemo.collections.tts.modules.hifigan_modules import Generator as Hifigan_generator

import IPython.display as ipd
import numpy as np
from PIL import Image
from matplotlib.pyplot import imshow
from matplotlib import pyplot as plt
import torch
import json
import librosa
from nemo.collections.tts.models import TwoStagesModel
from nemo.collections.asr.modules import AudioToMelSpectrogramPreprocessor
from nemo.collections.asr.parts.preprocessing.features import WaveformFeaturizer
from nemo.collections.tts.models import HifiGanModel
import torchaudio

In [None]:
# Audio loader used to read the ground-truth recordings at 44100 Hz.
wav_featurizer = WaveformFeaturizer(
    sample_rate=44100,
    int_values=False,
    augmentor=None,
)

# Arguments for the mel-spectrogram front end, grouped by purpose.
mel_processor_kwargs = dict(
    # Framing: the window and stride are supplied through the n_window_*
    # arguments, so the window_size / window_stride alternatives stay None.
    sample_rate=44100,
    window_size=None,
    window_stride=None,
    n_window_size=2048,
    n_window_stride=512,
    window="hann",
    n_fft=None,
    # Filterbank: 80 output features, no explicit upper frequency limit.
    features=80,
    lowfreq=0,
    highfreq=None,
    mag_power=1.0,
    # Log compression with an additive guard value.
    log=True,
    log_zero_guard_type="add",
    log_zero_guard_value=1e-05,
    # No normalisation, pre-emphasis or dither is applied.
    normalize=None,
    preemph=None,
    dither=0.0,
    # Padding / layout options.
    pad_to=1,
    pad_value=0,
    frame_splicing=1,
    exact_pad=False,
    stft_exact_pad=False,
    stft_conv=False,
)
mel_processor = AudioToMelSpectrogramPreprocessor(**mel_processor_kwargs)

In [None]:
# HiFi-GAN generator. These architecture settings have to describe the same
# network as the checkpoint loaded below, otherwise load_state_dict will fail.
vocoder = Hifigan_generator(
    resblock=1,
    upsample_rates=[8, 8, 4, 2],
    upsample_kernel_sizes=[16, 16, 4, 4],
    upsample_initial_channel=512,
    resblock_kernel_sizes=[3, 7, 11],
    resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
)
nemo_gen_keys = list(vocoder.state_dict().keys())

# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
# map_location="cpu" makes loading independent of the device the checkpoint was
# saved from (only one GPU is exposed to this process); the weights are moved
# to the GPU together with the module further down.
adlr_gen_ckpt = torch.load("/home/pneekhara/Checkpoints/g_01224000", map_location="cpu")['generator']
adlr_gen_keys = adlr_gen_ckpt.keys()

# Weights are transplanted by POSITION: the i-th checkpoint tensor is assigned
# to the i-th key of the NeMo module (presumably because the two code bases
# name their parameters differently -- verify if the architecture changes).
# zip() would silently drop trailing entries, so check the counts first.
if len(adlr_gen_keys) != len(nemo_gen_keys):
    raise ValueError(
        "Checkpoint has {} generator tensors but the NeMo generator expects {}".format(
            len(adlr_gen_keys), len(nemo_gen_keys)
        )
    )
new_nemo_ckpt = {nemo_key: adlr_gen_ckpt[adlr_key] for adlr_key, nemo_key in zip(adlr_gen_keys, nemo_gen_keys)}
vocoder.load_state_dict(new_nemo_ckpt)

# Inference only: move to the GPU, switch to eval mode and run in half precision.
vocoder = vocoder.cuda().eval().half()

# Second stage: Griffin-Lim, 64 iterations, FFT size 2048 and hop 512.
griffin_lim_cfg = {
    '_target_': 'nemo.collections.tts.models.two_stages.GriffinLimModel',
    'cfg': {'n_iters': 64, 'n_fft': 2048, 'l_hop': 512},
}

# First stage: maps the 80-bin mel spectrogram back to a linear spectrogram.
# The spelling "Psuedo" is part of the target path string; do not "correct" it
# without checking the class name in NeMo.
mel_to_linear_cfg = {
    '_target_': 'nemo.collections.tts.models.two_stages.MelPsuedoInverseModel',
    'cfg': {
        'sampling_rate': 44100,
        'n_fft': 2048,
        'mel_fmin': 0,
        'mel_fmax': None,
        'mel_freq': 80,
    },
}

cfg = {'linvocoder': griffin_lim_cfg, 'mel2spec': mel_to_linear_cfg}
vocoder_gl = TwoStagesModel(cfg).eval().cuda()

In [None]:
# Restore the HiFi-GAN model from its checkpoint file.
hifimixed_ckpt_path = "/home/pneekhara/PreTrainedModels/HifiGan--val_loss=0.09-epoch=499-last.ckpt"
vocoder_hifimixed = HifiGanModel.load_from_checkpoint(hifimixed_ckpt_path)

# Switch to eval mode and move to the GPU. Kept as the cell's last expression,
# so the notebook still shows the returned model as the cell output.
vocoder_hifimixed.eval().cuda()

In [None]:
data_dir = "/home/pneekhara/Datasets/78419/Hi_Fi_TTS_v_0_backup"
experiment_base_dir = "/home/pneekhara/ExperimentsOverfitting/"
# Manifest subset name used in the manifest file name for each speaker id.
clean_other_mapping = {
    92 : 'clean',
    6097 : 'clean'
}

# Playback rate for every audio widget below; this is the same 44100 Hz that
# wav_featurizer and mel_processor are configured with.
playback_rate = 44100
# Number of dev-set utterances to listen to per speaker.
num_val = 3

for speaker in [ 6097, 92 ]:
    manifest_path = os.path.join(data_dir, "{}_manifest_{}_{}.json".format(speaker, clean_other_mapping[speaker], "dev"))

    # The manifest holds one JSON record per line; read only the first num_val.
    val_records = []
    with open(manifest_path, "r") as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            val_records.append( json.loads(line) )
            if len(val_records) >= num_val:
                break

    print ("**** REAL VALIDATION *****")
    for vidx, val_record in enumerate(val_records):
        print("Audio file:", val_record['audio_filepath'].split("/")[-1] )
        audio_path = os.path.join( data_dir, val_record['audio_filepath'] )
        real_wav = wav_featurizer.process(audio_path)
        # NOTE(review): the length is passed with shape (1, 1) and the leading
        # axis of the result is dropped right after. Both steps are kept exactly
        # as they were because the vocoders below appear to rely on the
        # resulting tensor shape -- confirm before "simplifying" either line.
        real_mel, _ = mel_processor.get_features(real_wav[None], torch.tensor([[real_wav.shape[0]]]).long() )
        real_mel = real_mel[0].cuda()

        # Run the three vocoders on the same ground-truth mel spectrogram and
        # bring every result back to the host as a NumPy array.
        with torch.no_grad():
            # The standalone generator was cast to half precision, so its input must be too.
            vocoded_audio_real_hifi = vocoder(x=real_mel.half()).squeeze(1)
            vocoded_audio_real_hifi = vocoded_audio_real_hifi.to('cpu').numpy()
            # as_tensor accepts both tensors and arrays, so this conversion is
            # safe whichever of the two the Griffin-Lim model returns.
            vocoded_audio_real_gl = torch.as_tensor(
                vocoder_gl.convert_spectrogram_to_audio(spec=real_mel)
            ).cpu().numpy()
            vocoded_audio_real_hfimixed = vocoder_hifimixed.convert_spectrogram_to_audio(spec=real_mel).cpu().numpy()

        print ("{}) {}".format(vidx+1, val_record['text']))
        print("Ground Truth Audio for speaker:", speaker)
        ipd.display(ipd.Audio(real_wav, rate=playback_rate))
        print("Vocoded (HiFiGAN) from real spectrogram for speaker:", speaker)
        ipd.display(ipd.Audio(vocoded_audio_real_hifi, rate=playback_rate))
        print("Vocoded (GL) from real spectrogram for speaker:", speaker)
        ipd.display(ipd.Audio(vocoded_audio_real_gl, rate=playback_rate))
        print("Vocoded (HiFiGAN Mixed) from real spectrogram for speaker:", speaker)
        ipd.display(ipd.Audio(vocoded_audio_real_hfimixed, rate=playback_rate))
    print ("************************")

    

In [None]:
# Display the configuration attached to the checkpoint-restored HiFi-GAN model
# (shown as the cell output because it is the cell's last expression).
vocoder_hifimixed.cfg

In [None]:
# Write one vocoded utterance to disk as a 44100 Hz wav file.
# The original referenced `vocoded_audio_real`, which no visible cell assigns,
# so a fresh "Restart & Run All" stopped here with a NameError. The HiFi-GAN
# output left over from the last iteration of the validation loop is used
# instead; it is cast to float32 before saving.
# NOTE(review): confirm that the HiFi-GAN output is the one meant to be saved,
# and that it has the 2-D layout torchaudio.save expects.
torchaudio.save("sample.wav", torch.tensor(vocoded_audio_real_hifi, dtype=torch.float), 44100)