In [None]:
import logging, sys
# Silence the logging module entirely for this notebook: disabling at
# sys.maxsize suppresses records of every severity level.
logging.disable(sys.maxsize)

from nemo.collections.tts.models.base import SpectrogramGenerator, Vocoder, TextToWaveform
from nemo.collections.tts.models import FastPitchModel
from nemo.collections.tts.modules.hifigan_modules import Generator as Hifigan_generator

import IPython.display as ipd
import numpy as np
from PIL import Image
from matplotlib.pyplot import imshow
from matplotlib import pyplot as plt
import torch
import json
import librosa
import os
from nemo.collections.tts.models import TwoStagesModel
from nemo.collections.asr.modules import AudioToMelSpectrogramPreprocessor
from nemo.collections.asr.parts.preprocessing.features import WaveformFeaturizer
from nemo.collections.asr.models import EncDecSpeakerLabelModel
from nemo.collections.tts.models import HifiGanModel

# Restrict this kernel to a single GPU, addressed by PCI bus order.
# NOTE(review): these are set after torch/NeMo have been imported; they only
# take effect if CUDA has not been initialised by then - confirm.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",   # see issue #152
    "CUDA_VISIBLE_DEVICES": "1",
})

In [None]:
def infer(spec_gen_model, vocoder_model, str_input, speaker = None):
    """Synthesize ``str_input`` and return ``(spectrogram, audio)``.

    The text is tokenized with ``spec_gen_model.parse``, converted to a
    spectrogram with ``spec_gen_model.generate_spectrogram`` and then vocoded
    with ``vocoder_model``. All model calls run under ``torch.no_grad()``.

    Args:
        spec_gen_model: Object exposing ``parse(str)`` and
            ``generate_spectrogram(tokens=..., speaker=...)``.
        vocoder_model: Either a bare ``Hifigan_generator`` module (called
            directly on a half-precision copy of the spectrogram) or an object
            exposing ``convert_spectrogram_to_audio(spec=...)``.
        str_input: Text to synthesize.
        speaker: Optional speaker id. When given, it is wrapped in a
            one-element long tensor before being passed to the spectrogram
            generator; ``None`` is passed through unchanged.

    Returns:
        Tuple ``(spectrogram, audio)``. Torch tensors are converted to NumPy
        arrays on the CPU. A 3-D spectrogram is reduced to its first entry
        along axis 0 (presumably the batch axis - TODO confirm).
    """
    with torch.no_grad():
        tokens = spec_gen_model.parse(str_input)
        if speaker is not None:
            # Put the speaker id on the same device as the tokens (or, failing
            # that, the model) instead of unconditionally calling .cuda(), so
            # the function also works on CPU or a non-default GPU.
            target_device = getattr(tokens, "device", None)
            if target_device is None:
                target_device = getattr(spec_gen_model, "device", None)
            speaker = torch.tensor([speaker]).long()
            if target_device is None:
                # No device information available: keep the historical behaviour.
                speaker = speaker.cuda()
            else:
                speaker = speaker.to(target_device)
        spectrogram = spec_gen_model.generate_spectrogram(tokens=tokens, speaker = speaker)
        if isinstance(vocoder_model, Hifigan_generator):
            # Bare generator path: fed a half-precision spectrogram
            # (assumes the generator itself runs in fp16 - TODO confirm).
            audio = vocoder_model(x=spectrogram.half()).squeeze(1)
        else:
            audio = vocoder_model.convert_spectrogram_to_audio(spec=spectrogram)

    if spectrogram is not None:
        if isinstance(spectrogram, torch.Tensor):
            spectrogram = spectrogram.detach().to('cpu').numpy()
        if len(spectrogram.shape) == 3:
            spectrogram = spectrogram[0]
    if isinstance(audio, torch.Tensor):
        audio = audio.detach().to('cpu').numpy()
    return spectrogram, audio


# Loader used to read audio files from disk at 44.1 kHz.
wav_featurizer = WaveformFeaturizer(
    sample_rate=44100,
    int_values=False,
    augmentor=None,
)

# Settings for the mel-spectrogram front-end applied to ground-truth audio.
mel_processor_kwargs = dict(
    # sampling and framing
    sample_rate=44100,
    window_size=None,
    window_stride=None,
    n_window_size=2048,
    n_window_stride=512,
    window="hann",
    n_fft=None,
    # mel filterbank
    features=80,
    lowfreq=0,
    highfreq=None,
    mag_power=1.0,
    # log compression
    log=True,
    log_zero_guard_type="add",
    log_zero_guard_value=1e-05,
    # normalisation / signal conditioning
    normalize=None,
    preemph=None,
    dither=0.0,
    # padding and splicing
    pad_to=1,
    pad_value=0,
    frame_splicing=1,
    exact_pad=False,
    stft_exact_pad=False,
    stft_conv=False,
)
mel_processor = AudioToMelSpectrogramPreprocessor(**mel_processor_kwargs)

In [None]:
# Configuration of the two sub-models instantiated by TwoStagesModel.
linvocoder_cfg = {
    '_target_': 'nemo.collections.tts.models.two_stages.GriffinLimModel',
    'cfg': {'n_iters': 64, 'n_fft': 2048, 'l_hop': 512},
}
mel2spec_cfg = {
    '_target_': 'nemo.collections.tts.models.two_stages.MelPsuedoInverseModel',
    'cfg': {
        'sampling_rate': 44100,
        'n_fft': 2048,
        'mel_fmin': 0,
        'mel_fmax': None,
        'mel_freq': 80,
    },
}
cfg = {'linvocoder': linvocoder_cfg, 'mel2spec': mel2spec_cfg}
vocoder_gl = TwoStagesModel(cfg).eval().cuda()

# mixed vocoder - trained on multiple speakers
hifigan_ckpt_path = "/home/pneekhara/PreTrainedModels/HifiGan--val_loss=0.09-epoch=649-last.ckpt"
vocoder = HifiGanModel.load_from_checkpoint(hifigan_ckpt_path)
# Kept as a separate trailing expression so the cell still displays the model.
vocoder.eval().cuda()

In [None]:
# FastPitch model restored from the .nemo archive, in eval mode on the GPU.
spec_model_original = FastPitchModel.restore_from(
    '/home/pneekhara/PreTrainedModels/FastPitch.nemo'
).eval().cuda()

# Dataset and experiment locations.
data_dir = "/home/pneekhara/Datasets/78419/Hi_Fi_TTS_v_0_backup"
experiment_base_dir = "/home/pneekhara/ExperimentsAutomatedNewPitchStats/"

# Speaker id -> subset name used when building the manifest filename.
clean_other_mapping = {speaker_id: 'clean' for speaker_id in (92, 6097)}

# Speaker id -> FastPitch checkpoint path.
full_data_ckpts = {
    92: '/home/pneekhara/Checkpoints/FastPitchSpeaker92Epoch999.ckpt',
    6097: '/home/pneekhara/Checkpoints/FastPitch6097Epoch999.ckpt',
}

# Label (number of fine-tuning epochs, as text) -> FastPitch checkpoint path.
finetuning_ckpts = {
    "200": "/home/pneekhara/CheckpointsOverfitting/FastPitch--v_loss=1.19-epoch=199-last.ckpt",
    "1000": "/home/pneekhara/CheckpointsOverfitting/FastPitch--v_loss=1.16-epoch=999-last.ckpt",
    "10,000": "/home/pneekhara/CheckpointsOverfitting/FastPitch--v_loss=1.48-epoch=9999-last.ckpt",
}

# Maximum number of validation records read from each manifest.
num_val = 7

# Load every fine-tuned FastPitch checkpoint exactly once. Loading inside the
# per-record loop re-read each checkpoint from disk for every validation
# sample. The trade-off is that all fine-tuned models stay resident on the GPU
# at the same time.
finetuned_spec_models = {
    key: FastPitchModel.load_from_checkpoint(ckpt_path).eval().cuda()
    for key, ckpt_path in finetuning_ckpts.items()
}

for speaker in [92]:
    manifest_path = os.path.join(data_dir, "{}_manifest_{}_{}.json".format(speaker, clean_other_mapping[speaker], "dev"))

    # Read at most num_val JSON-lines records from the manifest. The limit is
    # checked before appending so num_val == 0 yields no records, and blank
    # lines are skipped instead of crashing json.loads.
    val_records = []
    with open(manifest_path, "r", encoding="utf-8") as f:
        for line in f:
            if len(val_records) >= num_val:
                break
            if not line.strip():
                continue
            val_records.append(json.loads(line))

    for vidx, val_record in enumerate(val_records):
        print("Audio path:", val_record['audio_filepath'] )
        audio_path = os.path.join( data_dir, val_record['audio_filepath'] )

        # Ground truth: load the waveform, compute its mel spectrogram and
        # run that spectrogram through the HiFi-GAN vocoder.
        real_wav = wav_featurizer.process(audio_path)
        real_mel, _ = mel_processor.get_features(real_wav[None], torch.tensor([[real_wav.shape[0]]]).long() )
        real_mel = real_mel[0].cuda()
        with torch.no_grad():
            vocoded_audio_real = vocoder.convert_spectrogram_to_audio(spec=real_mel).cpu().numpy()
        print (vidx+1, val_record['text'])
        print(vidx+1, "Ground Truth Audio for speaker:", speaker)
        ipd.display(ipd.Audio(real_wav, rate=44100))
        print(vidx+1, "Ground Truth spectrogram vocoded (HiFiGAN):", speaker)
        ipd.display(ipd.Audio(vocoded_audio_real, rate=44100))

        print ("----------------------")

        # Synthesize the same text with each fine-tuned model, in the order
        # the checkpoints are listed in finetuning_ckpts.
        for key, spec_model in finetuned_spec_models.items():
            _speaker=None

            print (vidx+1, "Synthesized. Finetuned for {} epochs | ".format(key) , val_record['text'])
            _, audio = infer(spec_model, vocoder, val_record['text'], speaker = _speaker)
            ipd.display(ipd.Audio(audio, rate=44100))
        print ("*******************"*5)
        print ("*******************"*5)