In [None]:
import logging, sys
logging.disable(sys.maxsize)

import IPython.display as ipd
import numpy as np
import torch
import json
import librosa
import os

from nemo.collections.tts.models import FastPitchModel
from nemo.collections.tts.modules.hifigan_modules import Generator as Hifigan_generator
from nemo.collections.asr.modules import AudioToMelSpectrogramPreprocessor
from nemo.collections.asr.parts.preprocessing.features import WaveformFeaturizer
from nemo.collections.tts.models import HifiGanModel


# Pin GPU selection for this notebook: enumerate devices in PCI bus order
# (see issue #152) and expose only device index 1 to this process.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "1",
    }
)

In [None]:
def infer(spec_gen_model, vocoder_model, str_input, speaker=None):
    """Synthesize audio for a text string with a spectrogram generator and a vocoder.

    Args:
        spec_gen_model: model exposing ``parse(text)`` and
            ``generate_spectrogram(tokens=..., speaker=...)``.
        vocoder_model: either a ``Hifigan_generator`` instance (called directly
            on the half-precision spectrogram) or a model exposing
            ``convert_spectrogram_to_audio(spec=...)``.
        str_input: text to synthesize.
        speaker: optional speaker id. When given it is wrapped into a
            one-element long tensor; ``None`` is forwarded unchanged.

    Returns:
        A ``(spectrogram, audio)`` tuple. Tensor results are moved to the CPU
        and converted to NumPy arrays; a 3-D spectrogram is reduced to its
        first item.
    """
    with torch.no_grad():
        tokens = spec_gen_model.parse(str_input)
        if speaker is not None:
            # Put the speaker id on the same device as the parsed tokens rather
            # than unconditionally on CUDA, so the function also works when the
            # model runs on the CPU. If ``parse`` returned something that is not
            # a tensor, keep the previous behaviour (CUDA).
            target_device = tokens.device if isinstance(tokens, torch.Tensor) else "cuda"
            speaker = torch.tensor([speaker]).long().to(target_device)
        spectrogram = spec_gen_model.generate_spectrogram(tokens=tokens, speaker=speaker)
        if isinstance(vocoder_model, Hifigan_generator):
            # The bare generator is fed a half-precision spectrogram; the
            # squeeze drops dimension 1 of its output.
            audio = vocoder_model(x=spectrogram.half()).squeeze(1)
        else:
            audio = vocoder_model.convert_spectrogram_to_audio(spec=spectrogram)

    # Convert results to NumPy for plotting / playback.
    if spectrogram is not None:
        if isinstance(spectrogram, torch.Tensor):
            spectrogram = spectrogram.to('cpu').numpy()
        if len(spectrogram.shape) == 3:
            spectrogram = spectrogram[0]
    if isinstance(audio, torch.Tensor):
        audio = audio.to('cpu').numpy()
    return spectrogram, audio


# Audio loader configured for 44.1 kHz input, without augmentation.
wav_featurizer = WaveformFeaturizer(
    sample_rate=44100,
    int_values=False,
    augmentor=None,
)

# Mel-spectrogram front end used to compute ground-truth spectrograms:
# 80 log-mel features, 2048-sample Hann window, 512-sample hop, 44.1 kHz.
mel_processor = AudioToMelSpectrogramPreprocessor(
    # Sample rate and framing (sizes are given in samples, so the
    # seconds-based window arguments are left unset).
    sample_rate=44100,
    window_size=None,
    window_stride=None,
    n_window_size=2048,
    n_window_stride=512,
    window="hann",
    n_fft=None,
    # Mel filterbank.
    features=80,
    lowfreq=0,
    highfreq=None,
    mag_power=1.0,
    # Log compression.
    log=True,
    log_zero_guard_type="add",
    log_zero_guard_value=1e-05,
    # Signal conditioning switched off.
    normalize=None,
    preemph=None,
    dither=0.0,
    # Padding / splicing.
    pad_to=1,
    pad_value=0,
    frame_splicing=1,
    exact_pad=False,
    stft_exact_pad=False,
    stft_conv=False,
)

In [None]:
def get_synthesis_models(fastpitch_model_path, hifigan_model_path):
    """Load a FastPitch spectrogram model and a HiFi-GAN vocoder from checkpoints.

    Both checkpoint paths are printed first; each model is then loaded, moved
    to CUDA and switched to eval mode.

    Returns:
        A ``(spec_model, vocoder)`` tuple.
    """
    for ckpt_path in (fastpitch_model_path, hifigan_model_path):
        print (ckpt_path)

    loaded_models = []
    checkpoint_plan = (
        (FastPitchModel, fastpitch_model_path),
        (HifiGanModel, hifigan_model_path),
    )
    for model_cls, ckpt_path in checkpoint_plan:
        model = model_cls.load_from_checkpoint(ckpt_path)
        model.cuda().eval()
        loaded_models.append(model)

    return tuple(loaded_models)

def play_validation_samples(spec_model, vocoder, val_manifest_file="/home/pneekhara/JonData/val_list.json"):
    """Play real, re-vocoded and synthetic audio for every record of a manifest.

    The manifest is read as one JSON object per line; each record must provide
    ``audio_filepath`` and ``text``. For every record three clips are displayed:
    the original recording, the vocoder output for the recording's own mel
    spectrogram, and the audio synthesized from the record's text.

    Args:
        spec_model: spectrogram generator forwarded to ``infer``.
        vocoder: vocoder exposing ``convert_spectrogram_to_audio(spec=...)``;
            also forwarded to ``infer``.
        val_manifest_file: path of the JSON-lines manifest. Defaults to the
            location that was previously hard-coded in the function body.
    """
    # Playback rate; matches the sample_rate given to wav_featurizer and
    # mel_processor at module level.
    playback_rate = 44100

    with open(val_manifest_file) as f:
        for line in f:
            # Skip blank and whitespace-only lines; the latter used to reach
            # json.loads and raise a decode error.
            if not line.strip():
                continue
            record = json.loads(line)

            print("Jon's actual voice")
            ipd.display(ipd.Audio(record['audio_filepath']))

            # Mel spectrogram of the real recording, computed with the
            # module-level featurizer / preprocessor.
            real_wav = wav_featurizer.process(record['audio_filepath'])
            real_mel, _ = mel_processor.get_features(
                real_wav[None], torch.tensor([[real_wav.shape[0]]]).long()
            )
            real_mel = real_mel[0].cuda()
            with torch.no_grad():
                vocoded_audio_real = vocoder.convert_spectrogram_to_audio(spec=real_mel).cpu().numpy()
            print("Ground truth spectrogram vocoded (HiFiGAN):")
            ipd.display(ipd.Audio(vocoded_audio_real, rate=playback_rate))

            # Full text-to-speech path for the same sentence (speaker id 0).
            _, audio = infer(spec_model, vocoder, record['text'], speaker=0)
            print("Synthetic audio")
            ipd.display(ipd.Audio(audio, rate=playback_rate))


In [None]:
# Checkpoint locations for the fine-tuned models (absolute local paths; adjust
# for your environment). Two FastPitch and two HiFi-GAN checkpoints are listed.
# NOTE(review): "NoMix" / "Mix" presumably name two fine-tuning variants —
# confirm against the training setup.
fastpitch_nomix_finetuned_ckpt = "/home/pneekhara/JonCkpts/FastPitchNoMixJon.ckpt"
fastpitch_mix_finetuned_ckpt = "/home/pneekhara/JonCkpts/FastPitchMixJon.ckpt"
hifigan_nomix_finetuned_ckpt = "/home/pneekhara/JonCkpts/HifiGanNoMix.ckpt"
hifigan_mix_finetuned_ckpt = "/home/pneekhara/JonCkpts/HifiGanMix.ckpt"

In [None]:
# Load the "nomix" FastPitch checkpoint together with the "mix" HiFi-GAN checkpoint.
# NOTE(review): the spectrogram model and the vocoder come from different
# variants (nomix vs. mix) — confirm this pairing is intended.
spec_model, vocoder = get_synthesis_models(fastpitch_nomix_finetuned_ckpt, hifigan_mix_finetuned_ckpt)

In [None]:
# Synthesize a single demo sentence and play it back at 44.1 kHz.
# NOTE(review): this call uses speaker = 1, whereas play_validation_samples
# calls infer with speaker=0 — confirm which speaker id the checkpoint expects.
text_to_generate = "This is a synthetic voice of Jon generated by a model trained on just fifteen minutes of data."
spec, audio = infer(spec_model, vocoder, text_to_generate, speaker = 1)
ipd.display(ipd.Audio(audio, rate=44100))

In [None]:
# For each record in the validation manifest, play the original recording, the
# vocoded ground-truth spectrogram, and the synthesized audio.
play_validation_samples(spec_model, vocoder)