# TTS Inference

This notebook generates audio samples using either NeMo's pretrained TTS models or TTS models that you have trained yourself with NeMo.

In [20]:
from ipywidgets import Select, HBox, Label
from IPython.display import display

from nemo.collections.tts.models import FastPitchModel
from omegaconf import OmegaConf, open_dict
import torch
from nemo.collections.tts.models.base import SpectrogramGenerator, Vocoder, TextToWaveform

import os
# Configure CUDA device visibility for this kernel before any model touches the GPU.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",   # see issue #152
    "CUDA_VISIBLE_DEVICES": "2",
})

def load_vocoder_model(audio_generator):
    """Instantiate the vocoder identified by ``audio_generator``.

    Args:
        audio_generator: Name of the vocoder to build. Supported values are
            "waveglow", "squeezewave", "uniglow", "melgan", "hifigan" and
            "griffin-lim".

    Returns:
        For "griffin-lim", a ``TwoStagesModel`` constructed from the inline
        configuration below. For every other supported name, the result of
        ``Vocoder.from_pretrained`` for the matching pretrained model name.

    Raises:
        NotImplementedError: If ``audio_generator`` is not a supported name.
    """
    if audio_generator == "griffin-lim":
        # Griffin-Lim is built locally from a config instead of being fetched
        # as a pretrained checkpoint, so this branch returns directly.
        from nemo.collections.tts.models import TwoStagesModel
        cfg = {'linvocoder':  {'_target_': 'nemo.collections.tts.models.two_stages.GriffinLimModel',
                             'cfg': {'n_iters': 64, 'n_fft': 1024, 'l_hop': 256}},
               'mel2spec': {'_target_': 'nemo.collections.tts.models.two_stages.MelPsuedoInverseModel',
                           'cfg': {'sampling_rate': 22050, 'n_fft': 1024,
                                   'mel_fmin': 0, 'mel_fmax': 8000, 'mel_freq': 80}}}
        return TwoStagesModel(cfg)

    # Every pretrained vocoder is restored strictly except WaveGlow.
    strict = True

    # NOTE(review): the per-branch imports are kept from the original even
    # though the imported names are not used directly; presumably they make
    # sure the model class is loaded before from_pretrained runs -- confirm.
    if audio_generator == "waveglow":
        from nemo.collections.tts.models import WaveGlowModel  # noqa: F401
        pretrained_model = "tts_waveglow"
        strict = False
    elif audio_generator == "squeezewave":
        from nemo.collections.tts.models import SqueezeWaveModel  # noqa: F401
        pretrained_model = "tts_squeezewave"
    elif audio_generator == "uniglow":
        from nemo.collections.tts.models import UniGlowModel  # noqa: F401
        pretrained_model = "tts_uniglow"
    elif audio_generator == "melgan":
        from nemo.collections.tts.models import MelGanModel  # noqa: F401
        pretrained_model = "tts_melgan"
    elif audio_generator == "hifigan":
        from nemo.collections.tts.models import HifiGanModel  # noqa: F401
        pretrained_model = "tts_hifigan"
    else:
        raise NotImplementedError(
            f"Unsupported audio_generator {audio_generator!r}; expected one of "
            "'waveglow', 'squeezewave', 'uniglow', 'melgan', 'hifigan', 'griffin-lim'."
        )

    return Vocoder.from_pretrained(pretrained_model, strict=strict)

In [16]:


# Restore the FastPitch spectrogram generator from the checkpoint file, then
# switch it to eval mode and move it onto the CUDA device in one expression.
spec_gen = (
    FastPitchModel
    .load_from_checkpoint("/data1/paarth/Experiments/nemo/rough/FastPitch/2021-07-22_19-37-42/checkpoints/FastPitch--v_loss=0.69-epoch=729-last.ckpt")
    .eval()
    .cuda()
)

[NeMo W 2021-07-24 14:53:53 modelPT:139] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.asr.data.audio_to_text.AudioToCharWithPriorAndPitchDataset
      manifest_filepath: /data1/paarth/nemo_filelists/lj_train_nemo.txt
      max_duration: null
      min_duration: 0.1
      int_values: false
      normalize: true
      sample_rate: 22050
      trim: false
      sup_data_path: /data1/paarth/Datasets/PriorsLJ
      n_window_stride: 256
      n_window_size: 1024
      pitch_fmin: 80
      pitch_fmax: 640
      pitch_avg: 211.27540199742586
      pitch_std: 52.1851002822779
      vocab:
        notation: phonemes
        punct: true
        spaces: true
        stresses: true
        add_blank_at: None
        pad_with_space: true
        chars: true
        improved_version_g2p: true
    dataloader_params

[NeMo I 2021-07-24 14:53:54 features:252] PADDING: 1
[NeMo I 2021-07-24 14:53:54 features:269] STFT using torch
[NeMo I 2021-07-24 14:53:55 cloud:56] Found existing object /home/paarth/.cache/torch/NeMo/NeMo_1.0.2/tts_hifigan/e6da322f0f7e7dcf3f1900a9229a7e69/tts_hifigan.nemo.
[NeMo I 2021-07-24 14:53:55 cloud:62] Re-using file from: /home/paarth/.cache/torch/NeMo/NeMo_1.0.2/tts_hifigan/e6da322f0f7e7dcf3f1900a9229a7e69/tts_hifigan.nemo
[NeMo I 2021-07-24 14:53:55 common:676] Instantiating model from pre-trained checkpoint


[NeMo W 2021-07-24 14:53:59 modelPT:139] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.tts.data.datalayers.MelAudioDataset
      manifest_filepath: /home/fkreuk/data/train_finetune.txt
      min_duration: 0.75
      n_segments: 8192
    dataloader_params:
      drop_last: false
      shuffle: true
      batch_size: 64
      num_workers: 4
    
[NeMo W 2021-07-24 14:53:59 modelPT:146] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    dataset:
      _target_: nemo.collections.tts.data.datalayers.MelAudioDataset
      manifest_filepath: /home/fkreuk/data/val_finetune.txt
      min_duration: 3
      n_segments: 66150


[NeMo I 2021-07-24 14:53:59 features:252] PADDING: 0
[NeMo I 2021-07-24 14:53:59 features:269] STFT using torch


[NeMo W 2021-07-24 14:53:59 features:230] Using torch_stft is deprecated and will be removed in 1.1.0. Please set stft_conv and stft_exact_pad to False for FilterbankFeatures and AudioToMelSpectrogramPreprocessor. Please set exact_pad to True as needed.


[NeMo I 2021-07-24 14:53:59 features:252] PADDING: 0
[NeMo I 2021-07-24 14:53:59 features:269] STFT using torch
[NeMo I 2021-07-24 14:54:01 modelPT:439] Model HifiGanModel was successfully restored from /home/paarth/.cache/torch/NeMo/NeMo_1.0.2/tts_hifigan/e6da322f0f7e7dcf3f1900a9229a7e69/tts_hifigan.nemo.


In [33]:
# Build the HiFi-GAN vocoder, then put it in eval mode on the CUDA device.
vocoder = load_vocoder_model("hifigan")
vocoder = vocoder.eval().cuda()

[NeMo I 2021-07-24 14:57:26 cloud:56] Found existing object /home/paarth/.cache/torch/NeMo/NeMo_1.0.2/tts_hifigan/e6da322f0f7e7dcf3f1900a9229a7e69/tts_hifigan.nemo.
[NeMo I 2021-07-24 14:57:26 cloud:62] Re-using file from: /home/paarth/.cache/torch/NeMo/NeMo_1.0.2/tts_hifigan/e6da322f0f7e7dcf3f1900a9229a7e69/tts_hifigan.nemo
[NeMo I 2021-07-24 14:57:26 common:676] Instantiating model from pre-trained checkpoint


[NeMo W 2021-07-24 14:57:30 modelPT:139] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.tts.data.datalayers.MelAudioDataset
      manifest_filepath: /home/fkreuk/data/train_finetune.txt
      min_duration: 0.75
      n_segments: 8192
    dataloader_params:
      drop_last: false
      shuffle: true
      batch_size: 64
      num_workers: 4
    
[NeMo W 2021-07-24 14:57:30 modelPT:146] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    dataset:
      _target_: nemo.collections.tts.data.datalayers.MelAudioDataset
      manifest_filepath: /home/fkreuk/data/val_finetune.txt
      min_duration: 3
      n_segments: 66150


[NeMo I 2021-07-24 14:57:30 features:252] PADDING: 0
[NeMo I 2021-07-24 14:57:30 features:269] STFT using torch


[NeMo W 2021-07-24 14:57:30 features:230] Using torch_stft is deprecated and will be removed in 1.1.0. Please set stft_conv and stft_exact_pad to False for FilterbankFeatures and AudioToMelSpectrogramPreprocessor. Please set exact_pad to True as needed.


[NeMo I 2021-07-24 14:57:30 features:252] PADDING: 0
[NeMo I 2021-07-24 14:57:30 features:269] STFT using torch
[NeMo I 2021-07-24 14:57:32 modelPT:439] Model HifiGanModel was successfully restored from /home/paarth/.cache/torch/NeMo/NeMo_1.0.2/tts_hifigan/e6da322f0f7e7dcf3f1900a9229a7e69/tts_hifigan.nemo.


In [34]:
def infer(emodel, spec_gen_model, vocder_model, str_input):
    """Synthesize audio for ``str_input`` with either one or two models.

    Args:
        emodel: Text-to-waveform model, or ``None`` to use the two-stage path.
            When given, it parses the text and produces the audio itself.
        spec_gen_model: Spectrogram generator used when ``emodel`` is ``None``;
            it parses the text and generates the spectrogram.
        vocder_model: Vocoder used when ``emodel`` is ``None``; it converts the
            generated spectrogram to audio. (The parameter name is kept as-is
            for backward compatibility with keyword callers.)
        str_input: Text to synthesize.

    Returns:
        A ``(spectrogram, audio)`` tuple. ``spectrogram`` is ``None`` on the
        ``emodel`` path. Any ``torch.Tensor`` result is moved to the CPU and
        converted to a NumPy array; a 3-dimensional spectrogram is reduced to
        its first element along the leading axis.

    Raises:
        ValueError: If ``emodel`` is ``None`` and either ``spec_gen_model`` or
            ``vocder_model`` is also ``None``.
    """
    if emodel is None and (spec_gen_model is None or vocder_model is None):
        raise ValueError(
            "infer() needs either emodel, or both spec_gen_model and vocder_model."
        )

    # Pick the parser with an explicit None test so it matches the branch
    # condition below instead of depending on the model's truthiness.
    parser_model = emodel if emodel is not None else spec_gen_model

    spectrogram = None
    with torch.no_grad():
        parsed = parser_model.parse(str_input)
        if emodel is None:
            # Use the models passed in as arguments, not notebook-level globals.
            spectrogram = spec_gen_model.generate_spectrogram(tokens=parsed)
            audio = vocder_model.convert_spectrogram_to_audio(spec=spectrogram)
        else:
            audio = emodel.convert_text_to_waveform(tokens=parsed)[0]

    if spectrogram is not None:
        if isinstance(spectrogram, torch.Tensor):
            spectrogram = spectrogram.to('cpu').numpy()
        if len(spectrogram.shape) == 3:
            # Keep only the first entry along the leading axis.
            spectrogram = spectrogram[0]
    if isinstance(audio, torch.Tensor):
        audio = audio.to('cpu').numpy()
    return spectrogram, audio

Now that everything is set up, let's provide the text that we want our models to speak.

In [35]:
# Text to synthesize; passing emodel=None selects the spectrogram-generator + vocoder path.
text_to_generate = "This is a natural sounding text to speech synthesis model."
spec, audio = infer(
    emodel=None,
    spec_gen_model=spec_gen,
    vocder_model=vocoder,
    str_input=text_to_generate,
)

# Results

After our model generates the audio, let's play it. If a spectrogram generator was used, we can also visualize the spectrogram produced by the first-stage model.

In [36]:
import IPython.display as ipd
import numpy as np
from PIL import Image
from matplotlib.pyplot import imshow
from matplotlib import pyplot as plt

# Render an audio player for the synthesized waveform at a 22050 Hz sample rate.
ipd.Audio(data=audio, rate=22050)

In [None]:
%matplotlib inline
# A spectrogram only exists when a spectrogram generator was used; skip the plot otherwise.
if spec is not None:
    plt.imshow(spec, origin="lower")  # origin="lower" puts row 0 at the bottom of the image
    plt.show()