## Tacotron2 + WaveGlow inference

#### Import libraries

In [None]:
import os
import sys
sys.path.append('./tacotron2/')

import IPython.display as ipd

import numpy as np
import torch

from hparams import create_hparams
from model import Tacotron2
from text import text_to_sequence

from griffin_lim import GriffinLim
from WaveGlow import WaveGlow

#### Load WaveGlow and Tacotron2 models

In [None]:
def load_waveglow(path):
    """Load a WaveGlow model from a checkpoint file.

    The checkpoint is read as a dict holding the model object under
    ``'model_class'`` and its weights under ``'model'``. Tensors are mapped
    to the CPU while loading, so the checkpoint can be opened regardless of
    the device it was saved from.

    Args:
        path: Path to the checkpoint file.

    Returns:
        The object stored under ``'model_class'`` after the ``'model'``
        state dict has been loaded into it.

    Raises:
        FileNotFoundError: If ``path`` is not an existing regular file.
    """
    # Explicit raise rather than ``assert``: asserts vanish under ``python -O``.
    if not os.path.isfile(path):
        raise FileNotFoundError(
            'WaveGlow checkpoint not found: {}'.format(path))

    # NOTE(review): the checkpoint contains a pickled model object, so loading
    # it executes pickle code -- only open checkpoints from a trusted source.
    param_dict = torch.load(path, map_location='cpu')
    model = param_dict['model_class']
    model.load_state_dict(param_dict['model'])

    return model

In [None]:
def load_tacotron2(hparams, path):
    """Build a Tacotron2 model and load its weights from a checkpoint.

    Args:
        hparams: Hyper-parameter object passed to the ``Tacotron2``
            constructor.
        path: Path to a checkpoint whose ``'state_dict'`` entry holds the
            model weights.

    Returns:
        The ``Tacotron2`` instance with the checkpoint weights loaded.

    Raises:
        FileNotFoundError: If ``path`` is not an existing regular file.
    """
    # Explicit raise rather than ``assert``: asserts vanish under ``python -O``.
    if not os.path.isfile(path):
        raise FileNotFoundError(
            'Tacotron2 checkpoint not found: {}'.format(path))

    model = Tacotron2(hparams)
    # Map tensors to the CPU while loading so a checkpoint saved on a GPU can
    # still be opened on a CPU-only host (same approach as load_waveglow).
    checkpoint = torch.load(path, map_location='cpu')
    model.load_state_dict(checkpoint['state_dict'])

    return model

In [None]:
# Checkpoint locations -- change these to point at your own files.
tacotron2_path = "tacotron2_statedict.pt"
waveglow_path = 'WaveGlow_13000.ckpt'

# Use the GPU when one is available; otherwise fall back to the CPU so the
# later `.to(device)` calls do not fail on a machine without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Sampling rate shared by the rest of the notebook.
sr = 22050

# Default hyper-parameters, with the sampling rate overridden to `sr`.
hparams = create_hparams()
hparams.sampling_rate = sr

# Load the checkpoint, move the model to `device` and put it in eval mode.
tacotron2 = load_tacotron2(hparams, tacotron2_path).to(device).eval()

In [None]:
# Load the WaveGlow checkpoint, move it to `device` and put it in eval mode.
waveglow = load_waveglow(waveglow_path).to(device).eval()

#### GriffinLim

In [None]:
# Griffin-Lim inverter, configured from the notebook's sampling rate `sr`.
griffin_lim = GriffinLim(
    sample_rate=sr,
    # Window and hop given as 1024 and 256 divided by `sr` -- presumably
    # sample counts converted to seconds; the 1e-6 presumably guards against
    # rounding down when converted back to samples (TODO confirm).
    frame_length=1e-6 + 1024 / sr,
    frame_shift=1e-6 + 256 / sr,
    # Spectrogram / mel layout.
    num_frequencies=1025,
    mel_channels=80,
    min_frequency=0,
    max_frequency=8000,
    # Decibel settings and iteration count passed through to GriffinLim.
    ref_db=20,
    min_db=-100,
    num_iter=50,
)

#### Text-to-audio inference function

In [None]:
def text_to_audio(text, sigma=0.5):
    """Synthesize `text` with both vocoders.

    The text is converted to a symbol sequence with the
    ``'english_cleaners'`` cleaner, run through the module-level
    ``tacotron2`` model, and the post-net mel output is vocoded twice: once
    by ``waveglow`` and once by ``griffin_lim``.

    Args:
        text: Input string to synthesize.
        sigma: Value forwarded to ``waveglow.infer``.

    Returns:
        A ``(waveglow_audio, griffin_audio)`` tuple; the first item is a
        NumPy array, the second is whatever
        ``griffin_lim.inv_melspectrogram`` returns.
    """
    symbol_ids = text_to_sequence(text, ['english_cleaners'])
    # Add a leading batch axis of size 1 before handing over to torch.
    batch = np.asarray(symbol_ids)[np.newaxis, :]
    batch = torch.from_numpy(batch).long().to(device)

    with torch.no_grad():
        # Only the post-net mel output (second of four results) is used.
        _, mel_postnet, _, _ = tacotron2.inference(batch)
        vocoded = waveglow.infer(mel_postnet, sigma=sigma)
        waveglow_audio = vocoded[0].detach().cpu().numpy()

    mel_np = mel_postnet[0].detach().cpu().numpy()
    griffin_audio = griffin_lim.inv_melspectrogram(mel_np)

    return waveglow_audio, griffin_audio

#### Synthesize audio from text

In [None]:
# Sentence to synthesize (alternative: "Waveglow is really awesome!").
text = "Nikita is my best friend."

# Run both vocoders on the sentence, with sigma=0.66 for WaveGlow.
waveglow_audio, griffin_audio = text_to_audio(text, sigma=0.66)

In [None]:
# WaveGlow output: audio player for `waveglow_audio` at sampling rate `sr`.
# Kept as the cell's final bare expression so the notebook renders it.
ipd.Audio(waveglow_audio, rate=sr)

In [None]:
# Griffin-Lim output: audio player for `griffin_audio` at sampling rate `sr`.
# Kept as the cell's final bare expression so the notebook renders it.
ipd.Audio(griffin_audio, rate=sr)