In [1]:
# For maths and audio processing
import numpy as np
from scipy.io.wavfile import write

# For audio in notebook
from IPython.display import Audio

In [2]:
import torch

# Load the Tacotron2 and WaveGlow checkpoints onto the CPU.
# Both models come from the same torch.hub repository, and both checkpoints
# carry the DistributedDataParallel "module." key prefix, so they share one
# unwrap helper instead of two copy-pasted dict comprehensions.
HUB_REPO = 'nvidia/DeepLearningExamples:torchhub'
DEVICE = 'cpu'
DDP_PREFIX = "module."


def _unwrap_ddp_state_dict(checkpoint, prefix=DDP_PREFIX):
    """Return ``checkpoint["state_dict"]`` without the DistributedDataParallel prefix.

    Only a *leading* ``prefix`` is removed (``module.layer`` -> ``layer``).
    The same text appearing further inside a key is left untouched, so a
    parameter name cannot be mangled by accident.

    Args:
        checkpoint: Mapping with a ``"state_dict"`` entry (parameter name -> value).
        prefix: Wrapper prefix to drop from the start of each key.

    Returns:
        A new dict; ``checkpoint`` itself is not modified.
    """
    return {
        (key[len(prefix):] if key.startswith(prefix) else key): value
        for key, value in checkpoint["state_dict"].items()
    }


# --- Tacotron2 ---
# pretrained=False: the weights are loaded explicitly from the checkpoint below.
tacotron2 = torch.hub.load(HUB_REPO, 'nvidia_tacotron2', pretrained=False)
checkpoint_tt2 = torch.hub.load_state_dict_from_url(
    'https://api.ngc.nvidia.com/v2/models/nvidia/tacotron2_pyt_ckpt_fp32/versions/19.09.0/files/nvidia_tacotron2pyt_fp32_20190427',
    map_location=DEVICE,
)
state_dict_tt2 = _unwrap_ddp_state_dict(checkpoint_tt2)
tacotron2.load_state_dict(state_dict_tt2)

tacotron2 = tacotron2.to(DEVICE)
tacotron2.eval()

# --- WaveGlow ---
waveglow = torch.hub.load(HUB_REPO, 'nvidia_waveglow', pretrained=False)
checkpoint_wg = torch.hub.load_state_dict_from_url(
    'https://api.ngc.nvidia.com/v2/models/nvidia/waveglow_ckpt_fp32/versions/19.09.0/files/nvidia_waveglowpyt_fp32_20190427',
    map_location=DEVICE,
)
state_dict_wg = _unwrap_ddp_state_dict(checkpoint_wg)
waveglow.load_state_dict(state_dict_wg)

# Weight normalisation is removed only after the checkpoint has been applied;
# the model returned by remove_weightnorm is the one used from here on.
waveglow = waveglow.remove_weightnorm(waveglow)
waveglow = waveglow.to(DEVICE)
waveglow.eval()

print('Models loaded.')

Using cache found in /Users/boj/.cache/torch/hub/nvidia_DeepLearningExamples_torchhub
Using cache found in /Users/boj/.cache/torch/hub/nvidia_DeepLearningExamples_torchhub


Models loaded.


In [5]:
# Sentence to synthesize; a later cell converts it to a token sequence for Tacotron2.
text = "Garfunkel is pleased."

In [7]:
# Encode the text as token ids and add a leading batch axis of size 1.
_token_ids = np.array(tacotron2.text_to_sequence(text, ['english_cleaners']))
sequence = torch.from_numpy(_token_ids[np.newaxis, :]).to(dtype=torch.int64, device='cpu')

# Inference only, so gradient tracking is switched off for both models.
with torch.no_grad():
    _, mel, _, _ = tacotron2.infer(sequence)
    audio = waveglow.infer(mel)

# Take the first (only) item of the batch and write it to disk as a WAV file.
rate = 22050  # sample rate (Hz) handed to the WAV writer
audio_numpy = audio[0].detach().cpu().numpy()
write("audio.wav", rate, audio_numpy)

# Uncomment to play the result inline in the notebook:
# Audio(audio_numpy, rate=rate)

In [53]:
# Scratch check: flatten a multi-line string onto one line.
string_1 = 'kjdbgkbgkjdbg\nkjabfkjbd\nsjngnd.'

# split() with no arguments breaks on any run of whitespace (newlines
# included); joining the pieces with one space gives the single-line form.
_pieces = string_1.split()
string_2 = ' '.join(_pieces)

print(string_2)

kjdbgkbgkjdbg kjabfkjbd sjngnd.
