In [56]:
from collections import Counter

import IPython.display as ipd
import matplotlib.pylab as plt
import numpy as np
import torch
from scipy.io.wavfile import write

from train import load_checkpoint
from glow import WaveGlow
from tacotron2.model import Tacotron2
from tacotron2.hparams import create_hparams
from tacotron2.train import load_model as load_tacotron
from tacotron2.text import text_to_sequence

#### Models trained on roughly 20 minutes of audio recordings of my own voice, with a limited vocabulary

In [89]:
# Tally how often each word occurs in the phrase list, most frequent first.
with open("data/phrases.txt", encoding="utf8") as fh:
    txt = fh.readlines()

# Punctuation removed from every token before it is counted.
_punct_table = str.maketrans('', '', '.,?')

word_count = Counter()
for sent in txt:
    # split() with no argument splits on any run of whitespace, so repeated
    # spaces, blank lines and the trailing newline never produce empty tokens.
    for token in sent.split():
        word = token.translate(_punct_table)
        if word:  # a token made only of punctuation is not a word
            word_count[word] += 1

# Convert to a plain dict ordered by descending count; words with equal
# counts keep the order in which they were first seen.
word_count = dict(word_count.most_common())
print(len(word_count))
word_count

578


{'you': 103,
 'to': 70,
 'the': 69,
 'I': 64,
 'is': 55,
 'a': 48,
 'What': 33,
 'are': 29,
 'door': 27,
 'have': 26,
 'in': 22,
 'open': 20,
 'do': 18,
 'it': 17,
 'of': 17,
 'there': 17,
 'me': 16,
 'He': 15,
 'for': 14,
 "I'm": 14,
 'get': 14,
 'not': 13,
 'your': 13,
 'this': 11,
 'he': 11,
 'How': 11,
 'It': 11,
 'time': 11,
 'him': 11,
 'my': 10,
 'as': 10,
 'be': 10,
 'You': 10,
 'opened': 9,
 'book': 8,
 'that': 8,
 'at': 8,
 'When': 8,
 'come': 8,
 'pencils': 7,
 'The': 7,
 'would': 7,
 'we': 7,
 "It's": 7,
 'like': 7,
 'car': 7,
 'on': 7,
 'opening': 7,
 'tell': 7,
 'go': 7,
 'This': 6,
 'Is': 6,
 'Are': 6,
 'eat': 6,
 'said': 6,
 'many': 6,
 'There': 6,
 'few': 6,
 "won't": 6,
 'Have': 6,
 'about': 6,
 'Could': 6,
 'can': 6,
 'than': 6,
 'books': 5,
 'good': 5,
 'very': 5,
 'girl': 5,
 'what': 5,
 'Where': 5,
 'Do': 5,
 'any': 5,
 'has': 5,
 'see': 5,
 'We': 5,
 "don't": 5,
 'did': 5,
 'Get': 5,
 'English': 5,
 'all': 5,
 'pencil': 4,
 'these': 4,
 'going': 4,
 'To': 4,
 'ou

In [57]:
# Audio sample rate in Hz; copied onto the Tacotron hparams and used for playback.
SAMPLING_RATE = 44_100

# Checkpoint files for the two trained networks.
TACOTRON_MODEL = "models/mbailey_tt.pt"
WAVEGLOW_MODEL = "models/mbailey_wg.pt"

In [58]:

# Build the Tacotron model from default hyperparameters (with the sample rate
# overridden), restore the trained weights, and prepare it for inference.
hparams = create_hparams()
hparams.sampling_rate = SAMPLING_RATE

tacotron = load_tacotron(hparams)

# Only the 'state_dict' entry of the checkpoint is needed; release it once
# the weights have been copied into the model.
tacotron_weights = torch.load(TACOTRON_MODEL)['state_dict']
tacotron.load_state_dict(tacotron_weights)
del tacotron_weights

# Move to the GPU, switch to eval mode, and convert to half precision.
tacotron.cuda()
tacotron.eval()
_ = tacotron.half()  # assignment keeps the module repr out of the cell output

In [59]:

# Restore the WaveGlow model; the checkpoint holds the model object itself
# under the 'model' key rather than a bare state dict.
waveglow = torch.load(WAVEGLOW_MODEL)['model']

# Move to the GPU, switch to eval mode, and convert to half precision.
waveglow.cuda()
waveglow.eval()
waveglow.half()

# Cast every entry of waveglow.convinv back to float32 after the half()
# conversion above.
for conv_layer in waveglow.convinv:
    conv_layer.float()
In [103]:
# Synthesize one sentence: text -> symbol IDs -> mel spectrogram -> waveform.
text = "What time is it?"

# Encode the text and add a leading batch axis of size 1 with [None, :].
sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
# A plain tensor is enough here; the deprecated autograd.Variable wrapper
# is not needed.
sequence = torch.from_numpy(sequence).cuda().long()

# Run BOTH networks without gradient tracking. This is inference only, so
# building an autograd graph for the Tacotron pass would just hold extra
# GPU memory.
with torch.no_grad():
    mel_outputs, mel_outputs_postnet, _, alignments = tacotron.inference(sequence)
    audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)

# Convert the first (only) item of the batch to NumPy once and reuse it for
# playback.
audio_numpy = audio[0].cpu().numpy()
ipd.Audio(audio_numpy, rate=hparams.sampling_rate)