In [1]:
import io
import numpy as np
import tensorflow as tf
from hparams import hparams
from librosa import effects
from models import create_model
from text import text_to_sequence
from util import audio
import time

In [2]:
class Synthesizer:
  """Text-to-speech wrapper around a model built by `create_model`.

  Usage: call `load` once to build the graph and restore a checkpoint, then
  call `synthesize` for each piece of text. `synthesize` relies on the
  attributes that `load` sets (`model`, `wav_output`, `session`).
  """

  def load(self, checkpoint_path, model_name='tacotron'):
    """Build the inference graph and restore weights from a checkpoint.

    Args:
      checkpoint_path: Path handed to `tf.train.Saver.restore`.
      model_name: Name handed to `create_model` (default 'tacotron').
    """
    print('Constructing model: %s' % model_name)
    # Batch dimension is fixed at 1; the sequence dimension is left dynamic.
    inputs = tf.placeholder(tf.int32, [1, None], 'inputs')
    input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths')
    with tf.variable_scope('model'):
      self.model = create_model(model_name, hparams)
      self.model.initialize(inputs, input_lengths)
      # The spectrogram-to-waveform step is added as a graph op on the first
      # (only) batch element, so a single session.run yields the audio.
      self.wav_output = audio.inv_spectrogram_tensorflow(self.model.linear_outputs[0])

    print('Loading checkpoint: %s' % checkpoint_path)
    self.session = tf.Session()
    # Initialize everything first, then overwrite with the checkpoint values.
    self.session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(self.session, checkpoint_path)


  @staticmethod
  def _report_elapsed(label, started_at):
    """Print the seconds elapsed since `started_at` under the given label."""
    print("{} {:.2f}s/it".format(label, time.perf_counter() - started_at))

  def synthesize(self, text, out_path='./1.wav'):
    """Synthesize `text`, print per-stage timings, and return the waveform.

    Args:
      text: Input string to synthesize.
      out_path: Destination passed to `audio.save_wav`. Defaults to './1.wav',
        the location that used to be hard-coded, so existing callers behave
        as before. Pass None to skip writing a file.

    Returns:
      The waveform produced by the session after `audio.inv_preemphasis`,
      truncated at the index reported by `audio.find_endpoint`.
    """
    # Stage 1: text -> id sequence and feed dict.
    start = time.perf_counter()
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
      self.model.inputs: [np.asarray(seq, dtype=np.int32)],
      self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
    }
    self._report_elapsed("inference prep time", start)

    # Stage 2: run the graph.
    start = time.perf_counter()
    wav = self.session.run(self.wav_output, feed_dict=feed_dict)
    self._report_elapsed("inference time", start)

    # Stage 3: post-process the raw waveform.
    start = time.perf_counter()
    wav = audio.inv_preemphasis(wav)
    wav = wav[:audio.find_endpoint(wav)]
    self._report_elapsed("inference post proc time", start)

    # Saving happens outside the timed stages, as before.
    if out_path is not None:
      audio.save_wav(wav, out_path)
    return wav

CPU times: user 15 µs, sys: 4 µs, total: 19 µs
Wall time: 22.2 µs


In [3]:
# Sample sentence. It is not consumed in this cell; every later cell shown
# here assigns `te` again before calling synthesize.
te = 'Another world, another time. This land was green and good... Until the crystal cracked'
# Build the graph and restore the checkpoint once; the cells below reuse `synth`.
# model_name is left at its default ('tacotron').
synth = Synthesizer()
synth.load('./models/tacotron-20180906/model.ckpt')

Constructing model: tacotron
Initialized Tacotron model. Dimensions: 
  embedding:               256
  prenet out:              128
  encoder out:             256
  attention out:           256
  concat attn & out:       512
  decoder cell out:        256
  decoder out (5 frames):  400
  decoder out (1 frame):   80
  postnet out:             256
  linear out:              1025
Loading checkpoint: ./models/tacotron-20180906/model.ckpt
INFO:tensorflow:Restoring parameters from ./models/tacotron-20180906/model.ckpt


In [4]:
# First synthesize call after loading the checkpoint.
# Besides returning the waveform, synthesize prints per-stage timings and
# writes the audio to ./1.wav.
te = 'Another world, another time. This land was green and good... Until the crystal cracked'
speech = synth.synthesize(te)

inference prep time 0.00s/it
inference time 6.40s/it
inference post proc time 0.01s/it
CPU times: user 7.05 s, sys: 843 ms, total: 7.89 s
Wall time: 6.41 s


In [5]:
# Second call on the same loaded `synth`, with a different sentence than In [4].
# The misspelling in the text is part of the input fed to the model.
te = 'This is one very difficult sentense for me to pronounce, but I am trying my best, yo'
speech = synth.synthesize(te)

inference prep time 0.00s/it
inference time 1.47s/it
inference post proc time 0.00s/it
CPU times: user 2.47 s, sys: 593 ms, total: 3.06 s
Wall time: 1.48 s


In [6]:
# Same sentence as In [4], synthesized again on the already-used session
# (presumably to compare against the first-call timing).
te = 'Another world, another time. This land was green and good... Until the crystal cracked'
speech = synth.synthesize(te)

inference prep time 0.00s/it
inference time 0.89s/it
inference post proc time 0.00s/it
CPU times: user 2.14 s, sys: 321 ms, total: 2.46 s
Wall time: 896 ms


In [7]:
# Same sentence as In [5], synthesized again on the already-used session.
te = 'This is one very difficult sentense for me to pronounce, but I am trying my best, yo'
speech = synth.synthesize(te)

inference prep time 0.00s/it
inference time 0.89s/it
inference post proc time 0.00s/it
CPU times: user 2.06 s, sys: 380 ms, total: 2.44 s
Wall time: 891 ms


In [8]:
# A sentence not used in any earlier cell, run after several prior calls,
# to time synthesis of unseen text once the session has been exercised.
te = 'Now lets see how much time will it take to synthesize something conplemetely different after the system has been warmed up.'
speech = synth.synthesize(te)

inference prep time 0.00s/it
inference time 1.08s/it
inference post proc time 0.00s/it
CPU times: user 2.22 s, sys: 425 ms, total: 2.64 s
Wall time: 1.09 s
