In [1]:
import os

# Set CUDA_VISIBLE_DEVICES to '1'; this happens before TensorFlow is imported
# in a later cell.
os.environ.update({'CUDA_VISIBLE_DEVICES': '1'})

In [2]:
import sys

# Put the parent of the current working directory first on the import path
# (presumably so the local checkout of `malaya_speech` is the one imported).
# The earlier expression went through os.path.abspath(__name__), i.e.
# "<cwd>/__main__", which reached the same directory only by accident;
# os.getcwd() states the intent directly and yields the same path.
SOURCE_DIR = os.path.dirname(os.getcwd())
sys.path.insert(0, SOURCE_DIR)

In [3]:
import tensorflow.compat.v1 as tf

# Switch to TF1-style graph mode: the cells below build placeholders and
# evaluate them through a session.
tf.disable_eager_execution()

In [4]:
import malaya_speech
import malaya_speech.config
from malaya_speech.train.model import fastspeech2
from malaya_speech.train.model.fastspeech2 import model_stochastic
import numpy as np

In [5]:
# Start from the library's default FastSpeech2 hyper-parameters, but work on a
# (shallow) copy: assigning into `malaya_speech.config.fastspeech2_config`
# itself would change the defaults for everything else in this kernel.
config = dict(malaya_speech.config.fastspeech2_config)

# Overrides for the encoder/decoder sizes and dropout used by this test model.
config.update({
    'encoder_hidden_size': 128,
    'encoder_num_hidden_layers': 2,
    'encoder_intermediate_size': 512,
    'decoder_hidden_size': 128,
    'decoder_num_hidden_layers': 2,
    'decoder_intermediate_size': 512,
    'hidden_dropout_prob': 0.1,
})

In [6]:
# Build the model configuration under its own name so that `config` stays the
# plain hyper-parameter mapping; rebinding `config` to the Config object made
# this cell fail when run a second time (`**config` needs a mapping).
# NOTE(review): vocab_size = 66 is hard-coded — confirm it matches the
# tokenizer used to produce `text_ids`.
model_config = fastspeech2.Config(vocab_size = 66, **config)
model = model_stochastic.Model(model_config)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [7]:
# Graph inputs. Every dimension is left dynamic except the last mel axis (80).
# `i` is later fed with data['text_ids'] and `lens` with [d].
i = tf.placeholder(dtype = tf.int32, shape = [None, None])
lens = tf.placeholder(dtype = tf.int32, shape = [None, None])

# Targets and their per-example lengths, used to build the loss masks.
mel_outputs = tf.placeholder(dtype = tf.float32, shape = [None, None, 80])
mel_lengths = tf.placeholder(dtype = tf.int32, shape = [None])
energies = tf.placeholder(dtype = tf.float32, shape = [None, None])
energies_lengths = tf.placeholder(dtype = tf.int32, shape = [None])
f0s = tf.placeholder(dtype = tf.float32, shape = [None, None])
f0s_lengths = tf.placeholder(dtype = tf.int32, shape = [None])

In [8]:
# Build the training graph; the model returns five tensors.
(
    mel_before,
    mel_after,
    duration_outputs,
    f0_outputs,
    energy_outputs,
) = model(i, lens, f0s, energies, training = True)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [11]:
# Display the symbolic training outputs (names, static shapes, dtypes).
mel_before, mel_after, duration_outputs, f0_outputs, energy_outputs

(<tf.Tensor 'model/mel_before/BiasAdd:0' shape=(?, ?, 80) dtype=float32>,
 <tf.Tensor 'model/add_3:0' shape=(?, ?, 80) dtype=float32>,
 <tf.Tensor 'model/Sum_1:0' shape=() dtype=float32>,
 <tf.Tensor 'model/f0_predictor/Squeeze:0' shape=(?, ?) dtype=float32>,
 <tf.Tensor 'model/energy_predictor/Squeeze:0' shape=(?, ?) dtype=float32>)

In [10]:
# Build the inference graph; only the token-id placeholder `i` is needed.
# NOTE(review): the meaning of the four numeric arguments is not visible in
# this notebook (they look like ratio/scale controls) — confirm against
# Model.inference before changing them.
o = model.inference(i, 1.0, [1.0], [1.0], 1.0)
o

(<tf.Tensor 'mel_before_1/BiasAdd:0' shape=(?, ?, 80) dtype=float32>,
 <tf.Tensor 'add_7:0' shape=(?, ?, 80) dtype=float32>,
 <tf.Tensor 'Cast_3:0' shape=(?, ?) dtype=int32>,
 <tf.Tensor 'mul_7:0' shape=(?, ?) dtype=float32>,
 <tf.Tensor 'mul_8:0' shape=(?, ?) dtype=float32>)

In [12]:
# Base loss function; the mel, energy and f0 terms below wrap it with a mask.
loss_f = tf.losses.mean_squared_error

In [13]:
# The model's third output is used as the duration loss as-is; its repr in the
# training-outputs cell shows a scalar tensor (shape=()).
duration_loss = duration_outputs

In [14]:
# Mask built from `mel_lengths`, with a trailing axis of size 1 added so it
# can be used as weights against the 3-D mel tensors.
max_length = tf.cast(tf.reduce_max(mel_lengths), dtype = tf.int32)
mask = tf.expand_dims(
    tf.sequence_mask(mel_lengths, maxlen = max_length, dtype = tf.float32),
    axis = -1,
)
mask

<tf.Tensor 'ExpandDims_12:0' shape=(?, ?, 1) dtype=float32>

In [15]:
from functools import partial
from malaya_speech.train import loss

# Mean squared error with the mel mask pre-bound as `weights`.
mse_mel = partial(loss_f, weights = mask)

In [16]:
# Masked MSE between the target mel placeholder and the `mel_before` output,
# computed through loss.calculate_3d_loss (helper not shown in this notebook).
mel_loss_before = loss.calculate_3d_loss(mel_outputs, mel_before, mse_mel)
mel_loss_before

<tf.Tensor 'mean_squared_error/value:0' shape=() dtype=float32>

In [17]:
# Same masked MSE, this time against the `mel_after` output.
mel_loss_after = loss.calculate_3d_loss(mel_outputs, mel_after, mse_mel)
mel_loss_after

<tf.Tensor 'mean_squared_error_1/value:0' shape=() dtype=float32>

In [18]:
# Energy loss: MSE weighted by a mask built from `energies_lengths`.
max_length = tf.cast(tf.reduce_max(energies_lengths), dtype = tf.int32)
mask = tf.sequence_mask(energies_lengths, maxlen = max_length, dtype = tf.float32)
energies_mel = partial(loss_f, weights = mask)
energies_loss = loss.calculate_2d_loss(energies, energy_outputs, energies_mel)

In [19]:
# f0 loss: MSE weighted by a mask built from `f0s_lengths`.
max_length = tf.cast(tf.reduce_max(f0s_lengths), tf.int32)
mask = tf.sequence_mask(
    lengths = f0s_lengths, maxlen = max_length, dtype = tf.float32
)
# Bound to its own name: this used to be assigned to `energies_mel`, which
# silently replaced the energy loss function with one carrying the f0 mask.
f0s_mel = partial(
    loss_f,
    weights = mask
)
f0s_loss = loss.calculate_2d_loss(f0s, f0_outputs, f0s_mel)

In [20]:
# Sum of all loss terms. Stored as `total_loss` rather than `loss` so the
# imported `malaya_speech.train.loss` module is not shadowed; with the old
# name, re-running any cell that calls loss.calculate_*_loss raised an
# AttributeError on the tensor.
total_loss = duration_loss + mel_loss_before + mel_loss_after + energies_loss + f0s_loss

In [21]:
# Open a session and initialise every variable in the graph. No training step
# is run in this notebook, so the weights keep their initial values.
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

In [22]:
import pickle

# The pickle holds a 2-tuple: a dict of features and a second array `d`.
# NOTE: unpickling can execute arbitrary code — only load files you trust.
with open('dataset-mel.pkl', 'rb') as fopen:
    payload = pickle.load(fopen)
data, d = payload

In [23]:
# List the keys of the loaded `data` mapping.
data.keys()

dict_keys(['mel', 'text_ids', 'len_mel', 'len_text_ids', 'stop_token_target', 'f0', 'len_f0', 'energy', 'len_energy', 'g'])

In [24]:
# Shape of the second object unpacked from the pickle (fed as `lens` below).
d.shape

(72,)

In [25]:
def average_by_duration(x, durs):
    """Average a frame-level signal over consecutive duration segments.

    Segment ``k`` covers ``durs[k]`` consecutive entries of ``x``, starting
    where segment ``k - 1`` ended. Entries equal to 0.0 are left out of the
    mean; a segment with no non-zero entries (including a zero-length
    segment) yields 0.0.

    Args:
        x: array-like of per-frame values, indexed along its first axis.
        durs: 1-D array-like of non-negative integer frame counts, one per
            segment.

    Returns:
        A float32 array of shape ``(len(durs),)`` holding one value per
        segment.
    """
    x = np.asarray(x)
    durs = np.asarray(durs)
    # Segment boundaries: bounds[k] is the first frame of segment k.
    bounds = np.cumsum(np.pad(durs, (1, 0)))

    x_char = np.zeros((durs.shape[0],), dtype=np.float32)
    # Visit every segment. The loop used to be zipped against
    # range(durs.sum()), which silently skipped trailing segments whenever
    # the total frame count was smaller than the number of segments.
    for idx, (start, end) in enumerate(zip(bounds[:-1], bounds[1:])):
        segment = x[start:end]
        values = segment[np.where(segment != 0.0)[0]]
        # np.mean of an empty selection is NaN, so fall back to 0.0.
        x_char[idx] = np.mean(values) if len(values) > 0 else 0.0

    return x_char.astype(np.float32)

In [26]:
# Reduce the first example's frame-level f0 and energy to one value per entry
# of `d`.
f0 = average_by_duration(data['f0'][0], d)
energy = average_by_duration(data['energy'][0], d)

In [27]:
# Check that the averaged f0 and energy arrays have the same shape as `d`.
f0.shape, energy.shape, d.shape

((72,), (72,), (72,))

In [28]:
# Shape of the token-id array that is fed to placeholder `i`.
data['text_ids'].shape

(1, 72)

In [29]:
# Evaluate the training-graph outputs on the single loaded example.
r = sess.run(
    [mel_before, mel_after, duration_outputs, f0_outputs, energy_outputs],
    feed_dict = {
        i: data['text_ids'],
        lens: [d],
        energies: [energy],
        f0s: [f0],
    },
)

In [30]:
# Shapes of the five evaluated outputs, in fetch order.
tuple(out.shape for out in r)

((1, 408, 80), (1, 408, 80), (), (1, 72), (1, 72))

In [31]:
%%time

# Evaluate every loss term on the single loaded example; the length
# placeholders are what drive the masks.
r = sess.run(
    [duration_loss, mel_loss_before, mel_loss_after, energies_loss, f0s_loss],
    feed_dict = {
        i: data['text_ids'],
        lens: [d],
        mel_outputs: data['mel'],
        mel_lengths: data['len_mel'][0],
        energies: [energy],
        energies_lengths: [len(energy)],
        f0s: [f0],
        f0s_lengths: [len(f0)],
    },
)

CPU times: user 3.11 s, sys: 75.8 ms, total: 3.18 s
Wall time: 2.95 s


In [32]:
%%time

# Run the inference graph; it only needs the token ids.
r = sess.run(o, feed_dict = {i: data['text_ids']})

CPU times: user 1.05 s, sys: 39.6 ms, total: 1.09 s
Wall time: 988 ms


In [33]:
# Display the raw inference outputs.
# NOTE(review): this prints the full arrays; showing only their shapes would
# keep the saved notebook output short.
r

(array([[[ 1.7530556e-01, -2.0076416e+00, -1.5113738e-01, ...,
          -7.6994710e-02,  3.3198804e-01, -1.6002080e+00],
         [ 1.0174407e+00, -1.0283543e-01,  1.3536279e-01, ...,
           3.8780543e-01, -3.6809766e-01, -3.8695681e-01],
         [-7.7233887e-01, -3.4405744e-01, -1.0052812e+00, ...,
          -1.8492843e+00, -4.6884894e-01,  1.0801215e+00],
         ...,
         [-1.2138747e+00, -1.6034847e-03, -6.5687692e-01, ...,
          -9.2790979e-01, -5.3099513e-01,  1.3595790e+00],
         [-1.2276285e+00, -6.2218518e-03, -6.8765569e-01, ...,
          -1.1537774e+00, -4.5435274e-01,  1.2463319e+00],
         [-9.9839664e-01,  2.3969002e-02, -7.1537161e-01, ...,
          -1.0789440e+00, -5.6597364e-01,  1.3471968e+00]]], dtype=float32),
 array([[[ 1.0910996 , -2.1609724 ,  0.43045345, ...,  0.25504762,
          -0.79802865, -1.1828123 ],
         [ 2.2565598 ,  0.0429959 ,  0.7502131 , ...,  1.9181694 ,
          -0.706168  ,  0.9901773 ],
         [-0.17074943, -0.34

In [34]:
# Write a checkpoint of the current variables to test/model.ckpt.
# Create the target directory first so the cell also works on a fresh
# checkout, where `test/` may not exist yet.
os.makedirs('test', exist_ok = True)
saver = tf.train.Saver()
saver.save(sess, 'test/model.ckpt')

'test/model.ckpt'

In [35]:
!ls -lh test

total 93816
-rw-r--r--  1 huseinzolkepli  staff    77B Aug 14 20:06 checkpoint
-rw-r--r--  1 huseinzolkepli  staff    39M Aug 14 20:06 model.ckpt.data-00000-of-00001
-rw-r--r--  1 huseinzolkepli  staff    16K Aug 14 20:06 model.ckpt.index
-rw-r--r--  1 huseinzolkepli  staff   7.2M Aug 14 20:06 model.ckpt.meta
