In [1]:
import os

# Expose only GPU index 1 to this process. This is set before TensorFlow is
# imported in a later cell -- presumably so TF only ever sees that device.
# NOTE(review): the index is machine-specific; confirm it matches your host.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [2]:
import sys

# Put the parent of the notebook's working directory at the front of the
# module search path -- presumably so the local `malaya_speech` checkout is
# the one that gets imported (confirm against the repository layout).
# `__file__` does not exist inside a notebook and `__name__` is a module name,
# not a path, so the working directory is used explicitly.
SOURCE_DIR = os.path.dirname(os.getcwd())
sys.path.insert(0, SOURCE_DIR)

In [3]:
import malaya_speech
import malaya_speech.config
from malaya_speech.train.model import fastspeech2
import tensorflow as tf
import numpy as np

In [4]:
# Work on a private (shallow) copy of the library's default FastSpeech2
# settings: assigning into `malaya_speech.config.fastspeech2_config` directly
# would change the shared module-level dict for everything else running in
# this kernel. Only top-level keys are overridden, so a shallow copy suffices.
config = dict(malaya_speech.config.fastspeech2_config)

# Overrides for this run: encoder and decoder share the same sizes.
config.update(
    {
        'encoder_hidden_size': 128,
        'encoder_num_hidden_layers': 2,
        'encoder_intermediate_size': 512,
        'decoder_hidden_size': 128,
        'decoder_num_hidden_layers': 2,
        'decoder_intermediate_size': 512,
        'hidden_dropout_prob': 0.1,
    }
)

In [5]:
# Turn the settings dict into a FastSpeech2 Config and build the model.
# The Config object gets its own name instead of overwriting `config`, so the
# cell can be re-run: `**config` always unpacks the dict, never a Config.
# NOTE(review): 66 is presumably the size of the text-id vocabulary -- confirm.
model_config = fastspeech2.Config(vocab_size = 66, **config)
model = fastspeech2.Model(model_config)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [6]:
# Graph inputs. Later cells feed `i` with `data['text_ids']` and `lens` with
# the duration array `d`; both are 2-D integer tensors.
i = tf.placeholder(tf.int32, shape = (None, None))
lens = tf.placeholder(tf.int32, shape = (None, None))

# Target mel spectrogram (last axis fixed at 80) and its per-sample length.
mel_outputs = tf.placeholder(tf.float32, shape = (None, None, 80))
mel_lengths = tf.placeholder(tf.int32, shape = (None,))

# Energy targets with their per-sample lengths.
energies = tf.placeholder(tf.float32, shape = (None, None))
energies_lengths = tf.placeholder(tf.int32, shape = (None,))

# f0 targets with their per-sample lengths.
f0s = tf.placeholder(tf.float32, shape = (None, None))
f0s_lengths = tf.placeholder(tf.int32, shape = (None,))

In [7]:
# Build the training graph; the model returns five tensors in this order.
(
    mel_before,
    mel_after,
    duration_outputs,
    f0_outputs,
    energy_outputs,
) = model(i, lens, f0s, energies, training = True)


Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [8]:
mel_before, mel_after, duration_outputs, f0_outputs, energy_outputs

(<tf.Tensor 'model/mel_before/BiasAdd:0' shape=(?, ?, 80) dtype=float32>,
 <tf.Tensor 'model/add_3:0' shape=(?, ?, 80) dtype=float32>,
 <tf.Tensor 'model/duration_predictor/Squeeze:0' shape=(?, ?) dtype=float32>,
 <tf.Tensor 'model/f0_predictor/Squeeze:0' shape=(?, ?) dtype=float32>,
 <tf.Tensor 'model/energy_predictor/Squeeze:0' shape=(?, ?) dtype=float32>)

In [9]:
# Shorthand for TF1's mean-squared-error loss; the duration loss calls it
# directly and the mel / energy / f0 losses wrap it with `partial` to add a mask.
loss_f = tf.losses.mean_squared_error

In [10]:
# The duration target is log(lens + 1); the +1 keeps zero entries finite.
log_duration = tf.math.log(tf.cast(tf.math.add(lens, 1), tf.float32))

# Plain (unmasked) MSE between the log-duration target and the prediction.
duration_loss = loss_f(labels = log_duration, predictions = duration_outputs)

In [11]:
# Longest mel sequence in the batch sets the width of the mask.
max_length = tf.cast(tf.reduce_max(mel_lengths), tf.int32)

# 1.0 for positions inside each sequence's length, 0.0 for padding; a trailing
# axis of size 1 is appended so the mask has the same rank as the mel tensors.
mask = tf.expand_dims(
    tf.sequence_mask(mel_lengths, maxlen = max_length, dtype = tf.float32),
    axis = -1,
)
mask

<tf.Tensor 'ExpandDims:0' shape=(?, ?, 1) dtype=float32>

In [12]:
from functools import partial
from malaya_speech.train.loss import calculate_2d_loss, calculate_3d_loss

# MSE with the mel padding mask pre-bound as `weights`. `partial` captures the
# tensor that `mask` refers to right now, not whatever `mask` is rebound to later.
mse_mel = partial(loss_f, weights = mask)

In [13]:
# Masked MSE between the target mel and the `mel_before` model output.
# NOTE(review): `calculate_3d_loss` is a project helper whose internals are not
# shown here -- presumably it aligns the time axes before calling the loss; confirm.
mel_loss_before = calculate_3d_loss(mel_outputs, mel_before, mse_mel)
mel_loss_before

<tf.Tensor 'mean_squared_error_1/value:0' shape=() dtype=float32>

In [14]:
# Same masked MSE as for `mel_before`, applied to the `mel_after` model output.
# NOTE(review): behaviour of `calculate_3d_loss` is not visible in this notebook.
mel_loss_after = calculate_3d_loss(mel_outputs, mel_after, mse_mel)
mel_loss_after

<tf.Tensor 'mean_squared_error_2/value:0' shape=() dtype=float32>

In [15]:
# Padding mask for the energy targets. Energy-specific names are used so this
# cell no longer overwrites the `mask` / `max_length` built for the mel loss;
# re-running the mel cells afterwards therefore still picks up the mel mask.
energies_max_length = tf.cast(tf.reduce_max(energies_lengths), tf.int32)
energies_mask = tf.sequence_mask(
    lengths = energies_lengths,
    maxlen = energies_max_length,
    dtype = tf.float32,
)

# MSE with the energy mask bound as `weights`.
energies_mel = partial(loss_f, weights = energies_mask)

# NOTE(review): `calculate_2d_loss` is a project helper not shown here.
energies_loss = calculate_2d_loss(energies, energy_outputs, energies_mel)

In [16]:
# Padding mask for the f0 targets. f0-specific names are used so this cell
# neither overwrites the mel `mask` / `max_length` nor rebinds `energies_mel`
# (which belongs to the energy loss) to an f0-masked function.
f0s_max_length = tf.cast(tf.reduce_max(f0s_lengths), tf.int32)
f0s_mask = tf.sequence_mask(
    lengths = f0s_lengths,
    maxlen = f0s_max_length,
    dtype = tf.float32,
)

# MSE with the f0 mask bound as `weights`.
f0s_mse = partial(loss_f, weights = f0s_mask)

# NOTE(review): `calculate_2d_loss` is a project helper not shown here.
f0s_loss = calculate_2d_loss(f0s, f0_outputs, f0s_mse)

In [17]:
# Total objective: unweighted sum of the five loss terms.
loss = (
    duration_loss
    + mel_loss_before
    + mel_loss_after
    + energies_loss
    + f0s_loss
)

In [18]:
# Open an interactive TF1 session and run the global variable initializer so
# the freshly built model can be evaluated in the cells below.
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

In [19]:
import pickle

# NOTE(review): unpickling can execute arbitrary code embedded in the file --
# only load `dataset-mel.pkl` if it comes from a trusted source.
# The pickle holds a 2-tuple: `data` (a dict of features) and `d` (an integer
# array that later cells use as durations).
with open('dataset-mel.pkl', 'rb') as fopen:
    data, d = pickle.load(fopen)

In [20]:
data.keys()

dict_keys(['mel', 'text_ids', 'len_mel', 'len_text_ids', 'stop_token_target', 'f0', 'len_f0', 'energy', 'len_energy', 'g'])

In [21]:
d

array([ 0,  5,  5,  3,  6,  7,  6,  1,  2, 14,  4,  0,  0,  0, 12,  0, 10,
       10, 13, 14,  1,  6,  7,  5,  3,  6,  1,  4, 12,  3, 11,  7,  5,  3,
        0,  2,  0,  3,  8,  4, 13, 12,  8,  7,  0,  0,  3,  2,  5, 11,  5,
        7,  8, 14, 11,  6,  4,  9,  6,  7,  1,  6,  5,  5,  6,  7, 12,  8,
        6,  2,  3,  6])

In [22]:
def average_by_duration(x, durs):
    """
    Average a frame-level signal over the frames assigned to each token.

    Parameters
    ----------
    x : 1-D array-like of float
        Frame-level values (the notebook passes f0 and energy tracks).
    durs : 1-D array-like of int
        Number of consecutive frames that belong to each token.

    Returns
    -------
    numpy.ndarray, dtype float32, shape (len(durs),)
        For every token, the mean of its non-zero frames; 0.0 when the token
        has no frames or all of its frames are zero.
    """
    x = np.asarray(x)
    durs = np.asarray(durs)

    # Prefix sums of the durations give the [start, end) frame range per token.
    # `mode` is passed explicitly because older NumPy releases require it.
    durs_cum = np.cumsum(np.pad(durs, (1, 0), mode = 'constant'))

    x_char = np.zeros((durs.shape[0],), dtype = np.float32)

    # Walk over every token. Bounding the loop by the total frame count (as the
    # previous version did) silently skipped trailing tokens whenever the sum
    # of durations was smaller than the number of tokens.
    for idx, (start, end) in enumerate(zip(durs_cum[:-1], durs_cum[1:])):
        segment = x[start:end]
        values = segment[np.where(segment != 0.0)[0]]
        # Leave the pre-filled 0.0 for empty selections; np.mean([]) would be NaN.
        if len(values) > 0:
            x_char[idx] = np.mean(values)

    return x_char

In [23]:
# Collapse the frame-level f0 and energy tracks of the first sample to one
# value per entry of the duration array `d`.
f0 = average_by_duration(data['f0'][0], d)
energy = average_by_duration(data['energy'][0], d)

In [24]:
f0.shape, energy.shape, d.shape

((72,), (72,), (72,))

In [25]:
data['text_ids'].shape

(1, 72)

In [26]:
# Forward pass only: evaluate the five model outputs for the single sample.
forward_fetches = [
    mel_before,
    mel_after,
    duration_outputs,
    f0_outputs,
    energy_outputs,
]
forward_feed = {
    i: data['text_ids'],
    lens: [d],
    energies: [energy],
    f0s: [f0],
}
r = sess.run(forward_fetches, feed_dict = forward_feed)

In [27]:
r[0].shape, r[1].shape, r[2].shape, r[3].shape, r[4].shape

((1, 408, 80), (1, 408, 80), (1, 72), (1, 72), (1, 72))

In [31]:
%%time

# Evaluate the five individual loss terms for the single sample; targets and
# their lengths are fed in addition to the model inputs.
loss_fetches = [
    duration_loss,
    mel_loss_before,
    mel_loss_after,
    energies_loss,
    f0s_loss,
]
loss_feed = {
    i: data['text_ids'],
    lens: [d],
    mel_outputs: data['mel'],
    mel_lengths: data['len_mel'][0],
    energies: [energy],
    energies_lengths: [len(energy)],
    f0s: [f0],
    f0s_lengths: [len(f0)],
}
r = sess.run(loss_fetches, feed_dict = loss_feed)

CPU times: user 205 ms, sys: 38.8 ms, total: 244 ms
Wall time: 56.4 ms


In [29]:
# Checkpoint the session to `test/model.ckpt` using a Saver created with its
# default arguments; the next cell lists the files this produces.
# NOTE(review): confirm the `test/` directory exists before running this cell.
saver = tf.train.Saver()
saver.save(sess, 'test/model.ckpt')

'test/model.ckpt'

In [30]:
!ls -lh test

total 65960
-rw-r--r--  1 huseinzolkepli  staff    77B Apr 28 23:29 checkpoint
-rw-r--r--  1 huseinzolkepli  staff    31M Apr 28 23:29 model.ckpt.data-00000-of-00001
-rw-r--r--  1 huseinzolkepli  staff   5.4K Apr 28 23:29 model.ckpt.index
-rw-r--r--  1 huseinzolkepli  staff   1.1M Apr 28 23:29 model.ckpt.meta
