In [1]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [2]:
import sys

SOURCE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__name__)))
sys.path.insert(0, SOURCE_DIR)

In [3]:
import pickle

metadata = pickle.load(open('metadata.pkl', "rb"))

In [4]:
import numpy as np
from math import ceil
import tensorflow as tf
from malaya_speech.train.model import fastspeech, fastvc
import malaya_speech






The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.





In [5]:
def pad_seq(x, base = 32):
    len_out = int(base * ceil(float(x.shape[0])/base))
    len_pad = len_out - x.shape[0]
    assert len_pad >= 0
    return np.pad(x, ((0,len_pad),(0,0)), 'constant'), len_pad

In [6]:
sbmt_i = metadata[0]
x_org = sbmt_i[2]
x_org, len_pad = pad_seq(x_org)
c = np.random.normal(size = (2, 512)).astype(np.float32)

In [7]:
mel = tf.placeholder(tf.float32, (None, None, 80))
ori_vector = tf.placeholder(tf.float32, (None, 512))
target_vector = tf.placeholder(tf.float32, (None, 512))
mel_lengths = tf.placeholder(tf.int32, (None))

In [8]:
config = malaya_speech.config.fastspeech_config
config['encoder_hidden_size'] = 256
config = fastspeech.Config(vocab_size = 66, **config)

In [9]:
model = fastvc.model.Model(config, 32)


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [10]:
encoder_outputs, mel_before, mel_after, codes = model(mel, ori_vector, target_vector, mel_lengths)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [11]:
codes_ = model.call_second(mel_after, ori_vector, mel_lengths)

In [12]:
encoder_outputs, mel_before, mel_after, codes, codes_

(<tf.Tensor 'fastvc/concat:0' shape=(?, ?, 1024) dtype=float32>,
 <tf.Tensor 'fastvc/mel_before/BiasAdd:0' shape=(?, ?, 80) dtype=float32>,
 <tf.Tensor 'fastvc/add:0' shape=(?, ?, 80) dtype=float32>,
 <tf.Tensor 'fastvc/Encoder/TensorArrayStack/TensorArrayGatherV3:0' shape=(?, ?, 512) dtype=float32>,
 <tf.Tensor 'Encoder/TensorArrayStack/TensorArrayGatherV3:0' shape=(?, ?, 512) dtype=float32>)

In [13]:
loss_f = tf.losses.mean_squared_error
max_length = tf.cast(tf.reduce_max(mel_lengths), tf.int32)
mask = tf.sequence_mask(
    lengths = mel_lengths, maxlen = max_length, dtype = tf.float32
)
mask = tf.expand_dims(mask, axis = -1)
mask

<tf.Tensor 'ExpandDims:0' shape=<unknown> dtype=float32>

In [14]:
from functools import partial
from malaya_speech.train.loss import calculate_2d_loss, calculate_3d_loss

mse_mel = partial(
    loss_f,
    weights = mask
)

In [15]:
mel_loss_before = calculate_3d_loss(mel, mel_before, mse_mel)
mel_loss_before

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


<tf.Tensor 'mean_squared_error/value:0' shape=() dtype=float32>

In [16]:
mel_loss_after = calculate_3d_loss(mel, mel_after, mse_mel)
mel_loss_after

<tf.Tensor 'mean_squared_error_1/value:0' shape=() dtype=float32>

In [17]:
g_loss_cd = tf.losses.absolute_difference(codes, codes_)

In [18]:
loss = mel_loss_before + mel_loss_after + g_loss_cd

In [19]:
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

In [21]:
o = sess.run([encoder_outputs, mel_before, mel_after], feed_dict = {mel: [x_org] * 2,
                                                                    ori_vector: c,
                                                                    target_vector: c,
                                                                    mel_lengths: [len(x_org)] * 2})

In [22]:
o[0].shape, o[1].shape, o[2].shape

((2, 96, 1024), (2, 96, 80), (2, 96, 80))

In [26]:
o = sess.run([mel_loss_before, mel_loss_after, g_loss_cd], feed_dict = {mel: [x_org] * 2,
                                                                    ori_vector: c,
                                                                    target_vector: c,
                                                                    mel_lengths: [len(x_org)] * 2})

In [27]:
o

[1.9449615, 3.0345724, 0.5287206]

In [25]:
tf.trainable_variables()

[<tf.Variable 'fastvc/Encoder/dense/kernel:0' shape=(592, 256) dtype=float32>,
 <tf.Variable 'fastvc/Encoder/dense/bias:0' shape=(256,) dtype=float32>,
 <tf.Variable 'fastvc/Encoder/encoder/layer_._0/attention/self/query/kernel:0' shape=(256, 384) dtype=float32>,
 <tf.Variable 'fastvc/Encoder/encoder/layer_._0/attention/self/query/bias:0' shape=(384,) dtype=float32>,
 <tf.Variable 'fastvc/Encoder/encoder/layer_._0/attention/self/key/kernel:0' shape=(256, 384) dtype=float32>,
 <tf.Variable 'fastvc/Encoder/encoder/layer_._0/attention/self/key/bias:0' shape=(384,) dtype=float32>,
 <tf.Variable 'fastvc/Encoder/encoder/layer_._0/attention/self/value/kernel:0' shape=(256, 384) dtype=float32>,
 <tf.Variable 'fastvc/Encoder/encoder/layer_._0/attention/self/value/bias:0' shape=(384,) dtype=float32>,
 <tf.Variable 'fastvc/Encoder/encoder/layer_._0/attention/output/dense/kernel:0' shape=(384, 256) dtype=float32>,
 <tf.Variable 'fastvc/Encoder/encoder/layer_._0/attention/output/dense/bias:0' shape