In [1]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [2]:
import sys

SOURCE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__name__)))
sys.path.insert(0, SOURCE_DIR)

In [3]:
import pickle

metadata = pickle.load(open('metadata.pkl', "rb"))

In [4]:
import numpy as np
from math import ceil
import tensorflow as tf
from malaya_speech.train.model import fastspeech, fastvc2 as fastvc
import malaya_speech






The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [5]:
sbmt_i = metadata[0]
x_org = sbmt_i[2]
c = np.random.normal(size = (2, 512)).astype(np.float32)

In [6]:
mel = tf.placeholder(tf.float32, (None, None, 80))
ori_vector = tf.placeholder(tf.float32, (None, 512))
target_vector = tf.placeholder(tf.float32, (None, 512))
mel_lengths = tf.placeholder(tf.int32, (None))

In [7]:
dim_neck = 32
dim_speaker = 256
config = malaya_speech.config.fastspeech_config
config['encoder_hidden_size'] = dim_speaker + 80
config['decoder_hidden_size'] = dim_speaker + dim_neck
config['output_attentions'] = True
config = fastspeech.Config(vocab_size = 1, **config)

In [8]:
model = fastvc.model.Model(dim_neck, config, dim_speaker = dim_speaker)


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [9]:
encoder_outputs, mel_before, mel_after, codes, attention = model(mel, ori_vector, target_vector, mel_lengths)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [10]:
attention

(<tf.Tensor 'fastvc/Encoder/encoder/layer_._0/attention/self/dropout/dropout/mul_1:0' shape=(?, 2, ?, ?) dtype=float32>,
 <tf.Tensor 'fastvc/Encoder/encoder/layer_._1/attention/self/dropout_3/dropout/mul_1:0' shape=(?, 2, ?, ?) dtype=float32>,
 <tf.Tensor 'fastvc/Encoder/encoder/layer_._2/attention/self/dropout_6/dropout/mul_1:0' shape=(?, 2, ?, ?) dtype=float32>,
 <tf.Tensor 'fastvc/Encoder/encoder/layer_._3/attention/self/dropout_9/dropout/mul_1:0' shape=(?, 2, ?, ?) dtype=float32>)

In [11]:
codes_, attention_ = model.call_second(mel_after, ori_vector, mel_lengths)

In [12]:
loss_f = tf.losses.absolute_difference
max_length = tf.cast(tf.reduce_max(mel_lengths), tf.int32)
mask = tf.sequence_mask(
    lengths = mel_lengths, maxlen = max_length, dtype = tf.float32
)
mask = tf.expand_dims(mask, axis = -1)
mask

<tf.Tensor 'ExpandDims:0' shape=<unknown> dtype=float32>

In [13]:
from functools import partial
from malaya_speech.train.loss import calculate_2d_loss, calculate_3d_loss

mse_mel = partial(
    loss_f,
    weights = mask
)

In [14]:
mel_loss_before = calculate_3d_loss(mel, mel_before, mse_mel)
mel_loss_before

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


<tf.Tensor 'absolute_difference/value:0' shape=() dtype=float32>

In [15]:
mel_loss_after = calculate_3d_loss(mel, mel_after, mse_mel)
mel_loss_after

<tf.Tensor 'absolute_difference_1/value:0' shape=() dtype=float32>

In [16]:
g_loss_cd = tf.losses.absolute_difference(codes, codes_)

In [17]:
g_loss_attention = tf.losses.absolute_difference(attention, attention_)
g_loss_attention

<tf.Tensor 'absolute_difference_3/value:0' shape=() dtype=float32>

In [18]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [19]:
o = sess.run([encoder_outputs, mel_before, mel_after, attention], feed_dict = {mel: [x_org] * 2,
                                                                    ori_vector: c,
                                                                    target_vector: c,
                                                                    mel_lengths: [len(x_org)] * 2})

In [20]:
o[0].shape, o[1].shape, o[2].shape

((2, 90, 288), (2, 90, 80), (2, 90, 80))

In [21]:
o[3][0].shape

(2, 2, 90, 90)

In [22]:
o[3][1].shape

(2, 2, 90, 90)

In [23]:
o[3][2].shape

(2, 2, 90, 90)

In [24]:
def average_by_duration(x, durs):
    mel_len = durs.sum()
    durs_cum = np.cumsum(np.pad(durs, (1, 0)))
    
    x_char = np.zeros((durs.shape[0],), dtype=np.float32)
    for idx, start, end in zip(range(mel_len), durs_cum[:-1], durs_cum[1:]):
        values = x[start:end][np.where(x[start:end] != 0.0)[0]]
        x_char[idx] = np.mean(values) if len(values) > 0 else 0.0

    return x_char.astype(np.float32)

def get_duration_from_alignment(alignment):
    D = np.array([0 for _ in range(np.shape(alignment)[0])])

    for i in range(np.shape(alignment)[1]):
        max_index = list(alignment[:, i]).index(alignment[:, i].max())
        D[max_index] = D[max_index] + 1

    return D

In [25]:
get_duration_from_alignment(o[3][-1][0,0])

array([2, 3, 0, 0, 0, 1, 1, 2, 2, 0, 0, 1, 3, 0, 1, 0, 1, 2, 1, 2, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 3, 2, 0, 1, 0, 2, 0, 2, 1, 0, 0, 0, 6, 1, 1,
       2, 0, 1, 0, 1, 0, 0, 2, 1, 2, 0, 0, 0, 1, 3, 1, 0, 2, 1, 0, 0, 0,
       0, 1, 1, 6, 0, 1, 1, 1, 1, 0, 0, 3, 0, 0, 1, 1, 1, 0, 4, 3, 1, 0,
       1, 3])

In [26]:
o = sess.run([mel_loss_before, mel_loss_after, g_loss_cd, g_loss_attention], feed_dict = {mel: [x_org] * 2,
                                                                    ori_vector: c,
                                                                    target_vector: c,
                                                                    mel_lengths: [len(x_org)] * 2})

In [27]:
o

[1.0245656, 1.3156967, 0.79576325, 0.0027572378]