In [1]:
import os
import sys

SOURCE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__name__)))
sys.path.insert(0, SOURCE_DIR)

In [2]:
from malaya_speech.train.model import aligner, fastvc, fastspeech
import malaya_speech
import tensorflow as tf
import matplotlib.pyplot as plt






The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [3]:
from scipy.stats import betabinom
import numpy as np

def beta_binomial_prior_distribution(phoneme_count, mel_count, scaling_factor=1.0):
    x = np.arange(0, phoneme_count)
    mel_text_probs = []
    for i in range(1, mel_count + 1):
        a, b = scaling_factor * i, scaling_factor * (mel_count + 1 - i)
        mel_i_prob = betabinom(phoneme_count, a, b).pmf(x)
        mel_text_probs.append(mel_i_prob)
    return np.array(mel_text_probs)

attn_prior = np.expand_dims(beta_binomial_prior_distribution(50, 100), 0).astype(np.float32)
attn_prior.shape, attn_prior.min(), attn_prior.max()

((1, 100, 50), 4.968e-41, 0.6666667)

In [4]:
i = tf.placeholder(tf.int32, [None, None])
lens = tf.placeholder(tf.int32, [None])
mel = tf.placeholder(tf.float32, [None, None, 80])
mel_lengths = tf.placeholder(tf.int32, [None])
prior = tf.placeholder(tf.float32, [None, None, None])

In [5]:
encoder = aligner.AlignmentEncoder(vocab_size = 100, vocab_embedding = 512)



Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [6]:
attention_mask = tf.expand_dims(tf.math.not_equal(i, 0), -1)
attention_mask

<tf.Tensor 'ExpandDims:0' shape=(?, ?, 1) dtype=bool>

In [7]:
attn_soft, attn_logprob = encoder(mel, i, mask = attention_mask, attn_prior = prior)
attn_soft, attn_logprob

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


(<tf.Tensor 'AlignmentEncoder/Softmax:0' shape=(?, 1, ?, ?) dtype=float32>,
 <tf.Tensor 'AlignmentEncoder/Identity:0' shape=(?, 1, ?, ?) dtype=float32>)

In [8]:
attn_hard = encoder.get_hard_attention(attn_soft, lens, mel_lengths)
attn_hard

<tf.Tensor 'PyFunc:0' shape=(?, 1, ?, ?) dtype=float32>

In [9]:
forwardsum_loss = aligner.forwardsum_loss(attn_logprob, lens, mel_lengths)
forwardsum_loss

Instructions for updating:
Use `tf.cast` instead.



<tf.Tensor 'truediv:0' shape=() dtype=float32>

In [10]:
bin_loss = aligner.bin_loss(attn_hard, attn_soft)
bin_loss

<tf.Tensor 'truediv_1:0' shape=() dtype=float32>

In [None]:
optimizer = tf.train.AdamOptimizer(learning_rate = 1e-3).minimize(forwardsum_loss)

In [None]:
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

In [None]:
import pickle

with open('dataset-mel.pkl', 'rb') as fopen:
    data, d = pickle.load(fopen)

In [None]:
data.keys()

In [None]:
l_text = data['len_text_ids'][0,0]
l_mel = data['len_mel'][0,0]
l_text, l_mel

In [None]:
prior_ = np.expand_dims(beta_binomial_prior_distribution(l_text, l_mel), 0).astype(np.float32)
prior_.shape

In [None]:
o = sess.run([attn_soft, attn_logprob], feed_dict = {i: data['text_ids'],
                                                    lens: data['len_text_ids'][:,0],
                                                    mel: data['mel'],
                                                    mel_lengths: data['len_mel'][:,0],
                                                    prior: prior_})
o[0].shape, o[1].shape

In [None]:
for no in range(200):
    o = sess.run([forwardsum_loss, bin_loss, optimizer], feed_dict = {i: data['text_ids'],
                                                        lens: data['len_text_ids'][:,0],
                                                        mel: data['mel'],
                                                        mel_lengths: data['len_mel'][:,0],
                                                        prior: prior_})
    print(no, o)

In [None]:
o = sess.run([attn_soft, attn_logprob, attn_hard], feed_dict = {i: data['text_ids'],
                                                    lens: data['len_text_ids'][:,0],
                                                    mel: data['mel'],
                                                    mel_lengths: data['len_mel'][:,0],
                                                    prior: prior_})
o[0].shape, o[1].shape, o[2].shape

In [None]:
o[0][0,0].shape

In [None]:
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111)
ax.set_title('Alignment steps')
im = ax.imshow(
    o[0][0,0],
    aspect='auto',
    origin='lower',
    interpolation='none')
fig.colorbar(im, ax=ax)
xlabel = 'Decoder timestep'
plt.xlabel(xlabel)
plt.ylabel('Encoder timestep')
plt.tight_layout()
plt.show()

In [None]:
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111)
ax.set_title('Alignment steps')
im = ax.imshow(
    o[2][0,0],
    aspect='auto',
    origin='lower',
    interpolation='none')
fig.colorbar(im, ax=ax)
xlabel = 'Decoder timestep'
plt.xlabel(xlabel)
plt.ylabel('Encoder timestep')
plt.tight_layout()
plt.show()

In [None]:
tf.trainable_variables()

In [None]:
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'test/model.ckpt')

In [None]:
!ls -lh test
!rm -rf test