In [1]:
import os

# Expose only GPU index 1 to CUDA. This is done here, before TensorFlow is
# imported in a later cell.
os.environ.update({'CUDA_VISIBLE_DEVICES': '1'})

In [2]:
import sys

# Parent of the notebook's working directory, put first on sys.path
# (presumably so the local checkout of malaya_speech is the one imported).
SOURCE_DIR = os.path.dirname(os.getcwd())
sys.path.insert(0, SOURCE_DIR)

In [3]:
import malaya_speech
import malaya_speech.config
from malaya_speech.train.model import fastpitch
import tensorflow as tf
import numpy as np
import parselmouth






The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [4]:
# Build the FastPitch config from the FastSpeech2 settings shipped in
# malaya_speech.config, adding a vocabulary size of 66, then create the model.
config = fastpitch.Config(vocab_size = 66, **malaya_speech.config.fastspeech2_config)
model = fastpitch.Model(config)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [5]:
# Graph inputs (TF1 placeholders); None marks a dimension left dynamic.
# Token ids of the input text.
i = tf.placeholder(dtype = tf.int32, shape = [None, None])
# Ground-truth duration of every token.
lens = tf.placeholder(dtype = tf.int32, shape = [None, None])
# Target mel spectrogram with 80 channels, and its length per utterance.
mel_outputs = tf.placeholder(dtype = tf.float32, shape = [None, None, 80])
mel_lengths = tf.placeholder(dtype = tf.int32, shape = [None])
# Target pitch (one value per token in this notebook), and its length per utterance.
pitchs = tf.placeholder(dtype = tf.float32, shape = [None, None])
pitchs_lengths = tf.placeholder(dtype = tf.int32, shape = [None])

In [6]:
# Build the training graph. The model returns two mel predictions
# (`mel_before` / `mel_after` -- presumably before and after a refinement
# stage, TODO confirm) plus the predicted per-token durations and pitch.
mel_before, mel_after, duration_outputs, pitch_outputs = model(i, lens, pitchs, training = True)


Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [7]:
# Display the symbolic output tensors to check their static shapes and dtypes.
mel_before, mel_after, duration_outputs, pitch_outputs

(<tf.Tensor 'model/mel_before/BiasAdd:0' shape=(?, ?, 80) dtype=float32>,
 <tf.Tensor 'model/add_2:0' shape=(?, ?, 80) dtype=float32>,
 <tf.Tensor 'model/duration_predictor/Squeeze:0' shape=(?, ?) dtype=float32>,
 <tf.Tensor 'model/pitch_predictor/Squeeze:0' shape=(?, ?) dtype=float32>)

In [8]:
# Base criterion shared by the duration, mel and pitch losses: mean squared error.
loss_f = tf.losses.mean_squared_error

In [9]:
# Duration target in the log domain: log(d + 1), so that a zero duration maps
# to 0 instead of -inf.
log_duration = tf.math.log(
    tf.cast(tf.math.add(lens, 1), tf.float32)
)
# NOTE(review): no mask is applied to this term, so padded token positions in
# a batch with more than one utterance would contribute too -- confirm intent.
duration_loss = loss_f(log_duration, duration_outputs)

In [10]:
# Validity mask over mel frames: 1.0 for real frames, 0.0 for padding. The
# trailing singleton axis lines it up with the [batch, frames, 80] mel tensors.
max_length = tf.cast(tf.reduce_max(mel_lengths), tf.int32)
mask = tf.expand_dims(
    tf.sequence_mask(lengths = mel_lengths, maxlen = max_length, dtype = tf.float32),
    axis = -1,
)
mask

<tf.Tensor 'ExpandDims:0' shape=(?, ?, 1) dtype=float32>

In [11]:
from functools import partial
from malaya_speech.train.loss import calculate_2d_loss, calculate_3d_loss

# MSE weighted by the mel-frame mask. partial() stores the mask object that
# exists at this point, so rebinding the name `mask` later does not change it.
mse_mel = partial(loss_f, weights = mask)

In [12]:
# Masked MSE between the target mels and the `mel_before` prediction.
# Argument order at this call site: (target, prediction, loss function).
mel_loss_before = calculate_3d_loss(mel_outputs, mel_before, mse_mel)
mel_loss_before

<tf.Tensor 'mean_squared_error_1/value:0' shape=() dtype=float32>

In [13]:
# Masked MSE between the target mels and the `mel_after` prediction.
# Argument order at this call site: (target, prediction, loss function).
mel_loss_after = calculate_3d_loss(mel_outputs, mel_after, mse_mel)
mel_loss_after

<tf.Tensor 'mean_squared_error_2/value:0' shape=() dtype=float32>

In [14]:
# Validity mask over the pitch sequence (1.0 for real positions, 0.0 for
# padding). Dedicated names are used so the mel-frame `mask` / `max_length`
# stay intact: reusing them would make a re-run of the cell that builds
# `mse_mel` silently bind the pitch mask to the mel loss.
pitch_max_length = tf.cast(tf.reduce_max(pitchs_lengths), tf.int32)
pitch_mask = tf.sequence_mask(
    lengths = pitchs_lengths, maxlen = pitch_max_length, dtype = tf.float32
)
# MSE weighted by the pitch mask (despite the `_mel` suffix, this is the
# pitch criterion; the name is kept for compatibility).
pitchs_mel = partial(
    loss_f,
    weights = pitch_mask
)
# Argument order at this call site: (target, prediction, loss function).
pitchs_loss = calculate_2d_loss(pitchs, pitch_outputs, pitchs_mel)

In [15]:
# Total objective: plain, unweighted sum of the four loss terms.
loss = duration_loss + mel_loss_before + mel_loss_after + pitchs_loss

In [16]:
# Open a session and initialise every variable. No checkpoint is restored in
# the cells shown here, so the model runs with freshly initialised weights.
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

In [17]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load pickles that come from a trusted pipeline.
# First file: a pair (data, d) -- `data` is a dict of feature arrays and `d`
# is the per-token duration array used below.
with open('dataset-mel.pkl', 'rb') as fopen:
    data, d = pickle.load(fopen)
    
# Second file: a mapping whose 'wav' entry holds the raw waveform(s).
with open('dataset-mel-wav.pkl', 'rb') as fopen:
    wav = pickle.load(fopen)

In [18]:
# List the feature arrays available in the loaded sample.
data.keys()

dict_keys(['mel', 'text_ids', 'len_mel', 'len_text_ids', 'stop_token_target', 'f0', 'len_f0', 'energy', 'len_energy', 'g'])

In [19]:
# Per-token durations for the sample (presumably counted in mel frames -- TODO confirm).
d

array([ 0,  5,  5,  3,  6,  7,  6,  1,  2, 14,  4,  0,  0,  0, 12,  0, 10,
       10, 13, 14,  1,  6,  7,  5,  3,  6,  1,  4, 12,  3, 11,  7,  5,  3,
        0,  2,  0,  3,  8,  4, 13, 12,  8,  7,  0,  0,  3,  2,  5, 11,  5,
        7,  8, 14, 11,  6,  4,  9,  6,  7,  1,  6,  5,  5,  6,  7, 12,  8,
        6,  2,  3,  6])

In [20]:
def average_by_duration(x, durs):
    """Average a frame-level feature over each token's span of frames.

    Parameters
    ----------
    x : 1-D np.ndarray
        Frame-level values (for example f0, energy or pitch). Zeros are treated
        as "no value" and are left out of the average.
    durs : 1-D np.ndarray of int
        Number of consecutive frames assigned to each token, in order.

    Returns
    -------
    np.ndarray of float32, shape (len(durs),)
        Mean of the non-zero frames of each token, or 0.0 when a token has no
        frames or only zero-valued frames.
    """
    # Frame boundaries of every token: token k covers x[bounds[k]:bounds[k + 1]].
    bounds = np.concatenate(([0], np.cumsum(durs)))

    x_char = np.zeros((durs.shape[0],), dtype=np.float32)
    # Iterate over tokens, not frames: pairing the spans with range(total_frames)
    # would cut the loop short whenever there are more tokens than frames,
    # leaving the trailing tokens at 0.
    for idx, (start, end) in enumerate(zip(bounds[:-1], bounds[1:])):
        segment = x[start:end]
        values = segment[segment != 0.0]
        x_char[idx] = np.mean(values) if len(values) > 0 else 0.0

    return x_char

In [21]:
# Collapse the frame-level f0 and energy of the first utterance to one value
# per token.
# NOTE(review): neither result is used by the cells shown here (the pitch fed
# to the model is re-extracted with parselmouth) -- confirm they are needed.
f0 = average_by_duration(data['f0'][0], d)
energy = average_by_duration(data['energy'][0], d)

In [22]:
# Re-extract a pitch track from the raw waveform with parselmouth (Praat).
mel_len = data['mel'].shape[1]
snd = parselmouth.Sound(wav['wav'][0], sampling_frequency = 22050)
# The time step aims at one pitch value per mel frame. The `+ 3` offset looks
# empirical: for this sample it yields exactly mel_len values
# -- TODO confirm it holds for other utterance lengths.
pitch = snd.to_pitch(
    time_step = snd.duration / (mel_len + 3)
).selected_array['frequency']
pitch.shape, mel_len

((408,), 408)

In [23]:
# Reduce the frame-level pitch track to one value per token.
# NOTE(review): this rebinds `pitch` from the frame-level to the token-level
# array, so the cell is not idempotent -- running it twice averages the
# already-averaged values against the same durations.
pitch = average_by_duration(pitch, d)
pitch.shape

(72,)

In [25]:
# Forward pass on the single sample. `d` and `pitch` are 1-D, so they are
# wrapped in a list to add the batch dimension the placeholders expect.
r = sess.run([mel_before, mel_after, duration_outputs, pitch_outputs], 
         feed_dict = {i: data['text_ids'],
                      lens: [d],
                      pitchs: [pitch]})

In [27]:
# Shapes of the four outputs: both mel predictions, then durations and pitch.
r[0].shape, r[1].shape, r[2].shape, r[3].shape

((1, 408, 80), (1, 408, 80), (1, 72), (1, 72))

In [28]:
# Evaluate the four loss terms on the single sample. The feed adds the mel
# targets and the two length vectors required by the masks; this overwrites
# the `r` produced by the forward pass.
r = sess.run([duration_loss, mel_loss_before, mel_loss_after, pitchs_loss], 
         feed_dict = {i: data['text_ids'],
                      lens: [d],
                      mel_outputs:data['mel'],
                      mel_lengths:data['len_mel'][0],
                      pitchs: [pitch],
                      pitchs_lengths: [len(pitch)],})

In [29]:
# Loss values in order: duration, mel (before), mel (after), pitch.
# NOTE(review): the pitch term is orders of magnitude larger than the others,
# presumably because the targets are raw frequency values and are not
# normalised -- confirm whether pitch should be standardised before training.
r

[6.451974, 2.8066676, 3.9198377, 27591.584]