In [1]:
import os

# GPU selection for this process; this is set before TensorFlow is imported
# in a later cell.
CUDA_DEVICE_IDS = '1'
os.environ['CUDA_VISIBLE_DEVICES'] = CUDA_DEVICE_IDS

In [2]:
import sys

# Put the parent of the working directory at the front of the import path
# (presumably so the in-repo `malaya_speech` package is picked up — confirm).
# `__name__` is a module name ('__main__' in a notebook), not a file path, so
# the parent directory is derived from the working directory explicitly.
SOURCE_DIR = os.path.dirname(os.getcwd())
sys.path.insert(0, SOURCE_DIR)

In [3]:
import tensorflow as tf
# Run TensorFlow eagerly: the cells below display tensors with concrete
# `numpy=` values rather than symbolic graph nodes.
tf.compat.v1.enable_eager_execution()

In [4]:
import malaya_speech
import malaya_speech.config
from malaya_speech.train.model import lightspeech, fastspeech2
import tensorflow as tf
import numpy as np

In [5]:
# Build the overrides on a copy so the shared module-level dict
# `malaya_speech.config.fastspeech2_config` is not mutated in place (that
# would leak into anything else reading it and into re-runs of this cell).
fastspeech2_params = dict(
    malaya_speech.config.fastspeech2_config,
    encoder_hidden_act = 'relu',
    hidden_dropout_prob = 0.1,
)
# NOTE(review): vocab_size = 66 is hard-coded; presumably the size of the
# text-id vocabulary used by the dataset — confirm.
config = fastspeech2.Config(vocab_size = 66, **fastspeech2_params)
model = lightspeech.Model(config)




In [6]:
# Show the attribute dictionary of the decoder self-attention parameters.
vars(config.decoder_self_attention_params)

{'n_speakers': 1,
 'hidden_size': 384,
 'num_hidden_layers': 4,
 'num_attention_heads': 2,
 'attention_head_size': 192,
 'intermediate_size': 1024,
 'intermediate_kernel_size': 3,
 'hidden_act': 'mish',
 'output_attentions': False,
 'output_hidden_states': False,
 'initializer_range': 0.02,
 'hidden_dropout_prob': 0.1,
 'attention_probs_dropout_prob': 0.1,
 'layer_norm_eps': 1e-05,
 'max_position_embeddings': 2048}

In [7]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code; only load files that
# come from a trusted source.

# First pickle: a (features dict, second object) pair, unpacked as `data`, `d`.
with open('dataset-mel.pkl', 'rb') as mel_file:
    data, d = pickle.load(mel_file)

# Second pickle: an object indexed later with the key 'wav'.
with open('dataset-mel-wav.pkl', 'rb') as wav_file:
    wav = pickle.load(wav_file)

data.keys()

dict_keys(['mel', 'text_ids', 'len_mel', 'len_text_ids', 'stop_token_target', 'f0', 'len_f0', 'energy', 'len_energy', 'g'])

In [8]:
def average_by_duration(x, durs):
    """Average a 1-D frame-level signal over consecutive duration segments.

    Segment ``k`` covers ``durs[k]`` consecutive entries of ``x``, starting
    where segment ``k - 1`` ended. Entries equal to ``0.0`` are ignored when
    averaging; a segment with no non-zero entries (including a zero-length
    segment) yields ``0.0``.

    Parameters
    ----------
    x : array-like, 1-D
        Frame-level values.
    durs : array-like of int, 1-D
        Number of frames in each segment.

    Returns
    -------
    np.ndarray
        float32 array of shape ``(len(durs),)`` with one average per segment.
    """
    x = np.asarray(x)
    durs = np.asarray(durs)

    # Segment boundaries: durs_cum[k] is the first frame of segment k.
    durs_cum = np.cumsum(np.pad(durs, (1, 0), mode='constant'))

    x_char = np.zeros((durs.shape[0],), dtype=np.float32)
    # Iterate over every segment. Indexing by the total frame count (as a
    # zip against range(durs.sum()) would) drops trailing segments whenever
    # the frame count is smaller than the number of segments.
    for idx, (start, end) in enumerate(zip(durs_cum[:-1], durs_cum[1:])):
        segment = x[start:end]
        values = segment[segment != 0.0]
        if values.size > 0:
            x_char[idx] = np.mean(values)

    return x_char

In [9]:
# Reduce the first example's frame-level f0 and energy to one value per
# entry of `d`.
f0, energy = (
    average_by_duration(data[feature][0], d) for feature in ('f0', 'energy')
)
f0.shape, energy.shape, d.shape

((72,), (72,), (72,))

In [10]:
# Build a batch of two by repeating the first example.
first_text_ids = data['text_ids'][0]
first_mel = data['mel'].astype(np.float32)[0]

i = tf.convert_to_tensor([first_text_ids] * 2)
lens = tf.convert_to_tensor([d] * 2)
mel_outputs = tf.convert_to_tensor([first_mel] * 2)
# NOTE(review): 408 is hard-coded; presumably the frame count of this mel —
# confirm against data['len_mel'].
mel_lengths = tf.convert_to_tensor([408] * 2)
# Stack the waveform array twice, then keep index 0 along axis 1.
wavs = tf.convert_to_tensor([wav['wav'].astype(np.float32)] * 2)[:, 0]
wavs.shape

TensorShape([Dimension(2), Dimension(104448)])

In [11]:
# Batch of two: the averaged f0 / energy arrays repeated, as float32.
f0s = tf.convert_to_tensor([f0.astype(np.float32)] * 2)
energies = tf.convert_to_tensor([energy.astype(np.float32)] * 2)

In [13]:
%%time

# Forward pass in training mode; the model returns five outputs.
(
    mel_before,
    mel_after,
    duration_outputs,
    f0_outputs,
    energy_outputs,
) = model(i, lens, f0s, energies, training = True)

CPU times: user 538 ms, sys: 55.4 ms, total: 593 ms
Wall time: 206 ms


In [13]:
# Shared MSE loss: used directly for the duration loss and, wrapped with
# `weights = mask` via functools.partial, for the mel losses below.
loss_f = tf.losses.mean_squared_error

In [14]:
# Duration target is log(duration + 1), computed in float32.
durations_plus_one = tf.cast(tf.math.add(lens, 1), tf.float32)
log_duration = tf.math.log(durations_plus_one)
duration_loss = loss_f(log_duration, duration_outputs)

In [15]:
# Float mask of shape (batch, max_length, 1): 1.0 for positions below each
# entry of `mel_lengths`, 0.0 otherwise.
max_length = tf.cast(tf.reduce_max(mel_lengths), tf.int32)
mask = tf.expand_dims(
    tf.sequence_mask(
        lengths = mel_lengths, maxlen = max_length, dtype = tf.float32
    ),
    axis = -1,
)
mask

<tf.Tensor: id=2967, shape=(2, 408, 1), dtype=float32, numpy=
array([[[1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],


In [16]:
from functools import partial
from malaya_speech.train.loss import calculate_2d_loss, calculate_3d_loss

# MSE loss with the sequence mask pre-bound as its `weights` argument.
mse_mel = partial(loss_f, weights = mask)

In [17]:
# Loss between the target mels (`mel_outputs`) and the model's first mel
# output (`mel_before`), computed by calculate_3d_loss with the masked MSE.
mel_loss_before = calculate_3d_loss(mel_outputs, mel_before, mse_mel)
mel_loss_before

<tf.Tensor: id=3006, shape=(), dtype=float32, numpy=39.706734>

In [18]:
# Same masked-MSE loss against the target mels, for the model's second mel
# output (`mel_after`).
mel_loss_after = calculate_3d_loss(mel_outputs, mel_after, mse_mel)
mel_loss_after

<tf.Tensor: id=3045, shape=(), dtype=float32, numpy=40.695656>