In [1]:
import os

# Process-wide TensorFlow/CUDA settings; they must be in place before
# TensorFlow is imported further down.
os.environ.update({
    'CUDA_VISIBLE_DEVICES': '',            # expose no CUDA device to this process
    'TF_FORCE_GPU_ALLOW_GROWTH': 'true',
})

In [2]:
# !wget https://f000.backblazeb2.com/file/malay-dataset/e2e-tts-dataset.pkl

In [3]:
# !pip3 install tensorflow==1.15.5

In [4]:
import sys

# SOURCE_DIR is the parent of the current working directory; it is put first
# on sys.path (presumably so the local malaya_speech checkout wins over any
# installed copy -- confirm against the repo layout).
SOURCE_DIR = os.path.dirname(os.getcwd())
sys.path.insert(0, SOURCE_DIR)

In [5]:
import tensorflow as tf

# Run in graph mode: the placeholders and tf.Session used later in this
# notebook are graph-mode constructs.
tf.compat.v1.disable_eager_execution()

In [6]:
import malaya_speech
import numpy as np

In [7]:
from malaya_speech.train.model import vits
from malaya_speech.train.model.vits import gen
from malaya_speech.train.model import fastspeech2
from malaya_speech.train.model.fastspeech2 import model_stochastic

In [8]:
# Build the VITS hyper-parameter object from the base config shipped in
# malaya_speech.config; the bare expression displays it as the cell output.
hparams = vits.HParams(**malaya_speech.config.vits_base_config)
hparams

{'model': {'inter_channels': 192, 'hidden_channels': 192, 'filter_channels': 768, 'n_heads': 2, 'n_layers': 6, 'kernel_size': 3, 'p_dropout': 0.1, 'resblock': '1', 'resblock_kernel_sizes': [3, 7, 11], 'resblock_dilation_sizes': [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 'upsample_rates': [8, 8, 2, 2], 'upsample_initial_channel': 512, 'upsample_kernel_sizes': [16, 16, 4, 4], 'n_layers_q': 3, 'use_spectral_norm': False}, 'train': {'log_interval': 200, 'eval_interval': 1000, 'seed': 1234, 'epochs': 20000, 'learning_rate': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'batch_size': 64, 'fp16_run': True, 'lr_decay': 0.999875, 'segment_size': 8192, 'init_lr_ratio': 1, 'warmup_epochs': 0, 'c_mel': 45, 'c_kl': 1.0}, 'data': {'max_wav_value': 32768.0, 'sampling_rate': 22050, 'filter_length': 1024, 'hop_length': 256, 'win_length': 1024, 'n_mel_channels': 80, 'mel_fmin': 0.0, 'mel_fmax': None, 'add_blank': True, 'n_speakers': 0}}

In [9]:
# Training segment length expressed in spectrogram frames
# (segment samples divided by the hop length).
segment_size = hparams.train.segment_size // hparams.data.hop_length
# Number of frequency bins in the linear spectrogram: filter_length // 2 + 1.
spec_channels = (hparams.data.filter_length // 2) + 1
(spec_channels, segment_size)

(513, 32)

In [10]:
import json
import os
from glob import glob

# Every .WAV file in the IMDA sample folder.
wavs = glob('../speech/imda/*.WAV')

# JSON mapping read from the same folder; a later cell indexes it by wav file name.
with open('../speech/imda/output.json') as fopen:
    data = json.load(fopen)

In [11]:
import pickle

# Load the pre-extracted end-to-end TTS examples.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file --
# only load this .pkl if it was obtained from a trusted source.
with open('e2e-tts-dataset.pkl', 'rb') as fopen:
    e2e_dataset = pickle.load(fopen)

In [12]:
# Peek at the 'audio' field of the first dataset entry.
e2e_dataset[0]['audio']

array([8.50058964e-05, 1.55870686e-04, 1.46839827e-04, ...,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00])

In [13]:
vocab = malaya_speech.utils.text.TTS_SYMBOLS

# Pair each loaded waveform with its encoded transcript. Transcripts are
# looked up in `data` by the wav file's base name.
batch = []
for w in wavs:
    t = data[os.path.basename(w)]
    y, _ = malaya_speech.load(w)
    batch.append((y, malaya_speech.utils.text.tts_encode(t, vocab)))

In [14]:
from librosa.filters import mel as librosa_mel_fn

# Mel filterbank used to project linear spectrograms onto mel bins.
# Arguments are passed by keyword: recent librosa releases make them
# keyword-only, while older releases accept keywords as well.
melbank = librosa_mel_fn(
    sr=hparams.data.sampling_rate,
    n_fft=hparams.data.filter_length,
    n_mels=hparams.data.n_mel_channels,
    fmin=hparams.data.mel_fmin,
    fmax=hparams.data.mel_fmax,
)

# Constant tensor version of the filterbank, consumed by spec_to_mel.
MEL = tf.convert_to_tensor(melbank)

def dynamic_range_compression(x, C=1, clip_val=1e-5):
    """
    Log-compress `x` after flooring it at `clip_val`.

    PARAMS
    ------
    x: tensor to compress
    C: compression factor, multiplied in before the log
    clip_val: lower bound applied to `x` before the log

    RETURNS
    -------
    log(clip(x) * C)
    """
    # The upper clip bound is the tensor's own maximum.
    ceiling = tf.reduce_max(x)
    floored = tf.clip_by_value(x, clip_val, ceiling)
    return tf.math.log(floored * C)


def dynamic_range_decompression(x, C=1):
    """
    Undo the log compression: exponentiate `x` and divide by `C`.

    PARAMS
    ------
    x: log-compressed tensor
    C: compression factor used to compress
    """
    expanded = tf.exp(x)
    return expanded / C


def spectral_normalize(magnitudes):
    """Apply dynamic_range_compression (with its default settings) to `magnitudes`."""
    return dynamic_range_compression(magnitudes)


def spectral_de_normalize_torch(magnitudes):
    """Apply dynamic_range_decompression (with its default settings) to `magnitudes`."""
    return dynamic_range_decompression(magnitudes)

def spectrogram_tf(audio_norm, filter_length, hop_length):
    """
    Magnitude spectrogram of a single 1-D waveform.

    PARAMS
    ------
    audio_norm: 1-D waveform tensor (it is padded along its only axis)
    filter_length: STFT frame length in samples
    hop_length: STFT frame step in samples

    RETURNS
    -------
    sqrt(|STFT|^2 + 1e-6) of the reflect-padded signal
    """
    # Reflect-pad both ends by half of (frame length - hop).
    half_overlap = int((filter_length - hop_length) / 2)
    extended = tf.pad(audio_norm, [[half_overlap, half_overlap]], mode='reflect')
    stft = tf.signal.stft(
        extended,
        frame_length=filter_length,
        frame_step=hop_length,
        fft_length=None,
        window_fn=tf.signal.hann_window,
        pad_end=False,
    )
    magnitude = tf.abs(stft)
    # The small epsilon keeps the square root strictly positive.
    return tf.sqrt(magnitude ** 2 + 1e-6)

def spec_to_mel(spec):
    """Project `spec` onto the transposed mel filterbank `MEL`, then log-compress it."""
    mel_basis_t = tf.transpose(MEL)
    mel = tf.matmul(spec, mel_basis_t)
    return spectral_normalize(mel)

In [15]:
# Batch of zero-padded waveforms and the true length of each one.
X = tf.compat.v1.placeholder(dtype = tf.float32, shape = [None, None], name = 'X_placeholder')
X_len = tf.compat.v1.placeholder(dtype = tf.int32, shape = [None], name = 'X_len_placeholder')

In [16]:
# Number of utterances in the fed batch (only known at run time).
batch_size = tf.shape(X)[0]
# Each utterance yields a different number of frames, so element shapes are not inferred.
features = tf.TensorArray(dtype = tf.float32, size = batch_size, dynamic_size = True, infer_shape = False)
features_len = tf.TensorArray(dtype = tf.int32, size = batch_size)

init_state = (0, features, features_len)

def condition(i, features, features_len):
    """Keep looping while utterances remain in the batch."""
    return i < batch_size

def body(i, features, features_len):
    """Compute the spectrogram of utterance `i`, trimmed to its true length, and store it."""
    f = spectrogram_tf(X[i, :X_len[i]], hparams.data.filter_length, hparams.data.hop_length)
    f_len = tf.shape(f)[0]
    return i + 1, features.write(i, f), features_len.write(i, f_len)

_, features, features_len = tf.while_loop(condition, body, init_state)
features_len = features_len.stack()

padded_features_mel = tf.TensorArray(dtype = tf.float32, size = batch_size)
padded_features = tf.TensorArray(dtype = tf.float32, size = batch_size)
padded_lens = tf.TensorArray(dtype = tf.int32, size = batch_size)
# Longest spectrogram in the batch; everything is padded up to this many frames.
maxlen = tf.reduce_max(features_len)

init_state = (0, padded_features_mel, padded_features, padded_lens)

def condition(i, padded_features_mel, padded_features, padded_lens):
    """Keep looping while utterances remain in the batch."""
    return i < batch_size

def body(i, padded_features_mel, padded_features, padded_lens):
    """Pad the linear and mel spectrograms of utterance `i` to `maxlen` frames."""
    f = features.read(i)
    f_mel = spec_to_mel(f)
    len_f = tf.shape(f)[0]
    # Compute the pad amount once, from the unpadded length: deriving it from
    # `f` after padding would always give 0 and leave the mel unpadded.
    pad_frames = maxlen - len_f
    f = tf.pad(f, [[0, pad_frames], [0, 0]])
    f_mel = tf.pad(f_mel, [[0, pad_frames], [0, 0]])
    # Store the mel spectrogram (not the linear one) in padded_features_mel.
    return i + 1, padded_features_mel.write(i, f_mel), padded_features.write(i, f), padded_lens.write(i, len_f)

_, padded_features_mel, padded_features, padded_lens = tf.while_loop(condition, body, init_state)
padded_features_mel = padded_features_mel.stack()
padded_features = padded_features.stack()
padded_lens = padded_lens.stack()
# Restore the static shape information lost inside the while loops.
padded_lens.set_shape((None,))
padded_features_mel.set_shape((None, None, hparams.data.n_mel_channels))
padded_features.set_shape((None, None, spec_channels))
padded_features, padded_lens

(<tf.Tensor 'TensorArrayStack_2/TensorArrayGatherV3:0' shape=(?, ?, 513) dtype=float32>,
 <tf.Tensor 'TensorArrayStack_3/TensorArrayGatherV3:0' shape=(?,) dtype=int32>)

In [17]:
# Split the (waveform, token ids) pairs into two parallel lists.
batch_x = [audio for audio, _ in batch]
batch_y = [ids for _, ids in batch]
# Pad the waveforms to a common length and keep their original lengths.
x, x_len = malaya_speech.utils.padding.sequence_1d(batch_x, return_len = True)

In [18]:
# Pad the encoded transcripts the same way as the waveforms, keeping their lengths.
y, y_len = malaya_speech.utils.padding.sequence_1d(batch_y, return_len = True)

In [19]:
# Instantiate the generator model from malaya_speech.train.model.vits.gen
# with the vocabulary size, spectrogram bin count, segment length and the
# `model` section of the hyper-parameters.
model = gen.Model(len(vocab), spec_channels, segment_size, **hparams.model)


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [20]:
# Batch of text id sequences and the length of each sequence.
T = tf.compat.v1.placeholder(dtype = tf.int32, shape = [None, None])
T_lengths = tf.compat.v1.placeholder(dtype = tf.int32, shape = [None])

In [22]:
# FastSpeech2 configuration: library defaults plus this notebook's vocabulary size.
config_fs = fastspeech2.Config(
    vocab_size=len(vocab),
    **malaya_speech.config.fastspeech2_config
)
# Switch the post-net off for this experiment.
config_fs.enable_postnet = False

In [23]:
# FastSpeech2 "stochastic" variant built from the config prepared above.
model_fs = model_stochastic.Model(config_fs)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [24]:
# Acoustic-side inputs of the FastSpeech2 model. They are created through
# tf.compat.v1 like the other placeholders in this notebook (X, X_len, T,
# T_lengths), so all placeholders use the same API.
# `lens` receives the dataset's 'alignment' entry in the feed dict further down.
lens = tf.compat.v1.placeholder(tf.int32, [None, None])
energies = tf.compat.v1.placeholder(tf.float32, [None, None])
energies_lengths = tf.compat.v1.placeholder(tf.int32, [None])
f0s = tf.compat.v1.placeholder(tf.float32, [None, None])
f0s_lengths = tf.compat.v1.placeholder(tf.int32, [None])

In [25]:
# Run the FastSpeech2 model in training mode on the text, alignment, f0 and
# energy placeholders. The second returned value is discarded.
mel_before, _, duration_outputs, f0_outputs, energy_outputs = model_fs(
    T, lens, f0s, energies, training = True)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [26]:
from functools import partial
from malaya_speech.train import loss

# Base regression loss; per-frame masks are bound to it with functools.partial below.
loss_f = tf.losses.mean_squared_error

In [27]:
# Energy loss: MSE weighted by a 0/1 mask built from the true sequence
# lengths, so padded positions carry zero weight.
max_length = tf.cast(tf.reduce_max(energies_lengths), tf.int32)
mask = tf.sequence_mask(energies_lengths, max_length, tf.float32)
energies_mel = partial(loss_f, weights = mask)
energies_loss = loss.calculate_2d_loss(energies, energy_outputs, energies_mel)

In [28]:
# The duration output of the model is used directly as the duration loss
# (NOTE(review): presumably the stochastic model already returns a loss value
# here rather than predicted durations -- confirm in model_stochastic).
duration_loss = duration_outputs

In [29]:
# F0 loss: same masked-MSE construction, this time with the f0 lengths.
# NOTE(review): `max_length`, `mask` and `energies_mel` are rebound here;
# despite its name, `energies_mel` now holds the f0-masked loss function.
max_length = tf.cast(tf.reduce_max(f0s_lengths), tf.int32)
mask = tf.sequence_mask(f0s_lengths, max_length, tf.float32)
energies_mel = partial(loss_f, weights = mask)
f0s_loss = loss.calculate_2d_loss(f0s, f0_outputs, energies_mel)

In [30]:
# Display the three loss tensors (symbolic graph nodes, not values).
energies_loss, duration_loss, f0s_loss

(<tf.Tensor 'mean_squared_error/value:0' shape=() dtype=float32>,
 <tf.Tensor 'model_1/Sum_1:0' shape=() dtype=float32>,
 <tf.Tensor 'mean_squared_error_1/value:0' shape=() dtype=float32>)

In [31]:
# Display the symbolic mel output of the FastSpeech2 model.
mel_before

<tf.Tensor 'model_1/mel_before/BiasAdd:0' shape=(?, ?, 80) dtype=float32>

In [32]:
# Training-style generator call: returns the generated waveform tensor and `ids_slice`.
# NOTE(review): `padded_lens` is derived from the audio fed through X, whereas
# `mel_before` comes from the text/alignment inputs -- confirm both describe
# the same number of frames.
y_hat, ids_slice = model(mel_before, padded_lens)
y_hat, ids_slice

(<tf.Tensor 'model/generator/Tanh:0' shape=(?, ?, 1) dtype=float32>,
 <tf.Tensor 'model/Cast_1:0' shape=(?,) dtype=int32>)

In [35]:
# Inference path of the generator: takes only the mel tensor, no lengths.
outputs = model.infer(mel_before)
outputs

<tf.Tensor 'generator/Tanh:0' shape=(?, ?, 1) dtype=float32>

In [36]:
# Create a session on the default graph and give every variable its initial value.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [37]:
# List the fields available in one dataset entry.
e2e_dataset[0].keys()

dict_keys(['mel', 'text_ids', 'len_mel', 'len_text_ids', 'stop_token_target', 'f0', 'len_f0', 'energy', 'len_energy', 'f', 'alignment', 'audio', 'len_audio'])

In [38]:
# Peek at the 'f0' field of the first dataset entry.
e2e_dataset[0]['f0']

array([ 0.00000000e+00,  0.00000000e+00,  2.21143413e+00,  9.05393302e-01,
        9.35836256e-01,  2.09698558e+00,  2.21143413e+00,  1.79257405e+00,
        1.17414594e+00,  1.01634789e+00,  7.93029308e-01,  5.85050583e-01,
        1.24123044e-01, -1.22731633e-01,  0.00000000e+00,  0.00000000e+00,
       -1.22875735e-01, -5.07767618e-01,  2.21143413e+00,  8.78810883e-01,
       -2.46541604e-01,  9.76493359e-01, -4.39082146e-01, -1.53875756e+00,
       -1.83528638e+00,  1.70695603e+00,  2.21143413e+00,  2.21143413e+00,
        2.00131238e-01,  8.64442587e-02, -4.05265614e-02, -6.66145980e-02,
        1.74258664e-01, -3.21222767e-02,  1.52887389e-01,  1.56236038e-01,
        5.45233309e-01,  4.67070639e-01, -3.88945609e-01,  2.21143413e+00,
       -1.30476981e-01, -3.90558451e-01, -1.52522489e-01, -2.29833692e-01,
       -5.26054144e-01, -4.31837589e-01, -8.68138019e-03,  1.61830533e+00,
        8.42465878e-01, -5.97679496e-01, -1.44224799e+00,  2.21143413e+00,
        2.21143413e+00,  

In [39]:
# Feed a single example (batch of one) built from the first dataset entry.
# NOTE(review): the sequence fields are wrapped in a list to add the batch
# axis, while the length fields are fed as stored -- presumably they are
# already 1-D; confirm against the dataset.
# `energies_lengths` and `f0s_lengths` are not fed, so the masked energy/f0
# losses cannot be evaluated with this dict.
feed_dict = {
    X: [e2e_dataset[0]['audio']],
    X_len: e2e_dataset[0]['len_audio'],
    T: [e2e_dataset[0]['text_ids']],
    T_lengths: e2e_dataset[0]['len_text_ids'],
    lens: [e2e_dataset[0]['alignment']],
    f0s: [e2e_dataset[0]['f0']],
    energies: [e2e_dataset[0]['energy']],
}

In [41]:
# Evaluate both generator paths plus the FastSpeech2 outputs in one run.
# Fetch order: y_hat, outputs, mel_before, duration_outputs, f0_outputs, energy_outputs.
o_ = sess.run([y_hat, outputs, mel_before, duration_outputs, f0_outputs, energy_outputs], feed_dict = feed_dict)
# Shapes of the training-path and inference-path waveforms.
o_[0].shape, o_[1].shape

((1, 8192, 1), (1, 198656, 1))

In [42]:
# Third fetch from the end: the evaluated `duration_outputs`.
o_[-3]

5.1322074

In [43]:
# Shapes of the evaluated `f0_outputs` and `energy_outputs`.
o_[-2].shape, o_[-1].shape

((1, 160), (1, 160))