In [1]:
import os

# Configure the process environment before TensorFlow is imported in a later cell:
# an empty CUDA_VISIBLE_DEVICES exposes no CUDA devices to this process, and
# TF_FORCE_GPU_ALLOW_GROWTH asks TensorFlow to grow GPU memory on demand.
os.environ.update({
    'CUDA_VISIBLE_DEVICES': '',
    'TF_FORCE_GPU_ALLOW_GROWTH': 'true',
})

In [2]:
import sys

# `__file__` is not defined inside a notebook, so `__name__` is used as a dummy
# path component under the current working directory; stepping up two levels
# from it yields the parent of the working directory, which is put first on
# sys.path so imports resolve against that directory before installed packages.
SOURCE_DIR = os.path.abspath(os.path.join(__name__, os.pardir, os.pardir))
sys.path.insert(0, SOURCE_DIR)

In [3]:
import tensorflow as tf
import malaya_speech
import numpy as np

In [4]:
from malaya_speech.train.model.vits import model
from malaya_speech.train.model import vits

In [5]:
# Build the hyper-parameter object from the base VITS config shipped with
# malaya_speech; the bare expression below displays it as the cell output
# (sections `model`, `train` and `data`).
hparams = vits.HParams(**malaya_speech.config.vits_base_config)
hparams

{'model': {'inter_channels': 192, 'hidden_channels': 192, 'filter_channels': 768, 'n_heads': 2, 'n_layers': 6, 'kernel_size': 3, 'p_dropout': 0.1, 'resblock': '1', 'resblock_kernel_sizes': [3, 7, 11], 'resblock_dilation_sizes': [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 'upsample_rates': [8, 8, 2, 2], 'upsample_initial_channel': 512, 'upsample_kernel_sizes': [16, 16, 4, 4], 'n_layers_q': 3, 'use_spectral_norm': False}, 'train': {'log_interval': 200, 'eval_interval': 1000, 'seed': 1234, 'epochs': 20000, 'learning_rate': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'batch_size': 64, 'fp16_run': True, 'lr_decay': 0.999875, 'segment_size': 8192, 'init_lr_ratio': 1, 'warmup_epochs': 0, 'c_mel': 45, 'c_kl': 1.0}, 'data': {'max_wav_value': 32768.0, 'sampling_rate': 22050, 'filter_length': 1024, 'hop_length': 256, 'win_length': 1024, 'n_mel_channels': 80, 'mel_fmin': 0.0, 'mel_fmax': None, 'add_blank': True, 'n_speakers': 0}}

In [6]:
# Training segment length expressed in spectrogram frames (samples // hop).
segment_size = hparams.train.segment_size // hparams.data.hop_length
# Number of bins in a one-sided spectrum for the configured FFT size.
spec_channels = 1 + hparams.data.filter_length // 2
spec_channels, segment_size

(513, 32)

In [7]:
import json
import os
from glob import glob

# Lookup table keyed by WAV file name (the loading loop indexes it with the
# file's base name); values are presumably the transcripts — verify against
# the dataset. The encoding is pinned so parsing does not depend on the locale.
with open('../speech/imda/output.json', encoding='utf-8') as fopen:
    data = json.load(fopen)

# glob returns paths in arbitrary, filesystem-dependent order; sort them so
# the batch (and therefore every row of the outputs) is ordered reproducibly.
wavs = sorted(glob('../speech/imda/*.WAV'))

In [8]:
vocab = malaya_speech.utils.text.TTS_SYMBOLS

# Pair every waveform with its encoded text; the text is looked up in `data`
# by the WAV file's base name.
batch = []
for wav_path in wavs:
    text = data[os.path.basename(wav_path)]
    audio, _ = malaya_speech.load(wav_path)
    encoded = malaya_speech.utils.text.tts_encode(text, vocab)
    batch.append((audio, encoded))

In [9]:
from librosa.filters import mel as librosa_mel_fn

# Mel filterbank built with keyword arguments: librosa >= 0.10 made the
# parameters of `librosa.filters.mel` keyword-only, so the positional form
# raises a TypeError there, while the keyword form works on old and new
# releases alike.
melbank = librosa_mel_fn(
    sr=hparams.data.sampling_rate,
    n_fft=hparams.data.filter_length,
    n_mels=hparams.data.n_mel_channels,
    fmin=hparams.data.mel_fmin,
    fmax=hparams.data.mel_fmax,
)

# Constant tensor copy of the filterbank; `spec_to_mel` transposes it before
# the matmul.
MEL = tf.convert_to_tensor(melbank)

def dynamic_range_compression(x, C=1, clip_val=1e-5):
    """
    Log-compress `x` after clamping it from below.

    PARAMS
    ------
    x: tensor to compress
    C: compression factor, multiplied in before the logarithm
    clip_val: lower clamp applied to `x` so the logarithm stays finite
    """
    # The upper clamp is the tensor's own maximum, so only the lower bound
    # actually changes any value.
    upper = tf.reduce_max(x)
    clamped = tf.clip_by_value(x, clip_val, upper)
    return tf.log(clamped * C)


def dynamic_range_decompression(x, C=1):
    """
    Undo the log compression: exponentiate `x`, then divide by `C`.

    PARAMS
    ------
    x: log-compressed tensor
    C: compression factor used to compress
    """
    expanded = tf.exp(x)
    return expanded / C


def spectral_normalize(magnitudes):
    """Apply `dynamic_range_compression` with its default parameters."""
    return dynamic_range_compression(magnitudes)


def spectral_de_normalize_torch(magnitudes):
    """Apply `dynamic_range_decompression` with its default parameters.

    Despite the `_torch` suffix in the name, this operates on TensorFlow ops.
    """
    return dynamic_range_decompression(magnitudes)

def spectrogram_tf(audio_norm, filter_length, hop_length):
    """
    Magnitude spectrogram of a single waveform.

    PARAMS
    ------
    audio_norm: rank-1 waveform tensor (the padding spec below has one axis)
    filter_length: STFT frame length in samples
    hop_length: STFT frame step in samples

    Returns sqrt(|STFT|^2 + 1e-6) of the reflect-padded signal.
    """
    # Reflect-pad both ends by half of (frame length - hop) before framing.
    edge = int((filter_length - hop_length) / 2)
    padded_audio = tf.pad(audio_norm, [[edge, edge]], mode='reflect')
    stft_out = tf.signal.stft(
        padded_audio,
        frame_length=filter_length,
        frame_step=hop_length,
        fft_length=None,
        window_fn=tf.signal.hann_window,
        pad_end=False,
    )
    magnitude = tf.abs(stft_out)
    # The small constant under the square root keeps the result strictly positive.
    return tf.sqrt(magnitude ** 2 + 1e-6)

def spec_to_mel(spec):
    """Project a spectrogram through the transposed `MEL` filterbank, then log-compress it."""
    mel_basis_t = tf.transpose(MEL)
    projected = tf.matmul(spec, mel_basis_t)
    return spectral_normalize(projected)

In [10]:
# Graph inputs for the audio side: a rank-2 float batch and one int32 length
# per row (the spectrogram loop slices `X[i, :X_len[i]]`).
X = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, None], name='X_placeholder')
X_len = tf.compat.v1.placeholder(dtype=tf.int32, shape=[None], name='X_len_placeholder')

In [11]:
batch_size = tf.shape(X)[0]

# ---------------------------------------------------------------------------
# Pass 1: compute one linear spectrogram per utterance.
# Each row of X is trimmed to its true length first, so the spectrograms have
# different numbers of frames; hence infer_shape=False on `features`.
# ---------------------------------------------------------------------------
features = tf.TensorArray(dtype = tf.float32, size = batch_size, dynamic_size = True, infer_shape = False)
features_len = tf.TensorArray(dtype = tf.int32, size = batch_size)

init_state = (0, features, features_len)

def condition(i, features, features_len):
    """Loop while there are utterances left in the batch."""
    return i < batch_size

def body(i, features, features_len):
    """Store the spectrogram of utterance `i` and its frame count."""
    f = spectrogram_tf(X[i, :X_len[i]], hparams.data.filter_length, hparams.data.hop_length)
    f_len = tf.shape(f)[0]
    return i + 1, features.write(i, f), features_len.write(i, f_len)

_, features, features_len = tf.while_loop(condition, body, init_state)
features_len = features_len.stack()

# ---------------------------------------------------------------------------
# Pass 2: derive the mel spectrogram and zero-pad both representations along
# the frame axis to the longest utterance so they can be stacked into a batch.
# ---------------------------------------------------------------------------
padded_features_mel = tf.TensorArray(dtype = tf.float32, size = batch_size)
padded_features = tf.TensorArray(dtype = tf.float32, size = batch_size)
padded_lens = tf.TensorArray(dtype = tf.int32, size = batch_size)
maxlen = tf.reduce_max(features_len)

init_state = (0, padded_features_mel, padded_features, padded_lens)

def condition(i, padded_features_mel, padded_features, padded_lens):
    """Loop while there are utterances left in the batch."""
    return i < batch_size

def body(i, padded_features_mel, padded_features, padded_lens):
    """Pad the linear and mel spectrograms of utterance `i` to `maxlen` frames."""
    f = features.read(i)
    f_mel = spec_to_mel(f)
    len_f = tf.shape(f)[0]
    # The pad amount must come from the unpadded frame count. Computing it
    # from `f` after padding (as before) always gives 0 and leaves the mel
    # spectrogram unpadded.
    pad_frames = maxlen - len_f
    f = tf.pad(f, [[0, pad_frames], [0,0]])
    f_mel = tf.pad(f_mel, [[0, pad_frames], [0,0]])
    # Write the mel spectrogram into the mel array; previously the linear
    # spectrogram `f` was written to both arrays.
    return i + 1, padded_features_mel.write(i, f_mel), padded_features.write(i, f), padded_lens.write(i, len_f)

_, padded_features_mel, padded_features, padded_lens = tf.while_loop(condition, body, init_state)
padded_features_mel = padded_features_mel.stack()
padded_features = padded_features.stack()
padded_lens = padded_lens.stack()
# Restore the static shape information lost inside the while loops.
padded_lens.set_shape((None,))
padded_features_mel.set_shape((None, None, hparams.data.n_mel_channels))
padded_features.set_shape((None, None, spec_channels))
padded_features, padded_lens

(<tf.Tensor 'TensorArrayStack_2/TensorArrayGatherV3:0' shape=(?, ?, 513) dtype=float32>,
 <tf.Tensor 'TensorArrayStack_3/TensorArrayGatherV3:0' shape=(?,) dtype=int32>)

In [12]:
# Split the (audio, token ids) pairs into two parallel lists.
batch_x = [pair[0] for pair in batch]
batch_y = [pair[1] for pair in batch]
# Presumably pads the waveforms to a common length; with return_len=True the
# helper also returns a second value, bound here as the lengths fed to X_len.
x, x_len = malaya_speech.utils.padding.sequence_1d(batch_x, return_len = True)

In [13]:
# Same padding helper applied to the encoded text sequences; `y` and `y_len`
# are later fed to the T / T_lengths placeholders.
y, y_len = malaya_speech.utils.padding.sequence_1d(batch_y, return_len = True)

In [14]:
# Instantiate the VITS model with the vocabulary size, number of spectrogram
# bins, segment length in frames, and the `model` section of the hparams.
# NOTE(review): this rebinds `model`, shadowing the module imported earlier
# via `from malaya_speech.train.model.vits import model`.
model = vits.Model(len(vocab), spec_channels, segment_size, **hparams.model)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [15]:
# Graph inputs for the text side: a rank-2 int32 batch of token ids and one
# int32 length per row. Uses the same `tf.compat.v1.placeholder` spelling as
# the audio placeholders.
T = tf.compat.v1.placeholder(dtype=tf.int32, shape=[None, None])
T_lengths = tf.compat.v1.placeholder(dtype=tf.int32, shape=[None])

In [16]:
# Training-style forward pass: text ids plus the padded linear spectrograms.
# The last element of the result is itself a 6-tuple, unpacked in place.
(
    y_hat, l_length, attn, ids_slice, x_mask, z_mask,
    (z, z_p, m_p, logs_p, m_q, logs_q),
) = model(T, T_lengths, padded_features, padded_lens)


Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
(?, ?, 2) (?, ?, 1)
piecewise_rational_quadratic_transform_tf linear
(?,) (?,)
(?, ?, 2) (?, ?, 1)
piecewise_rational_quadratic_transform_tf linear
(?,) (?,)
(?, ?, 2) (?, ?, 1)
piecewise_rational_quadratic_transform_tf linear
(?,) (?,)
(?, ?, 2) (?, ?, 1)
piecewise_rational_quadratic_transform_tf linear
(?,) (?,)
(?, ?, 2) (?, ?, 1)
piecewise_rational_quadratic_transform_tf linear
(?,) (?,)
(?, ?, 2) (?, ?, 1)
piecewise_rational_quadratic_transform_tf linear
(?,) (?,)
(?, ?, 2) (?, ?, 1)
piecewise_rational_quadratic_transform_tf linear
(?,) (?,)
(?, ?, 2) (?, ?, 1)
piecewise_rational_quadratic_transform_tf linear
(?,) (?,)


In [17]:
# Inference graph: takes only the text placeholders (no spectrogram inputs).
# The bare expression below displays the resulting tuple of tensors.
outputs = model.infer(T, T_lengths)
outputs

(?, ?, 2) (?, ?, 1)
piecewise_rational_quadratic_transform_tf linear
(?,) (?,)
(?, ?, 2) (?, ?, 1)
piecewise_rational_quadratic_transform_tf linear
(?,) (?,)
(?, ?, 2) (?, ?, 1)
piecewise_rational_quadratic_transform_tf linear
(?,) (?,)


(<tf.Tensor 'generator/Tanh:0' shape=(?, ?, 1) dtype=float32>,
 <tf.Tensor 'transpose:0' shape=(?, ?, ?) dtype=float32>,
 <tf.Tensor 'ExpandDims:0' shape=(?, ?, 1) dtype=float32>,
 (<tf.Tensor 'residual_coupling_block/residual_coupling_layer/concat:0' shape=(?, ?, 192) dtype=float32>,
  <tf.Tensor 'add:0' shape=(?, ?, 192) dtype=float32>,
  <tf.Tensor 'MatMul:0' shape=(?, ?, 192) dtype=float32>,
  <tf.Tensor 'MatMul_1:0' shape=(?, ?, 192) dtype=float32>))

In [18]:
# Display only the first two inference outputs (the ones evaluated in the
# session run at the end of the notebook).
outputs[:2]

(<tf.Tensor 'generator/Tanh:0' shape=(?, ?, 1) dtype=float32>,
 <tf.Tensor 'transpose:0' shape=(?, ?, ?) dtype=float32>)

In [19]:
# Create a TF1 session and initialise every variable in the graph. No
# checkpoint is restored in this notebook, so the weights are whatever the
# initialisers produce.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [21]:
# Evaluate the training-path output and the first two inference outputs in a
# single run, feeding both the audio and the text placeholders.
o_ = sess.run(
    [y_hat, outputs[0], outputs[1]],
    feed_dict = {X: x, X_len: x_len, T: y, T_lengths: y_len},
)
o_[0].shape, o_[1].shape

((3, 8192, 1), (3, 34304, 1))