In [1]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [2]:
import sys

SOURCE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__name__)))
sys.path.insert(0, SOURCE_DIR)

In [3]:
import pickle

metadata = pickle.load(open('metadata.pkl', "rb"))

In [4]:
# import tensorflow as tf
# tf.compat.v1.enable_eager_execution()

In [5]:
import numpy as np
from math import ceil
import tensorflow as tf
from malaya_speech.train.model import fastspeech, fastvc
import malaya_speech
import random






The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [6]:
speaker_model = malaya_speech.speaker_vector.deep_model('vggvox-v2')
sr = 22050

def generate(f, hop_size = 256):
    audio, _ = malaya_speech.load(f, sr = sr)
    mel = malaya_speech.featurization.universal_mel(audio)
    batch_max_steps = random.randint(16384, 110250)
    batch_max_frames = batch_max_steps // hop_size

    if len(mel) > batch_max_frames:
        interval_start = 0
        interval_end = len(mel) - batch_max_frames
        start_frame = random.randint(interval_start, interval_end)
        start_step = start_frame * hop_size
        mel = mel[start_frame : start_frame + batch_max_frames, :]
        audio = audio[start_step : start_step + batch_max_steps]

    audio_16k = malaya_speech.resample(audio, sr, 16000)
    v = speaker_model([audio_16k])

    return {
        'mel': mel,
        'mel_length': [len(mel)],
        'audio': audio,
        'v': v[0] * 30 - 3.5,
    }

In [21]:
out = generate('../speech/example-speaker/female.wav')

In [8]:
mel = tf.placeholder(tf.float32, (None, None, 80))
ori_vector = tf.placeholder(tf.float32, (None, 512))
target_vector = tf.placeholder(tf.float32, (None, 512))
mel_lengths = tf.placeholder(tf.int32, (None))

In [9]:
dim_neck = 32
dim_speaker = 512
config = malaya_speech.config.fastspeech_config
config['encoder_hidden_size'] = dim_speaker + 80
config['decoder_hidden_size'] = dim_speaker + dim_neck
config = fastspeech.Config(vocab_size = 1, **config)

In [10]:
model = fastvc.model.Model(dim_neck, config, use_position_embedding = False)


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [11]:
encoder_outputs, mel_before, mel_after, codes = model(mel, ori_vector, target_vector, mel_lengths)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [12]:
codes_ = model.call_second(mel_after, ori_vector, mel_lengths)

In [13]:
encoder_outputs, mel_before, mel_after, codes, codes_

(<tf.Tensor 'fastvc/concat:0' shape=(?, ?, 544) dtype=float32>,
 <tf.Tensor 'fastvc/mel_before/BiasAdd:0' shape=(?, ?, 80) dtype=float32>,
 <tf.Tensor 'fastvc/add:0' shape=(?, ?, 80) dtype=float32>,
 <tf.Tensor 'fastvc/Encoder/encoder_dense/BiasAdd:0' shape=(?, ?, 32) dtype=float32>,
 <tf.Tensor 'Encoder/encoder_dense/BiasAdd:0' shape=(?, ?, 32) dtype=float32>)

In [14]:
loss_f = tf.losses.mean_squared_error
max_length = tf.cast(tf.reduce_max(mel_lengths), tf.int32)
mask = tf.sequence_mask(
    lengths = mel_lengths, maxlen = max_length, dtype = tf.float32
)
mask = tf.expand_dims(mask, axis = -1)
mask

<tf.Tensor 'ExpandDims:0' shape=<unknown> dtype=float32>

In [15]:
from functools import partial
from malaya_speech.train.loss import calculate_2d_loss, calculate_3d_loss

mse_mel = partial(
    loss_f,
    weights = mask
)

In [16]:
mel_loss_before = calculate_3d_loss(mel, mel_before, mse_mel)
mel_loss_before

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


<tf.Tensor 'mean_squared_error/value:0' shape=() dtype=float32>

In [17]:
mel_loss_after = calculate_3d_loss(mel, mel_after, mse_mel)
mel_loss_after

<tf.Tensor 'mean_squared_error_1/value:0' shape=() dtype=float32>

In [18]:
g_loss_cd = tf.losses.absolute_difference(codes, codes_)

In [19]:
loss = mel_loss_before + mel_loss_after + g_loss_cd

In [20]:
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

In [23]:
out['v']

array([-1.5948188 , -2.5590937 , -1.6135337 , -3.5       , -2.3923047 ,
       -3.5       , -2.5054214 , -0.53937674, -3.5       , -3.1332235 ,
       -3.5       , -3.5       , -0.32021284, -1.0619524 , -3.0251315 ,
       -2.6596863 , -3.5       , -3.5       , -2.3167567 , -1.3759847 ,
       -3.5       , -3.5       , -3.5       , -1.7767136 , -3.5       ,
       -3.5       , -2.6126437 , -2.5392375 , -2.083774  , -2.317572  ,
       -3.5       , -3.1178448 , -2.4783316 , -1.9898603 , -3.157979  ,
       -3.5       , -2.1449203 , -0.5724082 , -3.5       , -3.5       ,
       -2.822224  , -1.4411256 , -3.1397915 , -0.9673116 , -1.7097532 ,
       -3.4196122 , -3.5       , -2.9842734 , -2.016067  , -2.4568152 ,
       -3.5       , -0.7693229 , -2.3576033 , -2.1704278 , -1.0863576 ,
       -3.1307166 , -3.5       , -0.27580285, -3.5       , -1.343863  ,
       -3.5       , -1.4165156 , -1.8104421 , -1.9786164 , -2.6418002 ,
       -1.2647204 , -3.5       , -2.6869736 , -2.4480016 , -2.82

In [24]:
o = sess.run([encoder_outputs, mel_before, mel_after], feed_dict = {mel: [out['mel']],
                                                                    ori_vector: [out['v']],
                                                                    target_vector: [out['v']],
                                                                    mel_lengths: [len(out['mel'])]})

In [25]:
o[0].shape, o[1].shape, o[2].shape

((1, 385, 544), (1, 385, 80), (1, 385, 80))

In [26]:
o = sess.run([mel_loss_before, mel_loss_after, g_loss_cd], feed_dict = {mel: [out['mel']],
                                                                    ori_vector: [out['v']],
                                                                    target_vector: [out['v']],
                                                                    mel_lengths: [len(out['mel'])]})

In [27]:
o

[3.2110395, 4.290681, 0.8723303]

In [28]:
tf.trainable_variables()

[<tf.Variable 'fastvc/Encoder/encoder/layer_._0/attention/self/query/kernel:0' shape=(592, 384) dtype=float32>,
 <tf.Variable 'fastvc/Encoder/encoder/layer_._0/attention/self/query/bias:0' shape=(384,) dtype=float32>,
 <tf.Variable 'fastvc/Encoder/encoder/layer_._0/attention/self/key/kernel:0' shape=(592, 384) dtype=float32>,
 <tf.Variable 'fastvc/Encoder/encoder/layer_._0/attention/self/key/bias:0' shape=(384,) dtype=float32>,
 <tf.Variable 'fastvc/Encoder/encoder/layer_._0/attention/self/value/kernel:0' shape=(592, 384) dtype=float32>,
 <tf.Variable 'fastvc/Encoder/encoder/layer_._0/attention/self/value/bias:0' shape=(384,) dtype=float32>,
 <tf.Variable 'fastvc/Encoder/encoder/layer_._0/attention/output/dense/kernel:0' shape=(384, 592) dtype=float32>,
 <tf.Variable 'fastvc/Encoder/encoder/layer_._0/attention/output/dense/bias:0' shape=(592,) dtype=float32>,
 <tf.Variable 'fastvc/Encoder/encoder/layer_._0/attention/output/LayerNorm/gamma:0' shape=(592,) dtype=float32>,
 <tf.Variable '

In [29]:
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, "test/model.ckpt")

'test/model.ckpt'

In [30]:
!ls -lh test
!rm -rf test

total 310368
-rw-r--r--  1 huseinzolkepli  staff    77B May 19 23:06 checkpoint
-rw-r--r--  1 huseinzolkepli  staff   150M May 19 23:06 model.ckpt.data-00000-of-00001
-rw-r--r--  1 huseinzolkepli  staff   6.3K May 19 23:06 model.ckpt.index
-rw-r--r--  1 huseinzolkepli  staff   1.4M May 19 23:06 model.ckpt.meta
