In [1]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [2]:
import sys

SOURCE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__name__)))
sys.path.insert(0, SOURCE_DIR)

In [3]:
import pickle

metadata = pickle.load(open('metadata.pkl', "rb"))

In [4]:
# import tensorflow as tf
# tf.compat.v1.enable_eager_execution()

In [5]:
import numpy as np
from math import ceil
import tensorflow as tf
from malaya_speech.train.model import fastspeech, fastvc2 as fastvc
import malaya_speech
import random






The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [6]:
speaker_model = malaya_speech.speaker_vector.deep_model('vggvox-v2')
sr = 22050

def generate(f, hop_size = 256):
    audio, _ = malaya_speech.load(f, sr = sr)
    mel = malaya_speech.featurization.universal_mel(audio)
    batch_max_steps = random.randint(16384, 110250)
    batch_max_frames = batch_max_steps // hop_size

    if len(mel) > batch_max_frames:
        interval_start = 0
        interval_end = len(mel) - batch_max_frames
        start_frame = random.randint(interval_start, interval_end)
        start_step = start_frame * hop_size
        mel = mel[start_frame : start_frame + batch_max_frames, :]
        audio = audio[start_step : start_step + batch_max_steps]

    audio_16k = malaya_speech.resample(audio, sr, 16000)
    v = speaker_model([audio_16k])

    return {
        'mel': mel,
        'mel_length': [len(mel)],
        'audio': audio,
        'v': v[0] * 30 - 3.5,
    }

In [7]:
out = generate('../speech/example-speaker/female.wav')

In [8]:
mel = tf.placeholder(tf.float32, (None, None, 80))
ori_vector = tf.placeholder(tf.float32, (None, 512))
target_vector = tf.placeholder(tf.float32, (None, 512))
mel_lengths = tf.placeholder(tf.int32, (None))

In [9]:
dim_neck = 32
dim_speaker = 512
dim_input = 80
config = malaya_speech.config.fastspeech_config
config['encoder_hidden_size'] = dim_speaker + dim_input
config['decoder_hidden_size'] = dim_speaker + dim_neck
config = fastspeech.Config(vocab_size = 1, **config)

In [11]:
model = fastvc.model.Model(dim_neck, config, dim_input, dim_speaker, 
                           skip = 6)


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [12]:
encoder_outputs, mel_before, mel_after, codes = model(mel, ori_vector, target_vector, mel_lengths)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [13]:
codes_ = model.call_second(mel_after, ori_vector, mel_lengths)

In [14]:
encoder_outputs, mel_before, mel_after, codes, codes_

(<tf.Tensor 'fastvc2/GatherV2:0' shape=(?, ?, 544) dtype=float32>,
 <tf.Tensor 'fastvc2/mel_before/BiasAdd:0' shape=(?, ?, 80) dtype=float32>,
 <tf.Tensor 'fastvc2/add:0' shape=(?, ?, 80) dtype=float32>,
 <tf.Tensor 'fastvc2/Encoder/encoder_dense/BiasAdd:0' shape=(?, ?, 32) dtype=float32>,
 <tf.Tensor 'Encoder/encoder_dense/BiasAdd:0' shape=(?, ?, 32) dtype=float32>)

In [15]:
loss_f = tf.losses.mean_squared_error
max_length = tf.cast(tf.reduce_max(mel_lengths), tf.int32)
mask = tf.sequence_mask(
    lengths = mel_lengths, maxlen = max_length, dtype = tf.float32
)
mask = tf.expand_dims(mask, axis = -1)
mask

<tf.Tensor 'ExpandDims:0' shape=<unknown> dtype=float32>

In [16]:
from functools import partial
from malaya_speech.train.loss import calculate_2d_loss, calculate_3d_loss

mse_mel = partial(
    loss_f,
    weights = mask
)

In [17]:
mel_loss_before = calculate_3d_loss(mel, mel_before, mse_mel)
mel_loss_before

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


<tf.Tensor 'mean_squared_error/value:0' shape=() dtype=float32>

In [18]:
mel_loss_after = calculate_3d_loss(mel, mel_after, mse_mel)
mel_loss_after

<tf.Tensor 'mean_squared_error_1/value:0' shape=() dtype=float32>

In [19]:
g_loss_cd = tf.losses.absolute_difference(codes, codes_)

In [20]:
loss = mel_loss_before + mel_loss_after + g_loss_cd

In [21]:
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

In [22]:
out['v']

array([-1.4128842 , -2.6862488 , -1.8681563 , -3.5       , -2.9106512 ,
       -3.5       , -2.3515642 , -0.32666993, -3.5       , -3.5       ,
       -3.5       , -3.5       , -0.64072704, -0.63027906, -2.298681  ,
       -3.2594745 , -3.5       , -3.5       , -2.2684307 , -1.2218156 ,
       -3.4247363 , -3.5       , -2.8968592 , -1.7511622 , -3.5       ,
       -3.5       , -1.9463224 , -2.3840632 , -2.2428694 , -2.4803588 ,
       -3.5       , -2.6251397 , -2.1627355 , -2.5082788 , -3.3075376 ,
       -3.5       , -3.0802298 , -1.6013491 , -3.5       , -3.5       ,
       -2.6971645 , -1.2823968 , -3.29696   , -0.7826619 , -1.2687757 ,
       -2.7185807 , -3.5       , -2.9811304 , -2.5543706 , -3.2090967 ,
       -3.5       , -0.8101382 , -2.1358407 , -2.285047  , -1.3020527 ,
       -2.8850653 , -3.5       , -0.2703488 , -3.5       , -1.5684094 ,
       -3.5       , -1.5791595 , -2.1582024 , -1.8104577 , -2.2746637 ,
       -2.2444797 , -3.5       , -2.945047  , -2.714364  , -2.57

In [23]:
o = sess.run([encoder_outputs, mel_before, mel_after], feed_dict = {mel: [out['mel']],
                                                                    ori_vector: [out['v']],
                                                                    target_vector: [out['v']],
                                                                    mel_lengths: [len(out['mel'])]})

In [24]:
o[0].shape, o[1].shape, o[2].shape

((1, 143, 544), (1, 143, 80), (1, 143, 80))

In [25]:
o = sess.run([mel_loss_before, mel_loss_after, g_loss_cd], feed_dict = {mel: [out['mel']],
                                                                    ori_vector: [out['v']],
                                                                    target_vector: [out['v']],
                                                                    mel_lengths: [len(out['mel'])]})

In [26]:
o

[2.3281944, 3.4384787, 0.8277846]

In [27]:
tf.trainable_variables()

[<tf.Variable 'fastvc2/Encoder/encoder/layer_._0/attention/self/query/kernel:0' shape=(592, 384) dtype=float32>,
 <tf.Variable 'fastvc2/Encoder/encoder/layer_._0/attention/self/query/bias:0' shape=(384,) dtype=float32>,
 <tf.Variable 'fastvc2/Encoder/encoder/layer_._0/attention/self/key/kernel:0' shape=(592, 384) dtype=float32>,
 <tf.Variable 'fastvc2/Encoder/encoder/layer_._0/attention/self/key/bias:0' shape=(384,) dtype=float32>,
 <tf.Variable 'fastvc2/Encoder/encoder/layer_._0/attention/self/value/kernel:0' shape=(592, 384) dtype=float32>,
 <tf.Variable 'fastvc2/Encoder/encoder/layer_._0/attention/self/value/bias:0' shape=(384,) dtype=float32>,
 <tf.Variable 'fastvc2/Encoder/encoder/layer_._0/attention/output/dense/kernel:0' shape=(384, 592) dtype=float32>,
 <tf.Variable 'fastvc2/Encoder/encoder/layer_._0/attention/output/dense/bias:0' shape=(592,) dtype=float32>,
 <tf.Variable 'fastvc2/Encoder/encoder/layer_._0/attention/output/LayerNorm/gamma:0' shape=(592,) dtype=float32>,
 <tf.V

In [28]:
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, "test/model.ckpt")

'test/model.ckpt'

In [29]:
!ls -lh test
!rm -rf test

total 314264
-rw-r--r--  1 huseinzolkepli  staff    77B May 20 00:43 checkpoint
-rw-r--r--  1 huseinzolkepli  staff   150M May 20 00:43 model.ckpt.data-00000-of-00001
-rw-r--r--  1 huseinzolkepli  staff   6.3K May 20 00:43 model.ckpt.index
-rw-r--r--  1 huseinzolkepli  staff   1.4M May 20 00:43 model.ckpt.meta
