In [1]:
import os

# Expose only GPU index 1 to CUDA. This is set before tensorflow is imported
# further down, so the restriction is already in place when it loads.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [2]:
import sys

# `__name__` is used here where a script would use `__file__`: abspath()
# resolves it against the current working directory, so the two dirname()
# calls yield the parent of the working directory. That directory is put first
# on sys.path (presumably so a local malaya_speech checkout is imported instead
# of an installed copy — confirm).
SOURCE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__name__)))
sys.path.insert(0, SOURCE_DIR)

In [3]:
import malaya_speech
from pysptk import sptk
import numpy as np

In [4]:
import tensorflow as tf

# tf.compat.v1.enable_eager_execution()

In [5]:
# Load the two 'vggvox-v2' models used by get_speech below:
#   vggvox_v2     - gender model; its prediction selects the F0 search range.
#   speaker_model - speaker-vector model; produces the speaker embedding.
vggvox_v2 = malaya_speech.gender.deep_model(model = 'vggvox-v2')
speaker_model = malaya_speech.speaker_vector.deep_model('vggvox-v2')

In [6]:
# F0 search bounds [lo, hi] keyed by predicted gender label. get_speech looks
# the label up here (falling back to [50, 250]) and get_f0 forwards the pair to
# RAPT as `min`/`max` (presumably in Hz — confirm against pysptk).
freqs = {'female': [100, 600], 'male': [50, 250]}

In [7]:
from scipy.signal import get_window
from scipy import signal
import soundfile as sf
import random

# Sample rate shared by audio loading (get_speech), RAPT F0 extraction
# (get_f0) and the high-pass filter design below.
sr = 22050

def butter_highpass(cutoff, fs, order=5):
    """Design a digital Butterworth high-pass filter.

    Args:
        cutoff: Cutoff frequency, expressed in the same units as `fs`.
        fs: Sampling rate of the signal the filter will be applied to.
        order: Filter order.

    Returns:
        Tuple `(b, a)` holding the numerator and denominator coefficients
        produced by `scipy.signal.butter`.
    """
    nyquist = 0.5 * fs
    # scipy expects the critical frequency as a fraction of the Nyquist rate.
    numerator, denominator = signal.butter(
        order, cutoff / nyquist, btype='high', analog=False
    )
    return numerator, denominator

# Module-level coefficients of a 5th-order high-pass filter with cutoff 30
# (same units as `sr`); preprocess_wav applies them with filtfilt.
b, a = butter_highpass(30, sr, order=5)

In [8]:
from math import ceil

def speaker_normalization(f0, index_nonzero, mean_f0, std_f0):
    """Standardise the selected F0 entries and clip them to [-3, 4].

    Args:
        f0: numpy array of F0 values; it is not modified.
        index_nonzero: Index (as used here, a boolean mask) selecting the
            entries to normalise; all other entries are returned unchanged.
        mean_f0: Value subtracted from the selected entries.
        std_f0: Value the centred entries are divided by.

    Returns:
        A new float array with the same shape as `f0`.
    """
    # astype() already returns a fresh array, so the caller's data is untouched.
    normalized = f0.astype(float)
    z_scores = (normalized[index_nonzero] - mean_f0) / std_f0
    normalized[index_nonzero] = np.clip(z_scores, -3, 4)
    return normalized

def preprocess_wav(x):
    """High-pass filter a waveform, scale it by 0.96 and add tiny random noise.

    Args:
        x: 1-D numpy array of audio samples (indexed via `x.shape[0]`).

    Returns:
        The filtered waveform. It is one sample longer than `x` when the input
        length is an exact multiple of 256, because a 1e-06 sample is appended
        first in that case (presumably so frame counts of later features line
        up — confirm).
    """
    length_is_multiple_of_256 = x.shape[0] % 256 == 0
    if length_is_multiple_of_256:
        x = np.concatenate((x, np.array([1e-06])), axis=0)

    # Forward-backward filtering with the module-level coefficients `b`, `a`.
    filtered = signal.filtfilt(b, a, x)
    # Uniform noise in [-0.5e-06, 0.5e-06), drawn from the global numpy RNG.
    noise = (np.random.uniform(size = filtered.shape[0]) - 0.5)*1e-06
    return filtered * 0.96 + noise

def get_f0(wav, lo, hi):
    """Extract a RAPT F0 contour and normalise it over the voiced frames.

    The waveform is cast to float32 and scaled by 32768, then passed to
    `sptk.rapt` at the module-level sample rate `sr` with a hop of 256 and
    `otype=2` (log-F0 output per the pysptk documentation). Frames equal to
    -1e10 are treated as unvoiced and left untouched; the remaining frames are
    standardised with their own mean/std and clipped by speaker_normalization.

    Args:
        wav: 1-D numpy array of audio samples.
        lo: Lower F0 search bound forwarded to RAPT as `min`.
        hi: Upper F0 search bound forwarded to RAPT as `max`.

    Returns:
        Float array with one value per RAPT frame.
    """
    scaled = wav.astype(np.float32)*32768
    contour = sptk.rapt(scaled, sr, 256, min=lo, max=hi, otype=2)
    voiced = (contour != -1e10)
    voiced_values = contour[voiced]
    return speaker_normalization(
        contour, voiced, np.mean(voiced_values), np.std(voiced_values)
    )

def pad_seq(x, base = 8):
    """Zero-pad the first axis of a 2-D array up to the next multiple of `base`.

    Args:
        x: 2-D numpy array; only axis 0 is padded, and only at the end.
        base: The padded length is the smallest multiple of this value that is
            not shorter than `x.shape[0]`.

    Returns:
        Tuple `(padded, original_length)`.
    """
    original_length = x.shape[0]
    n_blocks = ceil(float(original_length) / base)
    extra_rows = int(base * n_blocks) - original_length
    assert extra_rows >= 0
    padded = np.pad(x, ((0, extra_rows), (0, 0)), 'constant')
    return padded, original_length

def get_speech(f, hop_size = 256):
    """Load one audio file and build the features used by the model.

    Steps, in order: load the audio at `sr`, run preprocess_wav, predict the
    gender on the raw audio to pick the F0 search range from `freqs`
    (default [50, 250]), extract normalised F0 and the mel features, randomly
    crop everything to a random length, then compute the speaker vector from
    the (possibly cropped) waveform.

    Args:
        f: Path of the audio file, passed to `malaya_speech.load`.
        hop_size: Samples per feature frame, used to convert between frame
            indices and sample indices when cropping.

    Returns:
        Tuple `(wav, mel, f0, v)`: the waveform, the mel features, the F0
        contour with a trailing axis of size 1, and the speaker vector divided
        by its maximum element.
    """
    x, _ = malaya_speech.load(f, sr = sr)
    wav = preprocess_wav(x)
    gender = vggvox_v2(x)
    lo, hi = freqs.get(gender, [50, 250])
    f0 = np.expand_dims(get_f0(wav, lo, hi), -1)
    mel = malaya_speech.featurization.universal_mel(wav)

    # Crop length is drawn in samples, then converted to whole frames.
    max_steps = random.randint(16384, 110250)
    max_frames = max_steps // hop_size

    n_frames = len(mel)
    if n_frames > max_frames:
        # Pick a random window of `max_frames` frames and cut the waveform at
        # the matching sample offset.
        first_frame = random.randint(0, n_frames - max_frames)
        first_step = first_frame * hop_size
        frames = slice(first_frame, first_frame + max_frames)
        wav = wav[first_step : first_step + max_steps]
        mel = mel[frames, :]
        f0 = f0[frames, :]

    embedding = speaker_model([wav])[0]
    v = embedding / embedding.max()
    return wav, mel, f0, v

In [9]:
# Build features for two example recordings. get_speech crops to a random
# length, so the resulting shapes differ from run to run.
wav, mel, f0, v = get_speech('../speech/example-speaker/female.wav')
wav_1, mel_1, f0_1, v_1 = get_speech('../speech/example-speaker/khalil-nooh.wav')

In [10]:
# Batch the two mel arrays: padded along axis 0 to a common length, with the
# original lengths returned alongside (see the displayed shape and lengths).
mels, mel_lens = malaya_speech.padding.sequence_nd([mel, mel_1], dim = 0, return_len = True)
mels.shape, mel_lens

((2, 412, 80), [412, 136])

In [11]:
# Same batching for the F0 contours; the lengths are expected to match mel_lens.
f0s, f0_lens = malaya_speech.padding.sequence_nd([f0, f0_1], dim = 0, return_len = True)
f0s.shape, f0_lens

((2, 412, 1), [412, 136])

In [12]:
# Stack the two speaker vectors into one (batch, dim) array.
vs = malaya_speech.padding.sequence_nd([v, v_1], dim = 0)
vs.shape

(2, 512)

In [13]:
# Graph inputs (TF1 placeholders); fed below with mels, f0s, mel_lens and vs.
X = tf.placeholder(tf.float32, [None, None, 80])      # mel batch, 80 channels
X_f0 = tf.placeholder(tf.float32, [None, None, 1])    # F0 batch, 1 value per frame
len_X = tf.placeholder(tf.int32, [None])              # per-example lengths
V = tf.placeholder(tf.float32, [None, 512])           # speaker vectors, 512-dim

In [14]:
# from malaya_speech.train.model.fastspeechsplit import inference as fastspeechsplit
from malaya_speech.train.model import speechsplit, speechsplitformer































The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.









In [15]:
# Use the hyper-parameters shipped with the speechsplit module.
hparams = speechsplit.hparams
# Encoder and decoder each get their own copy of the library transformer
# config, with the activation switched to 'mish'.
encoder_config = malaya_speech.config.transformer_config.copy()
decoder_config = malaya_speech.config.transformer_config.copy()
encoder_config['activation'] = 'mish'
decoder_config['activation'] = 'mish'
# Instantiate the interpolation layer, the main model and the F0 model.
interplnr = speechsplit.InterpLnr(hparams)
model = speechsplitformer.Model(encoder_config, decoder_config, hparams)
model_F0 = speechsplitformer.Model_F0(encoder_config, decoder_config, hparams)







In [16]:
# Dense layer (no activation passed) projecting the 512-dim speaker vector to
# hparams.dim_spk_emb units.
bottleneck_speaker = tf.keras.layers.Dense(hparams.dim_spk_emb)
speaker_dim = bottleneck_speaker(V)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [17]:
# Join mel (80) and F0 (1) on the last axis and pass them, together with the
# lengths, through the interpolation layer; the bare expression shows the
# static shape of the result.
x_f0_intrp = interplnr(tf.concat([X, X_f0], axis = -1), len_X)
x_f0_intrp.shape

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


TensorShape([Dimension(None), Dimension(None), Dimension(81)])

In [18]:
# Quantize the last channel (the F0 channel) of the interpolated features, then
# re-attach the quantized F0 to the remaining (mel) channels.
f0_org_intrp = speechsplit.quantize_f0_tf(x_f0_intrp[:,:,-1])
x_f0_intrp_org = tf.concat((x_f0_intrp[:,:,:-1], f0_org_intrp), axis=-1)

In [19]:
# Quantize the F0 input as fed (not passed through the interpolation layer).
f0_org = speechsplit.quantize_f0_tf(X_f0[:,:,0])

In [20]:
# Display the intermediate tensors to inspect their static shapes.
f0_org_intrp, x_f0_intrp_org, X, speaker_dim, f0_org

(<tf.Tensor 'Reshape_1:0' shape=(?, ?, 257) dtype=float32>,
 <tf.Tensor 'concat_1:0' shape=(?, ?, 337) dtype=float32>,
 <tf.Tensor 'Placeholder:0' shape=(?, ?, 80) dtype=float32>,
 <tf.Tensor 'dense/BiasAdd:0' shape=(?, 128) dtype=float32>,
 <tf.Tensor 'Reshape_3:0' shape=(?, ?, 257) dtype=float32>)

In [21]:
# Build the main model graph. `o` holds the model's output tensors (a tuple of
# five tensors, per the displayed output).
o = model(x_f0_intrp_org, X, speaker_dim, len_X)
o







Instructions for updating:
Use `tf.cast` instead.


Instructions for updating:
Use `tf.cast` instead.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.














Instructions for updating:
Use `tf.cast` instead.


Instructions for updating:
Use `tf.cast` instead.








(<tf.Tensor 'speechsplit/encoder_7/Transformer/encode/encoder_dense_1/BiasAdd:0' shape=(?, ?, 8) dtype=float32>,
 <tf.Tensor 'speechsplit/encoder_7/Transformer/encode/encoder_dense_2/BiasAdd:0' shape=(?, ?, 32) dtype=float32>,
 <tf.Tensor 'speechsplit/encoder_t/Transformer/encode/encoder_dense/BiasAdd:0' shape=(?, ?, 1) dtype=float32>,
 <tf.Tensor 'speechsplit/concat:0' shape=(?, ?, 169) dtype=float32>,
 <tf.Tensor 'speechsplit/Decoder_3/Decoder_3/decode/self.linear_projection/BiasAdd:0' shape=(?, ?, 80) dtype=float32>)

In [22]:
# Build the F0 model graph; it returns four values and only the last one is kept.
_, _, _, f0_target = model_F0(X, f0_org, len_X)
f0_target

<tf.Tensor 'speechsplit_f0/Decoder_4/Decoder_4/decode/self.linear_projection/BiasAdd:0' shape=(?, ?, 257) dtype=float32>

In [23]:
# Create a TF1 session and run the variable initializer. No checkpoint is
# restored in the cells shown here.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [24]:
# Forward pass of the main model on the two-example batch built above.
o_ = sess.run(o, feed_dict = {
    X: mels, X_f0: f0s, len_X: mel_lens, V: vs
})

In [25]:
# Shapes of the five fetched outputs, in the order the model returned them.
o_[0].shape, o_[1].shape, o_[2].shape, o_[3].shape, o_[4].shape

((2, 412, 8), (2, 412, 32), (2, 412, 1), (2, 412, 169), (2, 412, 80))

In [26]:
# Forward pass of the F0 model on the same batch. The fetched arrays get their
# own name instead of being assigned to `o`: `o` still holds the main model's
# graph tensors, and overwriting it with numpy arrays would make the earlier
# `sess.run(o, ...)` cell fail when it is re-run.
f0_target_ = sess.run([f0_target], feed_dict = {
    X: mels, X_f0: f0s, len_X: mel_lens, V: vs
})
f0_target_[0].shape

(2, 412, 257)

In [None]:
# Write the session's variables to a checkpoint under ./test so the files can
# be listed in the next cell.
saver = tf.train.Saver()
saver.save(sess, 'test/model.ckpt')

In [None]:
# List the checkpoint files with their sizes, then delete the scratch directory.
!ls -lh test
!rm -rf test