In [1]:
import os

# Expose only GPU index 1 to this process. The variable is set here, before
# TensorFlow is imported in a later cell.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [2]:
import sys

# Put the parent of the current working directory at the front of the import
# path (presumably so the local checkout of `malaya_speech` is importable —
# confirm against the repo layout).
# `os.getcwd()` is used directly: `__name__` is a module name ("__main__" in a
# notebook), not a file path, so resolving it with `abspath` only produced the
# working directory by accident.
SOURCE_DIR = os.path.dirname(os.getcwd())
sys.path.insert(0, SOURCE_DIR)

In [3]:
# !pip3 install pysptk

In [4]:
import malaya_speech
from pysptk import sptk
import numpy as np

In [5]:
import tensorflow as tf

# tf.compat.v1.enable_eager_execution()

In [6]:
# Load two malaya_speech 'vggvox-v2' models: a gender model (get_speech uses its
# output to look up the F0 search range in `freqs`) and a speaker-vector model
# (get_speech uses its output as the speaker embedding).
vggvox_v2 = malaya_speech.gender.deep_model(model = 'vggvox-v2')
speaker_model = malaya_speech.speaker_vector.deep_model('vggvox-v2')

In [7]:
# [min, max] F0 search bounds passed to the RAPT pitch tracker, keyed by the
# label returned by the gender model (presumably in Hz — confirm against pysptk).
freqs = {'female': [100, 600], 'male': [50, 250]}

In [8]:
from scipy.signal import get_window
from scipy import signal
import soundfile as sf
# Sample rate used for the high-pass filter design and handed to the pitch tracker.
sr = 22050

def butter_highpass(cutoff, fs, order=5):
    """Design a digital Butterworth high-pass filter.

    Args:
        cutoff: cut-off frequency, in the same units as `fs`.
        fs: sampling frequency of the signal the filter will be applied to.
        order: filter order (default 5).

    Returns:
        The `(b, a)` numerator / denominator coefficient arrays.
    """
    # scipy expects the cut-off as a fraction of the Nyquist frequency.
    nyquist = 0.5 * fs
    return signal.butter(order, cutoff / nyquist, btype='high', analog=False)

# Module-level 5th-order high-pass coefficients (cut-off 30, sampling rate `sr`);
# preprocess_wav reads these globals.
b, a = butter_highpass(30, sr, order=5)

In [9]:
def speaker_normalization(f0, index_nonzero, mean_f0, std_f0):
    """Standardise the selected F0 frames and clip them to [-3, 4].

    Args:
        f0: array of F0 values. It is copied (as float) and never modified.
        index_nonzero: boolean mask / index selecting the frames to normalise;
            every other frame is returned unchanged.
        mean_f0: value subtracted from the selected frames.
        std_f0: value the centred frames are divided by.

    Returns:
        A new float array with the same shape as `f0`.
    """
    f0 = f0.astype(float).copy()
    # A zero spread (single selected frame, or perfectly constant pitch) would
    # otherwise produce 0/0 = NaN for every selected frame; divide by 1 instead
    # so those frames simply come out centred.
    scale = 1.0 if std_f0 == 0 else std_f0
    f0[index_nonzero] = np.clip((f0[index_nonzero] - mean_f0) / scale, -3, 4)
    return f0

def preprocess_wav(x):
    """High-pass filter a waveform, scale it by 0.96 and add tiny uniform noise.

    Args:
        x: waveform array; the padding step below concatenates a 1-D array
            along axis 0, so a 1-D signal is assumed — TODO confirm for
            multi-channel input.

    Returns:
        The filtered, scaled and dithered waveform. The noise comes from the
        global NumPy RNG, so the output is not deterministic unless it is seeded.
    """
    n_samples = x.shape[0]
    if n_samples % 256 == 0:
        # Lengths that are an exact multiple of 256 get one extra near-zero
        # sample (presumably to keep frame counts consistent with the
        # 256-sample hop used by the pitch tracker — confirm).
        x = np.concatenate((x, np.array([1e-06])), axis=0)
    # Zero-phase filtering with the module-level high-pass coefficients.
    filtered = signal.filtfilt(b, a, x)
    dither = (np.random.uniform(size = filtered.shape[0]) - 0.5)*1e-06
    return filtered * 0.96 + dither

def get_f0(wav, lo, hi):
    """Track pitch with RAPT and return the speaker-normalised F0 contour.

    Args:
        wav: waveform array; it is cast to float32 and multiplied by 32768
            before being passed to the tracker.
        lo: lower F0 search bound handed to RAPT as `min`.
        hi: upper F0 search bound handed to RAPT as `max`.

    Returns:
        The contour produced by `speaker_normalization`: frames different from
        the -1e10 sentinel are standardised with their own mean / std, the
        sentinel frames are left as they are.
    """
    scaled = wav.astype(np.float32)*32768
    # Hop size 256 at the module-level sample rate; otype=2 selects the
    # log-F0 output of pysptk (see the pysptk documentation).
    contour = sptk.rapt(scaled, sr, 256, min=lo, max=hi, otype=2)
    # Frames carrying the -1e10 sentinel are excluded from the statistics.
    voiced = (contour != -1e10)
    voiced_values = contour[voiced]
    return speaker_normalization(
        contour, voiced, np.mean(voiced_values), np.std(voiced_values)
    )

def get_speech(f):
    """Load one audio file and derive the inputs used by the model below.

    Args:
        f: path of the audio file, read with soundfile.

    Returns:
        A tuple `(wav, mel, f0, v)`:
        wav - the preprocessed waveform,
        mel - mel features, truncated to the first 192 frames,
        f0  - normalised F0 with a trailing axis of size 1, truncated to 192 frames,
        v   - speaker vector divided by its own maximum value.
    """
    # NOTE(review): the file's own sample rate `fs` is read but never compared
    # with the module-level `sr`, and no resampling happens here — confirm the
    # input files are already at the expected rate.
    raw, fs = sf.read(f)
    wav = preprocess_wav(raw)

    # The gender model sees the raw (unfiltered) signal; labels missing from
    # `freqs` fall back to the [50, 250] range.
    gender = vggvox_v2(raw)
    lo, hi = freqs.get(gender, [50, 250])
    print(lo, hi)

    f0 = np.expand_dims(get_f0(wav, lo, hi), -1)
    mel = malaya_speech.featurization.universal_mel(wav)

    speaker_vector = speaker_model([wav])[0]
    v = speaker_vector / speaker_vector.max()

    max_frames = 24 * 8
    return wav, mel[:max_frames], f0[:max_frames], v

In [10]:
# First example utterance; get_speech prints the F0 bounds it selected.
wav, mel, f0, v = get_speech('../speech/example-speaker/female.wav')

100 600


In [11]:
# Second example utterance, kept in separate `_1` variables for batching below.
wav_1, mel_1, f0_1, v_1 = get_speech('../speech/example-speaker/khalil-nooh.wav')

50 250


In [12]:
# Batch the two mel arrays with malaya_speech's padding helper (along dim 0)
# and keep the per-utterance lengths; they are later fed to the graph as len_X.
mels, mel_lens = malaya_speech.padding.sequence_nd([mel, mel_1], dim = 0, return_len = True)
mels.shape, mel_lens

((2, 192, 80), [192, 192])

In [13]:
# Same batching for the F0 contours; f0_lens is displayed but not used afterwards.
f0s, f0_lens = malaya_speech.padding.sequence_nd([f0, f0_1], dim = 0, return_len = True)
f0s.shape, f0_lens

((2, 192, 1), [192, 192])

In [14]:
# Batch the two speaker vectors into one array (fed to the graph as V).
vs = malaya_speech.padding.sequence_nd([v, v_1], dim = 0)
vs.shape

(2, 512)

In [15]:
# Graph inputs (TF1-style placeholders):
#   X     - mel features, last axis 80
#   X_f0  - F0 contour, last axis 1
#   len_X - one int32 length per batch item
#   V     - speaker vectors, last axis 512
X = tf.placeholder(tf.float32, [None, None, 80])
X_f0 = tf.placeholder(tf.float32, [None, None, 1])
len_X = tf.placeholder(tf.int32, [None])
V = tf.placeholder(tf.float32, [None, 512])

# Alternative kept for reference: build the inputs as constant tensors from the
# NumPy batches instead of placeholders.
# X = tf.convert_to_tensor(mels.astype(np.float32))
# X_f0 = tf.convert_to_tensor(f0s.astype(np.float32))
# len_X = tf.convert_to_tensor(mel_lens)
# V = tf.convert_to_tensor(vs.astype(np.float32))

In [16]:
from malaya_speech.train.model import speechsplit






The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [17]:
# Build the SpeechSplit components from the hyper-parameters shipped with the module.
hparams = speechsplit.hparams

interplnr = speechsplit.InterpLnr(hparams)
model = speechsplit.Model(hparams)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [18]:
# Dense projection of the 512-d speaker vector down to hparams.dim_spk_emb units.
bottleneck_speaker = tf.keras.layers.Dense(hparams.dim_spk_emb)
speaker_dim = bottleneck_speaker(V)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [19]:
# Join mel (80) and F0 (1) on the last axis (81 channels) and pass the result,
# together with the sequence lengths, through InterpLnr
# (presumably SpeechSplit's random resampling along time — confirm).
x_f0_intrp = interplnr(tf.concat([X, X_f0], axis = -1), len_X)
x_f0_intrp.shape

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


TensorShape([Dimension(None), Dimension(None), Dimension(81)])

In [20]:
# Quantise the F0 channel (the last one) and re-attach it to the remaining mel
# channels; the displayed result below has 337 channels on the last axis.
f0_org_intrp = speechsplit.quantize_f0_tf(x_f0_intrp[:,:,-1])
x_f0_intrp_org = tf.concat((x_f0_intrp[:,:,:-1], f0_org_intrp), axis=-1)

In [21]:
# Display the three tensors that are passed to the model in the next cell.
x_f0_intrp_org, X, speaker_dim

(<tf.Tensor 'concat_1:0' shape=(?, ?, 337) dtype=float32>,
 <tf.Tensor 'Placeholder:0' shape=(?, ?, 80) dtype=float32>,
 <tf.Tensor 'dense_1/BiasAdd:0' shape=(?, 128) dtype=float32>)

In [22]:
# Forward pass: the model takes the interpolated mel + quantised-F0 tensor, the
# original mel and the projected speaker embedding, and returns five tensors.
codes_x, codes_f0, codes_2, encoder_outputs, mel_outputs = model(x_f0_intrp_org, X, speaker_dim)
codes_x.shape, codes_f0.shape, codes_2.shape, encoder_outputs.shape, mel_outputs.shape

Tensor("speechsplit/Encoder_7/strided_slice:0", shape=(?, ?, 80), dtype=float32) Tensor("speechsplit/Encoder_7/strided_slice_1:0", shape=(?, ?, 257), dtype=float32)
Tensor("speechsplit/Encoder_7/strided_slice_3:0", shape=(?, ?, 512), dtype=float32) Tensor("speechsplit/Encoder_7/strided_slice_4:0", shape=(?, ?, 256), dtype=float32)
Tensor("speechsplit/Encoder_7/strided_slice_6:0", shape=(?, ?, 512), dtype=float32) Tensor("speechsplit/Encoder_7/strided_slice_7:0", shape=(?, ?, 256), dtype=float32)
(?, ?, 2)


(TensorShape([Dimension(None), Dimension(None), Dimension(16)]),
 TensorShape([Dimension(None), Dimension(None), Dimension(64)]),
 TensorShape([Dimension(None), Dimension(None), Dimension(2)]),
 TensorShape([Dimension(None), Dimension(None), Dimension(210)]),
 TensorShape([Dimension(None), Dimension(None), Dimension(80)]))

In [24]:
# Create a TF1 session and run the initializer for all global variables;
# no checkpoint is restored in this notebook.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [25]:
# Run the graph once on the two-utterance batch to check that it executes end to end.
o = sess.run([codes_x, codes_f0, codes_2, encoder_outputs, mel_outputs], feed_dict = {
    X: mels, X_f0: f0s, len_X: mel_lens, V: vs
})

In [26]:
# Displays every returned array in full; the output is large.
o

[array([[[-8.19223166e-01, -5.76501966e-01,  1.66250005e-01,
           2.92422056e-01, -1.48027539e-01,  8.34660828e-02,
           2.52451003e-01,  4.05756563e-01,  3.60189795e-01,
          -5.68357646e-01, -2.07107127e-01, -1.84687063e-01,
           4.12933022e-01, -5.76610267e-01,  8.73266906e-02,
          -3.61911990e-02],
         [-9.59383965e-01, -5.24995983e-01,  1.83395848e-01,
           2.69266605e-01, -3.33840787e-01,  3.66560549e-01,
           2.13176370e-01,  4.08696413e-01,  2.72756964e-01,
          -6.51961684e-01,  3.78961978e-03, -3.34770620e-01,
           2.24637538e-01, -6.40670657e-01, -7.67343640e-02,
          -3.50477934e-01],
         [-9.99995947e-01, -4.95920509e-01,  2.60428011e-01,
           3.63443077e-01, -2.81184524e-01,  3.51485938e-01,
           2.41491243e-01,  4.64207351e-01,  2.61643887e-01,
          -6.22841716e-01, -3.50799710e-01, -2.76235133e-01,
           2.93098390e-01, -5.66881537e-01, -2.43351147e-01,
          -4.73379403e-01],
 

In [27]:
# List the trainable variables created by the Dense layer and the SpeechSplit model.
tf.trainable_variables()

[<tf.Variable 'dense_1/kernel:0' shape=(512, 128) dtype=float32>,
 <tf.Variable 'dense_1/bias:0' shape=(128,) dtype=float32>,
 <tf.Variable 'speechsplit/Encoder_7/sequential/ConvNorm/conv1d/kernel:0' shape=(5, 80, 512) dtype=float32>,
 <tf.Variable 'speechsplit/Encoder_7/sequential/ConvNorm/conv1d/bias:0' shape=(512,) dtype=float32>,
 <tf.Variable 'speechsplit/Encoder_7/sequential/group_normalization/gamma:0' shape=(512,) dtype=float32>,
 <tf.Variable 'speechsplit/Encoder_7/sequential/group_normalization/beta:0' shape=(512,) dtype=float32>,
 <tf.Variable 'speechsplit/Encoder_7/sequential_1/ConvNorm/conv1d_1/kernel:0' shape=(5, 257, 256) dtype=float32>,
 <tf.Variable 'speechsplit/Encoder_7/sequential_1/ConvNorm/conv1d_1/bias:0' shape=(256,) dtype=float32>,
 <tf.Variable 'speechsplit/Encoder_7/sequential_1/group_normalization_1/gamma:0' shape=(256,) dtype=float32>,
 <tf.Variable 'speechsplit/Encoder_7/sequential_1/group_normalization_1/beta:0' shape=(256,) dtype=float32>,
 <tf.Variable '

In [28]:
# Saver created with default arguments, after the graph above has been built.
saver = tf.train.Saver()

In [29]:
# Write a throw-away checkpoint; the next cell lists the files to show their sizes.
saver.save(sess, 'test/model.ckpt')

'test/model.ckpt'

In [30]:
# Shell: list the checkpoint files that were just written.
!ls -lh test

total 156864
-rw-r--r--  1 huseinzolkepli  staff    77B Mar 14 22:27 checkpoint
-rw-r--r--  1 huseinzolkepli  staff    75M Mar 14 22:27 model.ckpt.data-00000-of-00001
-rw-r--r--  1 huseinzolkepli  staff   3.2K Mar 14 22:27 model.ckpt.index
-rw-r--r--  1 huseinzolkepli  staff   1.5M Mar 14 22:27 model.ckpt.meta


In [31]:
# Shell: delete the temporary checkpoint directory (path is relative to the working directory).
!rm -rf test