In [1]:
import os

# Expose only the GPU with index 1 to CUDA. Set here, before malaya_speech and
# TensorFlow are imported in the cells below.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [2]:
import sys

# Put the parent of the current working directory first on the import path.
# (A notebook has no __file__, so the location is derived from the working directory.)
SOURCE_DIR = os.path.dirname(os.getcwd())
sys.path.insert(0, SOURCE_DIR)

In [3]:
# !pip3 install pysptk

In [4]:
import malaya_speech
from pysptk import sptk
import numpy as np

In [5]:
# Scratch array of normally distributed samples; only its shape is inspected here.
x = np.random.normal(size = (10, 100, 2))
x.shape

(10, 100, 2)

In [6]:
import tensorflow as tf

# tf.compat.v1.enable_eager_execution()

In [7]:
# Load the two 'vggvox-v2' models used by get_speech below:
# - vggvox_v2: gender model, whose result is used as the lookup key into `freqs`;
# - speaker_model: speaker-vector model, whose output becomes the speaker embedding `v`.
vggvox_v2 = malaya_speech.gender.deep_model(model = 'vggvox-v2')
speaker_model = malaya_speech.speaker_vector.deep_model('vggvox-v2')

In [8]:
# [lo, hi] pitch-search bounds handed to sptk.rapt as min/max, keyed by the label
# returned by the gender model (presumably Hz - confirm against the pysptk docs).
# get_speech falls back to [50, 250] for any label not listed here.
freqs = {'female': [100, 600], 'male': [50, 250]}

In [9]:
from scipy.signal import get_window
from scipy import signal
import soundfile as sf
import random

# Sample rate shared by audio loading (get_speech), the high-pass filter design
# below, and the RAPT pitch tracker (get_f0).
sr = 22050

def butter_highpass(cutoff, fs, order=5):
    """Design a digital Butterworth high-pass filter.

    Args:
        cutoff: cutoff frequency, in the same unit as ``fs``.
        fs: sampling frequency of the signal to be filtered.
        order: filter order.

    Returns:
        Tuple ``(b, a)`` with the numerator and denominator coefficients
        produced by ``scipy.signal.butter``.
    """
    # scipy expects the cutoff as a fraction of the Nyquist frequency (fs / 2).
    relative_cutoff = cutoff / (0.5 * fs)
    numerator, denominator = signal.butter(
        order, relative_cutoff, btype='high', analog=False
    )
    return numerator, denominator

# Module-level coefficients of a 5th-order high-pass filter with cutoff 30 at sample
# rate `sr`; preprocess_wav reads these globals.
b, a = butter_highpass(30, sr, order=5)

In [10]:
from math import ceil

def speaker_normalization(f0, index_nonzero, mean_f0, std_f0):
    """Standardise the selected entries of an F0 track and clip them to [-3, 4].

    Args:
        f0: array of F0 values. It is not modified; a float copy is returned.
        index_nonzero: boolean mask (or index) selecting the entries to
            normalise. Entries outside the selection are returned unchanged.
        mean_f0: value subtracted from the selected entries.
        std_f0: value the centred entries are divided by. A value of exactly
            zero is treated as 1 so the result stays finite.

    Returns:
        Float copy of ``f0`` whose selected entries are
        ``clip((f0 - mean_f0) / std_f0, -3, 4)``.
    """
    # astype() already returns a new array, so the caller's data is untouched.
    f0 = f0.astype(float)
    # A track with a single selected frame (or a constant one) has zero spread;
    # dividing by it would yield NaN/inf that np.clip lets through.
    if std_f0 == 0:
        std_f0 = 1.0
    f0[index_nonzero] = np.clip((f0[index_nonzero] - mean_f0) / std_f0, -3, 4)
    return f0

def preprocess_wav(x):
    """High-pass filter a waveform, scale it by 0.96 and add tiny uniform noise.

    Args:
        x: 1-D waveform array.

    Returns:
        Filtered waveform. It is one sample longer than ``x`` when the length
        of ``x`` is a multiple of 256.
    """
    length_is_multiple_of_256 = x.shape[0] % 256 == 0
    if length_is_multiple_of_256:
        # Append one near-zero sample so the length is no longer a multiple of 256
        # (presumably to keep frame counts aligned with the 256-sample hop - confirm).
        x = np.concatenate((x, np.array([1e-06])), axis=0)

    # Forward-backward filtering with the module-level high-pass coefficients b, a.
    filtered = signal.filtfilt(b, a, x)

    # Uniform noise in [-0.5e-06, 0.5e-06); drawn from the global NumPy RNG,
    # so the output differs from call to call.
    dither = (np.random.uniform(size = filtered.shape[0]) - 0.5)*1e-06
    return filtered * 0.96 + dither

def get_f0(wav, lo, hi):
    """Extract an F0 track with RAPT and normalise its voiced frames.

    Args:
        wav: waveform array; it is cast to float32 and multiplied by 32768
            before being handed to the tracker.
        lo: lower pitch-search bound passed to ``sptk.rapt`` as ``min``.
        hi: upper pitch-search bound passed to ``sptk.rapt`` as ``max``.

    Returns:
        The array returned by ``speaker_normalization``: entries different
        from -1e10 are standardised with their own mean / standard deviation,
        the rest keep the value -1e10.
    """
    scaled = wav.astype(np.float32)*32768
    # Hop of 256 samples at the module-level sample rate `sr`; otype=2 selects the
    # tracker's output format (log-F0 per the pysptk docs - confirm).
    f0_rapt = sptk.rapt(scaled, sr, 256, min=lo, max=hi, otype=2)

    # -1e10 is treated as the "no pitch" marker; statistics use the other frames only.
    voiced = (f0_rapt != -1e10)
    voiced_values = f0_rapt[voiced]
    return speaker_normalization(
        f0_rapt, voiced, np.mean(voiced_values), np.std(voiced_values)
    )

def pad_seq(x, base = 8):
    """Zero-pad a 2-D array along axis 0 up to the next multiple of ``base``.

    Args:
        x: 2-D array (pad widths are given for exactly two axes).
        base: the padded length is the smallest multiple of ``base`` that is
            not less than ``x.shape[0]``.

    Returns:
        Tuple ``(padded, original_length)`` where ``original_length`` is
        ``x.shape[0]`` before padding.
    """
    n_rows = x.shape[0]
    n_blocks = ceil(float(n_rows) / base)
    n_extra = int(base * n_blocks) - n_rows
    assert n_extra >= 0
    # Pad only at the end of axis 0; the second axis is left as is.
    padded = np.pad(x, ((0, n_extra), (0, 0)), 'constant')
    return padded, n_rows

def get_speech(f, hop_size = 256):
    """Load one audio file and build the waveform, mel, F0 and speaker-vector inputs.

    Args:
        f: path of the audio file, loaded with ``malaya_speech.load`` at the
            module-level sample rate ``sr``.
        hop_size: number of waveform samples per feature frame, used to
            convert the random crop length and offset between samples and frames.

    Returns:
        Tuple ``(wav, mel, f0, v)``:
        - ``wav``: preprocessed waveform (cropped when the utterance is long);
        - ``mel``: output of ``universal_mel``, zero-padded along axis 0 to a
          multiple of 8 frames;
        - ``f0``: normalised F0 track with a trailing axis of size 1, padded
          the same way;
        - ``v``: speaker vector divided by its largest element.

    Note:
        Uses the global ``random`` module, so crops differ between calls.
    """
    audio, _ = malaya_speech.load(f, sr = sr)
    wav = preprocess_wav(audio)

    # Pitch-search bounds depend on the gender model's label for the raw audio;
    # labels missing from `freqs` fall back to [50, 250].
    lo, hi = freqs.get(vggvox_v2(audio), [50, 250])
    f0 = get_f0(wav, lo, hi)[..., np.newaxis]
    mel = malaya_speech.featurization.universal_mel(wav)

    # Random crop length in samples, then the equivalent number of frames.
    batch_max_steps = random.randint(16384, 77175)
    batch_max_frames = batch_max_steps // hop_size

    surplus_frames = len(mel) - batch_max_frames
    if surplus_frames > 0:
        # Crop all three streams to the same randomly placed window.
        first_frame = random.randint(0, surplus_frames)
        frame_window = slice(first_frame, first_frame + batch_max_frames)
        first_sample = first_frame * hop_size
        wav = wav[first_sample : first_sample + batch_max_steps]
        mel = mel[frame_window, :]
        f0 = f0[frame_window, :]

    mel = pad_seq(mel)[0]
    f0 = pad_seq(f0)[0]

    # The speaker vector is computed from the (possibly cropped) waveform.
    v = speaker_model([wav])[0]
    return wav, mel, f0, v / v.max()

In [11]:
# First example utterance. The path is relative to the notebook's working directory.
wav, mel, f0, v = get_speech('../speech/example-speaker/female.wav')

In [12]:
# Second example utterance, so that the batch built below holds two items.
wav_1, mel_1, f0_1, v_1 = get_speech('../speech/example-speaker/khalil-nooh.wav')

In [13]:
# Batch the two mel arrays, padding along axis 0, and keep each item's own length.
# The lengths vary between runs because get_speech crops at random.
mels, mel_lens = malaya_speech.padding.sequence_nd([mel, mel_1], dim = 0, return_len = True)
mels.shape, mel_lens

((2, 104, 80), [80, 104])

In [14]:
# Same batching for the F0 tracks; in the recorded run the lengths match mel_lens.
f0s, f0_lens = malaya_speech.padding.sequence_nd([f0, f0_1], dim = 0, return_len = True)
f0s.shape, f0_lens

((2, 104, 1), [80, 104])

In [15]:
# Batch the two speaker vectors (the recorded output shows shape (2, 512)).
vs = malaya_speech.padding.sequence_nd([v, v_1], dim = 0)
vs.shape

(2, 512)

In [16]:
# Shape check of the first utterance's F0 track: (frames, 1).
f0.shape

(80, 1)

In [17]:
# Graph inputs (TF1 placeholders), fed from the batches built above:
#   X     - mel features, last dimension 80 (fed with `mels`)
#   X_f0  - F0 track, last dimension 1 (fed with `f0s`)
#   len_X - one integer length per batch item (fed with `mel_lens`)
#   V     - speaker vectors, 512 values per item (fed with `vs`)
X = tf.placeholder(tf.float32, [None, None, 80])
X_f0 = tf.placeholder(tf.float32, [None, None, 1])
len_X = tf.placeholder(tf.int32, [None])
V = tf.placeholder(tf.float32, [None, 512])

# Disabled alternative: use the NumPy batches directly as constant tensors.
# X = tf.convert_to_tensor(mels.astype(np.float32))
# X_f0 = tf.convert_to_tensor(f0s.astype(np.float32))
# len_X = tf.convert_to_tensor(mel_lens)
# V = tf.convert_to_tensor(vs.astype(np.float32))

In [20]:
from malaya_speech.train.model.speechsplit import inference
from malaya_speech.train.model import speechsplit

In [21]:
# Build the SpeechSplit components from the package's default hyper-parameters.
hparams = speechsplit.hparams

# interplnr is applied to the concatenated mel + F0 features below; model and
# model_F0 are the two networks called on the resulting tensors.
interplnr = speechsplit.InterpLnr(hparams)
model = inference.Model(hparams)
model_F0 = inference.Model_F0(hparams)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [22]:
# Dense projection of the 512-d speaker vector down to hparams.dim_spk_emb units
# (the recorded tensor shape is (?, 128)).
bottleneck_speaker = tf.keras.layers.Dense(hparams.dim_spk_emb)
speaker_dim = bottleneck_speaker(V)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [23]:
# Join mel (80) and F0 (1) on the last axis and pass them, with the per-item lengths,
# through InterpLnr; the result keeps 81 channels.
x_f0_intrp = interplnr(tf.concat([X, X_f0], axis = -1), len_X)
x_f0_intrp.shape

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


TensorShape([Dimension(None), Dimension(None), Dimension(81)])

In [24]:
# Quantise the F0 channel (last channel) of the InterpLnr output, then re-attach it
# to the 80 mel channels: 80 + 257 = 337 channels in the recorded shapes.
f0_org_intrp = speechsplit.quantize_f0_tf(x_f0_intrp[:,:,-1])
x_f0_intrp_org = tf.concat((x_f0_intrp[:,:,:-1], f0_org_intrp), axis=-1)

In [25]:
# Quantised version of the F0 input taken straight from the placeholder
# (it does not pass through InterpLnr).
f0_org = speechsplit.quantize_f0_tf(X_f0[:,:,0])

In [26]:
# Display the symbolic tensors built so far to check their static shapes.
f0_org_intrp, x_f0_intrp_org, X, speaker_dim, f0_org

(<tf.Tensor 'Reshape_1:0' shape=(?, ?, 257) dtype=float32>,
 <tf.Tensor 'concat_1:0' shape=(?, ?, 337) dtype=float32>,
 <tf.Tensor 'Placeholder:0' shape=(?, ?, 80) dtype=float32>,
 <tf.Tensor 'dense_2/BiasAdd:0' shape=(?, 128) dtype=float32>,
 <tf.Tensor 'Reshape_3:0' shape=(?, ?, 257) dtype=float32>)

In [27]:
# Main network: takes the mel + quantised-F0 features, the mel placeholder and the
# projected speaker embedding; returns five tensors whose static shapes are shown below.
codes_x, codes_f0, codes_2, encoder_outputs, mel_outputs = model(x_f0_intrp_org, X, speaker_dim)
codes_x.shape, codes_f0.shape, codes_2.shape, encoder_outputs.shape, mel_outputs.shape

Tensor("speechsplit/Encoder_7/strided_slice:0", shape=(?, ?, 80), dtype=float32) Tensor("speechsplit/Encoder_7/strided_slice_1:0", shape=(?, ?, 257), dtype=float32)
Tensor("speechsplit/Encoder_7/strided_slice_4:0", shape=(?, ?, 512), dtype=float32) Tensor("speechsplit/Encoder_7/strided_slice_5:0", shape=(?, ?, 256), dtype=float32)
Tensor("speechsplit/Encoder_7/strided_slice_8:0", shape=(?, ?, 512), dtype=float32) Tensor("speechsplit/Encoder_7/strided_slice_9:0", shape=(?, ?, 256), dtype=float32)
(?, ?, 2)


(TensorShape([Dimension(None), Dimension(None), Dimension(16)]),
 TensorShape([Dimension(None), Dimension(None), Dimension(64)]),
 TensorShape([Dimension(None), Dimension(None), Dimension(2)]),
 TensorShape([Dimension(None), Dimension(None), Dimension(210)]),
 TensorShape([Dimension(None), Dimension(None), Dimension(80)]))

In [28]:
# F0 network: only the last of its four outputs is kept (257 channels in the recorded shape).
# NOTE(review): assigning to `_` shadows IPython's last-output variable in this kernel.
_, _, _, f0_target = model_F0(X, f0_org)
f0_target

(?, ?, 2)


<tf.Tensor 'speechsplit_f0/Decoder_4/LinearNorm/dense_1/BiasAdd:0' shape=(?, ?, 257) dtype=float32>

In [29]:
# Open a TF1 session and run the variable initializers. No checkpoint is restored in
# the cells shown, so the runs below are shape / wiring checks only.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [30]:
# Smoke test: evaluate the InterpLnr output on the two-utterance batch.
o = sess.run([x_f0_intrp], feed_dict = {
    X: mels, X_f0: f0s, len_X: mel_lens, V: vs
})

In [None]:
# Smoke test: evaluate all five outputs of the main network (overwrites `o`).
o = sess.run([codes_x, codes_f0, codes_2, encoder_outputs, mel_outputs], feed_dict = {
    X: mels, X_f0: f0s, len_X: mel_lens, V: vs
})

In [None]:
# Smoke test: evaluate the F0 network's output and show its concrete shape (overwrites `o`).
o = sess.run([f0_target], feed_dict = {
    X: mels, X_f0: f0s, len_X: mel_lens, V: vs
})
o[0].shape

In [None]:
# List every trainable variable created in the default graph.
tf.trainable_variables()

In [None]:
# Saver created with no arguments (TF1 default var_list).
saver = tf.train.Saver()

In [None]:
# Write a throw-away checkpoint under ./test; the next cell lists and deletes it.
saver.save(sess, 'test/model.ckpt')

In [None]:
# Show the size of the checkpoint files written above, then remove the whole directory.
!ls -lh test
!rm -rf test