In [1]:
import os

# Restrict CUDA to device index 1. This is set here, before TensorFlow is
# imported in a later cell.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [2]:
import sys

# Parent directory of the current working directory; put it first on
# sys.path so imports resolve against that checkout.
SOURCE_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.insert(0, SOURCE_DIR)

In [3]:
import malaya_speech
import tensorflow as tf
from malaya_speech.train.model import fastsplit, fastspeech
import malaya_speech.augmentation.waveform as augmentation
import numpy as np






The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [4]:
from glob import glob

# glob() returns matches in arbitrary, filesystem-dependent order; sort so the
# speaker file list is identical on every machine and on every run.
wavs = sorted(glob('../speech/example-speaker/*.wav'))
len(wavs)

8

In [5]:
import random

# Sampling rate handed to `malaya_speech.load` and
# `augmentation.random_sampling` by the helpers below.
sr = 22050
# Number of speaker channels: passed as `C` to the model and the loss, used as
# axis 1 of the `Y` placeholder and as the multiplier for the hidden sizes.
speakers_size = 4

# noise = malaya_speech.load('noise.wav', sr = sr)[0]

def read_wav(f):
    """Load audio file `f` through `malaya_speech.load` at the notebook-wide rate `sr`.

    Returns whatever `malaya_speech.load` returns; callers in this notebook
    take element `[0]` of it as the waveform.
    """
    loaded = malaya_speech.load(f, sr = sr)
    return loaded

def random_sampling(s, length):
    """Forward `s` to `augmentation.random_sampling` with the notebook-wide `sr`.

    `length` is passed through unchanged; its unit is defined by
    `augmentation.random_sampling`.
    """
    sampler = augmentation.random_sampling
    return sampler(s, sr = sr, length = length)

def _noise_fill(count, pad_value):
    """Return `count` consecutive samples drawn from the 1-D array `pad_value`.

    When `pad_value` is shorter than `count` it is repeated end-to-end; otherwise
    a window starting at a random offset (via `np.random.randint`) is taken.
    """
    if count > len(pad_value):
        repeats = int(np.ceil(count / len(pad_value)))
        filler = np.tile(pad_value, repeats)
    else:
        offset = np.random.randint(0, len(pad_value) - count + 1)
        filler = pad_value[offset:]
    return filler[:count]


def add_padding(sample, pad, pad_value):
    """Pad a 1-D `sample` on both sides with content taken from `pad_value`.

    Parameters
    ----------
    sample : 1-D array-like
        Signal to pad.
    pad : (int, int)
        Number of samples to add on the left and on the right.
    pad_value : 1-D array-like
        Source signal the padding is cut from (tiled when too short).

    Returns
    -------
    np.ndarray
        New array of length `pad[0] + len(sample) + pad[1]` with the dtype of
        `sample`.

    Raises
    ------
    ValueError
        If padding is requested but `pad_value` is empty.
    """
    sample = np.asarray(sample)
    pad_value = np.asarray(pad_value)
    left_count, right_count = pad[0], pad[1]

    if (left_count or right_count) and len(pad_value) == 0:
        raise ValueError('pad_value must not be empty when padding is requested')

    # The pieces are joined explicitly instead of handing arrays to
    # `np.pad(constant_values=...)`: that argument is meant for scalars, and a
    # (array, scalar) pair is a ragged sequence that recent NumPy rejects.
    pieces = []
    if left_count:
        pieces.append(_noise_fill(left_count, pad_value))
    pieces.append(sample)
    if right_count:
        pieces.append(_noise_fill(right_count, pad_value))

    # Keep the dtype of `sample`, as in-place padding of `sample` would.
    return np.concatenate(pieces).astype(sample.dtype, copy = False)

# def combine_speakers(files, n = 5, limit = 4):
#     w_samples = random.sample(files, n)
#     w_samples = [
#         random_sampling(
#             read_wav(f)[0],
#             length = random.randint(1500, max(10000 // n, 6000)),
#         )
#         for f in w_samples
#     ]
#     y = [w_samples[0]]
#     left = w_samples[0].copy()

#     combined = None

#     for i in range(1, n):
#         right = w_samples[i].copy()
#         overlap = random.uniform(0.1, 0.8)
#         print(i, overlap)
#         len_overlap = int(overlap * len(right))
#         minus = len(left) - len_overlap
#         if minus < 0:
#             minus = 0
#         padded_right = np.pad(right, (minus, 0))
#         padded_right_noise = add_padding(right, (minus, 0), noise)
#         left = np.pad(left, (0, len(padded_right) - len(left)))
#         left = left + padded_right

#         if i >= (limit - 1):
#             if combined is None:
#                 combined = padded_right_noise
#             else:
#                 combined = np.pad(
#                     combined, (0, len(padded_right) - len(combined))
#                 )
#                 combined += padded_right

#         else:
#             print(len(padded_right_noise))
#             y.append(padded_right_noise)

#     if combined is not None:
#         print(len(combined))
#         y.append(combined)

#     for i in range(len(y)):
#         if len(y[i]) != len(left):
#             y[i] = add_padding(y[i], (0, len(left) - len(y[i])), noise)
#             y[i] = y[i] / np.max(np.abs(y[i]))

#     left = left / np.max(np.abs(left))
#     return left, y

def _peak_normalize(signal):
    """Scale `signal` so its largest absolute value is 1; return it unchanged if empty or all-zero."""
    peak = np.max(np.abs(signal)) if len(signal) else 0
    if peak > 0:
        return signal / peak
    return signal


def combine_speakers(files, n = 5, limit = 4, verbose = True):
    """Build a random, partially overlapping mixture of `n` speakers and its targets.

    `n` files are drawn from `files` without replacement. Each is loaded with
    `read_wav`, cropped with `random_sampling`, and (except the first target)
    scaled by a random gain in [0.5, 1.0]. Every following speaker is placed so
    that a random 10%-90% of its own length overlaps the end of the running
    mixture.

    Parameters
    ----------
    files : sequence
        Audio file paths; must contain at least `n` entries
        (`random.sample` raises `ValueError` otherwise).
    n : int
        Number of speakers to mix.
    limit : int
        Number of target sources wanted. The first `limit - 1` speakers each
        get their own target; all remaining speakers are summed into one
        final target. All-zero targets are appended until there are `limit`.
    verbose : bool
        When True (default) print `index, overlap ratio, length` for every
        added speaker.

    Returns
    -------
    (left, y)
        `left` is the peak-normalised mixture. `y` is a list of targets, each
        with the same length as `left`.
    """
    chosen = random.sample(files, n)
    # NOTE(review): the unit of `length` is defined by
    # `augmentation.random_sampling` (presumably milliseconds) - confirm.
    w_samples = [
        random_sampling(
            read_wav(f)[0],
            length = min(
                random.randint(10000 // n, 20000 // n), 10000
            ),
        )
        for f in chosen
    ]
    y = [w_samples[0]]
    left = w_samples[0].copy() * random.uniform(0.5, 1.0)

    # Sum of every speaker from index `limit - 1` onwards.
    combined = None

    for i in range(1, n):
        right = w_samples[i].copy() * random.uniform(0.5, 1.0)
        overlap = random.uniform(0.1, 0.9)
        if verbose:
            print(i, overlap, len(right))
        len_overlap = int(overlap * len(right))
        # Offset of this speaker inside the mixture, clamped at the start.
        minus = max(len(left) - len_overlap, 0)

        padded_right = np.pad(right, (minus, 0))
        # `padded_right` is never shorter than the mixture, so this only grows it.
        left = np.pad(left, (0, len(padded_right) - len(left)))
        left = left + padded_right

        if i >= (limit - 1):
            if combined is None:
                combined = padded_right
            else:
                combined = np.pad(
                    combined, (0, len(padded_right) - len(combined))
                )
                combined = combined + padded_right
        else:
            y.append(padded_right)

    if combined is not None:
        y.append(combined)

    for i in range(len(y)):
        if len(y[i]) != len(left):
            # Extend short targets to the mixture length with a small constant.
            y[i] = np.pad(y[i], (0, len(left) - len(y[i])), constant_values = 0.0005)
        # Normalise every target, not only the ones that needed padding, so
        # all targets share the same peak scale; silent targets are left as-is.
        y[i] = _peak_normalize(y[i])

    left = _peak_normalize(left)

    while len(y) < limit:
        y.append(np.zeros((len(left))))

    return left, y

# y, _ = malaya_speech.load('../speech/example-speaker/husein-zolkepli.wav')
# y = np.expand_dims(y, 0).astype(np.float32)
# y.shape

In [6]:
# Mix 6 randomly chosen speakers, then show the mixture length divided by the
# sampling rate and the number of target sources returned.
left, y = combine_speakers(wavs, 6)
len(left) / sr, len(y)

1 0.611060484845585 59466
2 0.28415860403694804 71170
3 0.5442890554839764 59290
4 0.7123869307136937 63272
5 0.5820755067972073 39842


(9.191428571428572, 4)

In [7]:
# Mel features for the mixture and, in the same order, for every target source.
left_mel = malaya_speech.featurization.universal_mel(left)
y_mel = list(map(malaya_speech.featurization.universal_mel, y))

In [8]:
dim = 192
# Work on a copy: assigning into `malaya_speech.config.fastspeech_config`
# itself would silently change the library-level settings for everything else
# that reads them in this kernel.
config = dict(malaya_speech.config.fastspeech_config)
config['encoder_hidden_size'] = dim * speakers_size
config['decoder_hidden_size'] = dim * speakers_size
config = fastspeech.Config(vocab_size = 1, **config)

In [9]:
# Build the fastsplit model from `config`, passing `dim` as `O` and
# `speakers_size` as `C` (the hidden sizes in `config` are dim * speakers_size).
model = fastsplit.Model(config, O = dim, C = speakers_size)


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [10]:
# Graph inputs (TF1 placeholders):
#   X       - float32 [None, None, 80]; fed below with a batch holding the mixture mel
#   Y       - float32 [None, speakers_size, None, 80]; fed below with the list of target mels
#   lengths - int32 [None]; fed below with len(left_mel) for each batch item
X = tf.placeholder(tf.float32, [None, None, 80])
Y = tf.placeholder(tf.float32, [None, speakers_size, None, 80])
lengths = tf.placeholder(tf.int32, [None])
# Model outputs computed from the mixture features and their lengths.
outputs = model(X, lengths)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [11]:
# Loss tensor built by `fastsplit.calculate_loss` from the targets `Y`, the
# model `outputs` and `lengths`, for `speakers_size` channels.
loss = fastsplit.calculate_loss(Y, outputs, lengths, C = speakers_size)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



In [12]:
# Display the loss tensor to inspect its shape and dtype.
loss

<tf.Tensor 'Mean:0' shape=() dtype=float32>

In [13]:
# Open an interactive TF1 session and initialise every variable of the graph
# (randomly initialised weights; nothing is restored from a checkpoint here).
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

In [14]:
%%time

# Time one forward pass on a batch of one: the mixture mel and its length.
# (`%%time` has to stay on the first line of the cell.)
o = sess.run(outputs, feed_dict = {X: [left_mel], lengths: [len(left_mel)]})

CPU times: user 2.61 s, sys: 136 ms, total: 2.74 s
Wall time: 705 ms


In [15]:
# Evaluate the loss for the same single-item batch, now also feeding the
# per-source target mels as `Y`.
sess.run(loss, feed_dict = {X: [left_mel], Y: [y_mel], lengths: [len(left_mel)]})

5.0078783

In [16]:
# Saver created with default arguments.
saver = tf.train.Saver()

In [17]:
# Write a checkpoint of the current session under ./test (removed again at the
# end of the notebook).
saver.save(sess, 'test/model.ckpt')

'test/model.ckpt'

In [18]:
# List the checkpoint files with their sizes.
!ls -lh test

total 432936
-rw-r--r--  1 huseinzolkepli  staff    77B Mar  7 00:12 checkpoint
-rw-r--r--  1 huseinzolkepli  staff   181M Mar  7 00:12 model.ckpt.data-00000-of-00001
-rw-r--r--  1 huseinzolkepli  staff   5.6K Mar  7 00:12 model.ckpt.index
-rw-r--r--  1 huseinzolkepli  staff    25M Mar  7 00:12 model.ckpt.meta


In [19]:
# Clean up: delete the temporary checkpoint directory written above.
!rm -rf test