In [1]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [2]:
import sys

SOURCE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__name__)))
sys.path.insert(0, SOURCE_DIR)

In [3]:
import malaya_speech
import tensorflow as tf
from malaya_speech.train.model import swave
import malaya_speech.augmentation.waveform as augmentation
import numpy as np






The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [4]:
from glob import glob

wavs = glob('../speech/example-speaker/*.wav')
len(wavs)

8

In [5]:
import random

sr = 8000
speakers_size = 4

def read_wav(f):
    return malaya_speech.load(f, sr = sr)

def random_sampling(s, length):
    return augmentation.random_sampling(s, sr = sr, length = length)

def combine_speakers(files, n = 5, limit = 4):
    w_samples = random.sample(files, n)
    w_samples = [
        random_sampling(
            read_wav(f)[0],
            length = min(
                random.randint(10000 // n, 20000 // n), 10000
            ),
        )
        for f in w_samples
    ]
    y = [w_samples[0]]
    left = w_samples[0].copy() * random.uniform(0.5, 1.0)
    start, end = [], []
    start.append(0)
    end.append(len(left))

    combined = None

    for i in range(1, n):
        right = w_samples[i].copy() * random.uniform(0.5, 1.0)
        overlap = random.uniform(0.1, 0.9)
        print(i, overlap, len(right))
        len_overlap = int(overlap * len(right))
        minus = len(left) - len_overlap
        padded_right = np.pad(right, (minus, 0))
        start.append(minus)
        end.append(len(padded_right))
        left = np.pad(left, (0, len(padded_right) - len(left)))

        left = left + padded_right

        if i >= (limit - 1):
            if combined is None:
                combined = padded_right
            else:
                combined = np.pad(
                    combined, (0, len(padded_right) - len(combined))
                )
                combined += padded_right

        else:
            y.append(padded_right)

    if combined is not None:
        y.append(combined)

    for i in range(len(y)):
        if len(y[i]) != len(left):
            y[i] = np.pad(y[i], (0, len(left) - len(y[i])))
            y[i] = y[i] / np.max(np.abs(y[i]))

    left = left / np.max(np.abs(left))
    
    while len(y) < limit:
        y.append(np.zeros((len(left))))
        start.append(0)
        end.append(0)
        
    return left, y, start[:limit], end[:limit]

# y, _ = malaya_speech.load('../speech/example-speaker/husein-zolkepli.wav')
# y = np.expand_dims(y, 0).astype(np.float32)
# y.shape

In [6]:
left, y, start, end = combine_speakers(wavs, random.randint(1, len(wavs)))
len(left) / sr, len(y), start, end

1 0.11166437227593162 31436


(12.10075, 4, [0, 65370, 0, 0], [68880, 96806, 0, 0])

In [7]:
model = swave.Model(C = speakers_size, sample_rate = sr)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [8]:
X = tf.placeholder(tf.float32, [None, None])
Y = tf.placeholder(tf.float32, [None, speakers_size, None])
lengths = tf.placeholder(tf.int32, [None])
outputs, output_all = model(X)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use keras.layers.AveragePooling2D instead.
Instructions for updating:
Please use `layer.__call__` method instead.


In [9]:
outputs

[<tf.Tensor 'swave/cond/Merge:0' shape=(?, 4, ?) dtype=float32>,
 <tf.Tensor 'swave/cond_1/Merge:0' shape=(?, 4, ?) dtype=float32>,
 <tf.Tensor 'swave/cond_2/Merge:0' shape=(?, 4, ?) dtype=float32>,
 <tf.Tensor 'swave/cond_3/Merge:0' shape=(?, 4, ?) dtype=float32>,
 <tf.Tensor 'swave/cond_4/Merge:0' shape=(?, 4, ?) dtype=float32>,
 <tf.Tensor 'swave/cond_5/Merge:0' shape=(?, 4, ?) dtype=float32>]

In [10]:
loss = 0
for c_idx, est_src in enumerate(outputs):
    coeff = ((c_idx + 1) * (1 / len(outputs)))
    print(c_idx, est_src, coeff)
    sisnr_loss, snr, est_src = swave.calculate_loss(Y, est_src, lengths, C = speakers_size)
    loss += (coeff * sisnr_loss)
    
loss /= len(outputs)

0 Tensor("swave/cond/Merge:0", shape=(?, 4, ?), dtype=float32) 0.16666666666666666
Tensor("sub:0", shape=(?, 4, ?), dtype=float32) Tensor("sub_1:0", shape=(?, 4, ?), dtype=float32)

1 Tensor("swave/cond_1/Merge:0", shape=(?, 4, ?), dtype=float32) 0.3333333333333333
Tensor("sub_4:0", shape=(?, 4, ?), dtype=float32) Tensor("sub_5:0", shape=(?, 4, ?), dtype=float32)
2 Tensor("swave/cond_2/Merge:0", shape=(?, 4, ?), dtype=float32) 0.5
Tensor("sub_8:0", shape=(?, 4, ?), dtype=float32) Tensor("sub_9:0", shape=(?, 4, ?), dtype=float32)
3 Tensor("swave/cond_3/Merge:0", shape=(?, 4, ?), dtype=float32) 0.6666666666666666
Tensor("sub_12:0", shape=(?, 4, ?), dtype=float32) Tensor("sub_13:0", shape=(?, 4, ?), dtype=float32)
4 Tensor("swave/cond_4/Merge:0", shape=(?, 4, ?), dtype=float32) 0.8333333333333333
Tensor("sub_16:0", shape=(?, 4, ?), dtype=float32) Tensor("sub_17:0", shape=(?, 4, ?), dtype=float32)
5 Tensor("swave/cond_5/Merge:0", shape=(?, 4, ?), dtype=float32) 1.0
Tensor("sub_20:0", shape

In [11]:
est_src

<tf.Tensor 'swave/cond_5/Merge:0' shape=(?, 4, ?) dtype=float32>

In [12]:
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

In [13]:
%%time

o = sess.run(output_all, feed_dict = {X: [left, left]})

CPU times: user 1min 19s, sys: 26.5 s, total: 1min 45s
Wall time: 18.1 s


In [14]:
o = sess.run(outputs, feed_dict = {X: [left, left]})

In [15]:
[i.shape for i in o]

[(2, 4, 96806),
 (2, 4, 96806),
 (2, 4, 96806),
 (2, 4, 96806),
 (2, 4, 96806),
 (2, 4, 96806)]

In [21]:
sess.run(loss, feed_dict = {X: [left] * 3, Y: [y] * 3, lengths: [len(left)] * 3})

37.62079

In [17]:
saver = tf.train.Saver()

In [18]:
saver.save(sess, 'test/model.ckpt')

'test/model.ckpt'

In [19]:
!ls -lh test

total 71064
-rw-r--r--  1 huseinzolkepli  staff    77B Mar  1 22:50 checkpoint
-rw-r--r--  1 huseinzolkepli  staff    29M Mar  1 22:50 model.ckpt.data-00000-of-00001
-rw-r--r--  1 huseinzolkepli  staff    10K Mar  1 22:50 model.ckpt.index
-rw-r--r--  1 huseinzolkepli  staff   5.8M Mar  1 22:50 model.ckpt.meta


In [20]:
!rm -rf test