In [1]:
import os

# Expose only GPU 0 to CUDA-backed libraries. This runs before TensorFlow is
# imported in the next cell, so the restriction is in place when it starts up.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [2]:
import tensorflow as tf
import malaya_speech.augmentation.waveform as augmentation
from malaya_speech.train.model import demucs, stft
import malaya_speech
from glob import glob
import random
import numpy as np
import IPython.display as ipd

# Make NumPy raise FloatingPointError for divide-by-zero, overflow, underflow
# and invalid operations instead of warning. As the last expression of the
# cell, the call's return value (the previous settings) is displayed.
np.seterr(all='raise')






The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.




{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [3]:
# Clean recordings used as training targets, in random order.
files = glob('../youtube/clean-wav/*.wav')
random.shuffle(files)

# Stems matched by filename under HHDS/Sources; kept as separate names so
# they stay available to other cells.
basses = glob('HHDS/Sources/**/*bass.wav', recursive = True)
drums = glob('HHDS/Sources/**/*drums.wav', recursive = True)
others = glob('HHDS/Sources/**/*other.wav', recursive = True)

# Everything that may be mixed in as background, assembled in one expression
# (same concatenation order as before) and then shuffled.
noises = (
    glob('../noise-44k/noise/*.wav')
    + glob('../noise-44k/clean-wav/*.wav')
    + basses
    + drums
    + others
)
random.shuffle(noises)

# Last expression of the cell so the pool sizes are actually displayed; an
# empty pool here means the glob patterns did not match anything.
len(files), len(noises)

In [4]:
def read_wav(f, sr = 44100):
    """Load an audio file through ``malaya_speech.load``.

    Parameters
    ----------
    f : str
        Path of the audio file to read.
    sr : int, optional
        Sample rate forwarded to the loader. Defaults to 44100, the rate used
        throughout this notebook.

    Returns
    -------
    Whatever ``malaya_speech.load`` returns; callers in this notebook take
    element ``[0]`` as the waveform.
    """
    return malaya_speech.load(f, sr = sr)


def random_sampling(s, length, sr = 44100):
    """Cut a random excerpt out of ``s`` via ``augmentation.random_sampling``.

    Parameters
    ----------
    s : array-like
        Waveform to sample from.
    length : int
        Excerpt length forwarded to the library helper (presumably in
        milliseconds -- TODO confirm against malaya_speech).
    sr : int, optional
        Sample rate of ``s``. Defaults to 44100, the rate used throughout
        this notebook.

    Returns
    -------
    The excerpt returned by ``augmentation.random_sampling``.
    """
    return augmentation.random_sampling(s, sr = sr, length = length)


def combine_speakers(files, n = 5):
    """Mix random excerpts from up to ``n`` files into one waveform.

    One excerpt is cut from each chosen file. The first excerpt starts the
    mix; every following excerpt is shifted right by a random fraction
    (0.01 to 1.25) of the current mix length and added on top, so tracks may
    overlap or be separated by silence. Each track is scaled by a random
    gain in [0.5, 1.0] before being added.

    Parameters
    ----------
    files : list of str
        Pool of audio paths to draw from; must not be empty.
    n : int, optional
        Number of files to combine. Values larger than ``len(files)`` are
        capped at the pool size instead of letting ``random.sample`` fail.

    Returns
    -------
    (mix, tracks)
        ``mix`` is the summed waveform. ``tracks`` lists the individual
        tracks: ``tracks[0]`` is the first excerpt as cut (no gain applied),
        the others are the gained, left-padded tracks as they were when
        added. Earlier entries are not re-padded when the mix grows, so the
        entries can differ in length.

    Raises
    ------
    ValueError
        If ``files`` is empty or ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError('combine_speakers needs n >= 1, got %r' % (n,))
    if not files:
        raise ValueError('combine_speakers needs a non-empty list of files')
    # random.sample raises ValueError when asked for more items than the
    # population holds; callers draw n at random, so cap it at the pool size.
    n = min(n, len(files))

    chosen = random.sample(files, n)
    clip_cap = 100_000 // n
    clips = []
    for path in chosen:
        wave = read_wav(path)[0]
        # Per-track length budget shrinks as more tracks are combined.
        clip_len = min(random.randint(20000 // n, 240_000 // n), clip_cap)
        clips.append(random_sampling(wave, length = clip_len))

    y = [clips[0]]
    left = clips[0].copy() * random.uniform(0.5, 1.0)
    for clip in clips[1:]:
        right = clip.copy() * random.uniform(0.5, 1.0)

        # Start offset of the new track, as a fraction of the current mix.
        overlap = random.uniform(0.01, 1.25)
        offset = int(overlap * len(left))
        padded_right = np.pad(right, (offset, 0))

        # Zero-pad whichever signal is shorter so the two can be summed.
        shortfall = len(left) - len(padded_right)
        if shortfall > 0:
            padded_right = np.pad(padded_right, (0, shortfall))
        else:
            left = np.pad(left, (0, -shortfall))

        y.append(padded_right)
        left = left + padded_right
    return left, y

def random_amplitude(sample, low = 3, high = 5):
    """Scale ``sample`` by a random gain and hard-clip the result to [-1, 1].

    The gain is drawn with ``np.random.uniform(low, high)``. The input array
    is not modified; a new array is returned.
    """
    gain = np.random.uniform(low = low, high = high)
    boosted = sample.copy() * gain
    return np.clip(boosted, -1, 1)

def random_amplitude_threshold(sample, low = 1, high = 2, threshold = 0.4):
    """Peak-normalise ``sample`` and amplify only its loud parts.

    The signal is divided by its peak absolute value (plus 1e-9 to avoid a
    division by zero). Samples whose magnitude is at least ``threshold`` are
    then multiplied by a gain drawn with ``np.random.uniform(low, high)``,
    and the result is clipped to [-1, 1]. The input array is not modified.
    """
    peak = np.max(np.abs(sample)) + 1e-9
    scaled = sample.copy() / peak
    gain = np.random.uniform(low = low, high = high)
    loud = np.abs(scaled) >= threshold
    scaled[loud] = scaled[loud] * gain
    return np.clip(scaled, -1, 1)


def calc(signal, seed, add_uniform = False):
    """Apply one randomly selected augmentation to ``signal``.

    The global ``random`` generator is re-seeded with ``seed`` first, so the
    same seed picks the same branch and the same ``random`` parameter draws.
    NumPy's generator, used inside ``random_amplitude_threshold``, is not
    re-seeded here.

    Branches (drawn uniformly from 0..6):
      0 -- ``sox_augment_high`` with ``negate = 1``
      1 -- ``sox_augment_high`` with ``negate = 0`` and a wider bass-gain range
      2 -- ``sox_augment_low`` with a random ``negate``
      3 -- ``sox_augment_combine``
      4 -- ``sox_reverb``
      5 -- ``random_amplitude_threshold``
      6 -- signal passed through unchanged

    Afterwards, except for branch 5, a second threshold-based amplification
    is applied when a gauss(0.5, 0.14) draw is >= 0.6. Uniform noise is added
    when ``add_uniform`` is true and another such draw is > 0.6.

    Returns the augmented signal.
    """
    random.seed(seed)

    mode = random.randint(0, 6)
    print('choice', mode)

    if mode == 0:
        out = augmentation.sox_augment_high(
            signal,
            min_bass_gain = random.randint(25, 50),
            reverberance = random.randint(0, 80),
            hf_damping = 10,
            room_scale = random.randint(0, 50),
            negate = 1,
        )
    elif mode == 1:
        out = augmentation.sox_augment_high(
            signal,
            min_bass_gain = random.randint(25, 70),
            reverberance = random.randint(0, 80),
            hf_damping = 10,
            room_scale = random.randint(0, 50),
            negate = 0,
        )
    elif mode == 2:
        out = augmentation.sox_augment_low(
            signal,
            min_bass_gain = random.randint(5, 30),
            reverberance = random.randint(0, 80),
            hf_damping = 10,
            room_scale = random.randint(0, 50),
            negate = random.randint(0, 1),
        )
    elif mode == 3:
        out = augmentation.sox_augment_combine(
            signal,
            min_bass_gain_high = random.randint(25, 70),
            min_bass_gain_low = random.randint(5, 30),
            reverberance = random.randint(0, 80),
            hf_damping = 10,
            room_scale = random.randint(0, 90),
        )
    elif mode == 4:
        out = augmentation.sox_reverb(
            signal,
            reverberance = random.randint(10, 80),
            hf_damping = 10,
            room_scale = random.randint(10, 90),
        )
    elif mode == 5:
        out = random_amplitude_threshold(
            signal, threshold = random.uniform(0.35, 0.8)
        )
    else:
        # mode == 6: leave the signal untouched.
        out = signal

    # Branch 5 already amplified by threshold; skip the draw entirely there
    # so the random stream is consumed exactly as the branch dictates.
    if mode != 5 and random.gauss(0.5, 0.14) >= 0.6:
        out = random_amplitude_threshold(
            out, low = 1.0, high = 2.0, threshold = random.uniform(0.6, 0.9)
        )

    # The gauss draw happens before the flag is checked, so it is consumed
    # whether or not noise is requested.
    if random.gauss(0.5, 0.14) > 0.6 and add_uniform:
        out = augmentation.add_uniform_noise(
            out, power = random.uniform(0.005, 0.015)
        )

    return out

In [5]:
def parallel(f):
    """Build one (mixture, clean target, residual) training example.

    The clean target is either a mix of 2-6 random files from ``files`` (when
    a gauss(0.5, 0.14) draw exceeds 0.6) or a random excerpt of ``f``; it is
    peak-normalised. The target is then augmented with ``calc``. With another
    draw above 0.6, a mix of 1-20 files from ``noises`` is augmented with the
    same seed and added through ``augmentation.add_noise``; otherwise the
    augmented signal is only peak-normalised.

    Note that ``calc`` re-seeds the global ``random`` generator, so every
    ``random`` draw made here after the first ``calc`` call is determined by
    ``seed``.

    Returns
    -------
    (combined, y, noise)
        ``combined`` is the network input, ``y`` the clean target and
        ``noise`` is ``combined - y``.
    """
    use_mixture = random.gauss(0.5, 0.14) > 0.6
    if use_mixture:
        speakers = random.sample(files, random.randint(2, 6))
        y, _ = combine_speakers(speakers, len(speakers))
    else:
        wave = read_wav(f)[0]
        y = random_sampling(wave, length = random.randint(30000, 100_000))

    y = y / (np.max(np.abs(y)) + 1e-9)

    seed = random.randint(0, 100_000_000)
    x = calc(y, seed)

    add_background = random.gauss(0.5, 0.14) > 0.6
    if add_background:
        print('add small noise')
        background = combine_speakers(noises, random.randint(1, 20))[0]
        background = calc(background, seed, True)
        # The noise track returned by add_noise is not used; the residual
        # below is computed against the clean target instead.
        combined, _ = augmentation.add_noise(
            x,
            background,
            factor = random.uniform(0.01, 0.1),
            return_noise = True,
        )
    else:
        combined = x / (np.max(np.abs(x)) + 1e-9)

    return combined, y, combined - y

In [6]:
# Sample rate used by the cells below for slicing and playback.
sr = 44100
# Build one training example from the first (shuffled) clean file:
# network input, clean target, and their difference (input - target).
combined, actual, noise = parallel(files[0])

choice 2


In [7]:
# Keep at most the first 20 * sr samples (20 seconds at sr) of input and target.
combined = combined[: 20 * sr]
actual = actual[: 20 * sr]

In [8]:
# Graph inputs: `i` is fed the noisy signal and `y` the clean target, each as
# a float32 array of shape (samples, 1).
i = tf.placeholder(tf.float32, [None, 1])
y = tf.placeholder(tf.float32, [None, 1])
# Demucs model with a single output source. logging = True presumably triggers
# the tensor-shape printout seen in this cell's output -- confirm in malaya_speech.
model = demucs.Model(i, sources = 1, logging = True)
# Keep only the first tf.shape(i)[0] rows of the model output so the
# prediction has the same number of samples as the input.
logits = model.logits[:tf.shape(i)[0]]

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Tensor("Pad_1:0", shape=(?, 99668, 1), dtype=float32)
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use `tf.keras.layers.Conv1D` instead.
Instructions for updating:
Please use `layer.__call__` method instead.
Tensor("sequential/conv1d_1/mul:0", shape=(?, 24916, 64), dtype=float32)
Tensor("sequential_2/conv1d_4/mul:0", shape=(?, 6228, 128), dtype=float32)
Tensor("sequential_4/conv1d_7/mul:0", shape=(?, 1556, 256), dtype=float32)
Tensor("sequential_6/conv1d_10/mul:0", shape=(?, 388, 512), dtype=float32)
Tensor("sequential_8/conv1d_13/mul:0", shape=(?, 96, 1024), dtype=float32)
x 

In [9]:
logits

<tf.Tensor 'strided_slice_21:0' shape=(?, 1) dtype=float32>

In [10]:
# Multi-resolution STFT loss; both weighting factors are set to 0.5.
stft_loss = stft.loss.TFMultiResolutionSTFT(factor_sc = 0.5, factor_mag = 0.5)
# Time-domain term: mean absolute error between target and prediction.
l1 = tf.reduce_mean(tf.abs(y - logits))
# The STFT loss is called on the 1-D signals with a leading axis of size 1 added.
sc_loss, mag_loss = stft_loss(tf.expand_dims(y[:,0], 0), tf.expand_dims(logits[:,0], 0))
# Total objective: unweighted sum of the three terms.
loss = l1 + sc_loss + mag_loss
# NOTE: despite its name, `optimizer` holds the op returned by minimize()
# (one training step), not the AdamOptimizer object.
optimizer = tf.train.AdamOptimizer(learning_rate = 3e-4).minimize(loss)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [11]:
# Open an interactive session and initialise every variable in the graph.
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

In [12]:
# Duration of the training clip in seconds, using the notebook-wide `sr`
# rather than a second hard-coded copy of the sample rate.
len(combined) / sr

20.0

In [13]:
# actual = (actual - np.mean(actual)) / np.std(actual)
# combined = (combined - np.mean(combined)) / np.std(combined)

In [14]:
# The training pair never changes, so build the feed dict once instead of
# re-expanding both arrays on each of the 2000 steps.
train_feed = {
    i: np.expand_dims(combined, axis = -1),
    y: np.expand_dims(actual, axis = -1),
}

# Overfit the model on this single example, logging the loss every 10 steps.
for e in range(2000):
    l, _ = sess.run([loss, optimizer], feed_dict = train_feed)
    if e % 10 == 0:
        print(e, l)

0 [2.1996558]
10 [2.0349698]
20 [1.6942861]
30 [1.3566798]
40 [1.1645055]
50 [1.1557816]
60 [1.135972]
70 [1.1181241]
80 [1.1112022]
90 [1.1117486]
100 [1.0892215]
110 [1.0730977]
120 [1.0569546]
130 [1.0512955]
140 [1.0384164]
150 [1.0257053]
160 [1.0196753]
170 [1.0246775]
180 [0.9791092]
190 [0.9333024]
200 [0.9217137]
210 [0.912246]
220 [0.9475024]
230 [0.93762726]
240 [0.93010426]
250 [0.9093938]
260 [0.91559064]
270 [0.87311506]
280 [0.9027535]
290 [0.85577804]
300 [0.8872821]
310 [0.87856126]
320 [0.8306566]
330 [0.8358633]
340 [0.86777997]
350 [0.85964155]
360 [0.8497513]
370 [0.827289]
380 [0.7828235]
390 [0.821355]
400 [0.8642683]
410 [0.81861675]
420 [0.81613684]
430 [0.8003118]
440 [0.7717555]
450 [0.7919587]
460 [0.7993283]
470 [0.74340725]
480 [0.7887602]
490 [0.740398]
500 [0.74050856]
510 [0.7100315]
520 [0.7132337]
530 [0.68836254]
540 [0.75272936]
550 [0.710472]
560 [0.7588028]
570 [0.70148206]
580 [0.6869647]
590 [0.66911536]
600 [0.70760703]
610 [0.6734929]
620 [0.6

In [15]:
# Run the model on the noisy input only (no training op) to get the predicted
# clean signal.
y_ = sess.run(logits, feed_dict = {i: np.expand_dims(combined, axis = -1)})

In [19]:
# `ipd` is already imported in the imports cell at the top of the notebook;
# this repeat is redundant but harmless.
import IPython.display as ipd

# Play the first 5 seconds of the clean target.
ipd.Audio(actual[:sr*5], rate = sr)

In [20]:
# Play the first 5 seconds of the model's prediction (column 0 of the output).
ipd.Audio(y_[:sr*5, 0], rate = sr)