In [1]:
import os
import warnings

# Expose only GPUs 2 and 3 to CUDA. This is set before TensorFlow is imported
# in the next cell so the restriction is in place when the runtime starts.
os.environ['CUDA_VISIBLE_DEVICES'] = '2,3'
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

In [2]:
import tensorflow as tf
import malaya_speech
import numpy as np
import IPython.display as ipd
import matplotlib.pyplot as plt
import malaya_speech.augmentation.waveform as augmentation
from malaya_speech.train.model import unet
from malaya_speech.utils import tf_featurization
import malaya_speech.train as train
import random

In [3]:
from glob import glob

# WAV files under ../youtube/clean-wav; later cells use them as the speech
# sources. The list is shuffled in place (no seed is set, so the order differs
# between runs).
files = glob('../youtube/clean-wav/*.wav')
random.shuffle(files)
# Cell output: number of speech files found.
len(files)

15385

In [4]:
# Noise pool, part 1: WAVs from the two ../noise-44k folders.
noises = glob('../noise-44k/noise/*.wav') + glob('../noise-44k/clean-wav/*.wav')
# Noise pool, part 2: every bass / drums / "other" stem found recursively
# under HHDS/Sources.
basses = glob('HHDS/Sources/**/*bass.wav', recursive = True)
drums = glob('HHDS/Sources/**/*drums.wav', recursive = True)
others = glob('HHDS/Sources/**/*other.wav', recursive = True)
# Merge both parts and shuffle in place (unseeded).
noises = noises + basses + drums + others
random.shuffle(noises)
# Cell output: total number of noise files.
len(noises)

1472

In [5]:
def read_wav(f, sr = 44100):
    """
    Load an audio file through `malaya_speech.load`.

    Parameters
    ----------
    f:
        Path of the audio file; forwarded unchanged.
    sr:
        Sample rate forwarded to `malaya_speech.load`. Defaults to 44100,
        the rate hard-coded throughout this notebook, so existing calls
        behave exactly as before.

    Returns
    -------
    Whatever `malaya_speech.load` returns. Callers in this notebook index
    the result with `[0]` to obtain the waveform.
    """
    return malaya_speech.load(f, sr = sr)

def random_sampling(s, length, sr = 44100):
    """
    Cut a random excerpt out of `s` via `augmentation.random_sampling`.

    Parameters
    ----------
    s:
        Waveform to sample from; forwarded unchanged.
    length:
        Excerpt length, forwarded unchanged.
        NOTE(review): the unit (milliseconds vs. samples) is decided by
        `augmentation.random_sampling` -- confirm there.
    sr:
        Sample rate forwarded to the helper. Defaults to 44100, the value
        that used to be hard-coded, so existing calls are unaffected.

    Returns
    -------
    Whatever `augmentation.random_sampling` returns.
    """
    return augmentation.random_sampling(s, sr = sr, length = length)

In [6]:
def combine_speakers(files, n = 5):
    """
    Mix `n` randomly chosen recordings into one waveform.

    `n` distinct entries are drawn from `files`; a random excerpt of each is
    loaded, scaled by a random gain in [0.5, 1.0] and added to the running
    mix at a random offset of 1%-125% of the current mix length (so an
    excerpt may start after a stretch of silence).

    Parameters
    ----------
    files:
        Sequence of audio file paths to draw from.
    n:
        Number of recordings to mix. Must be at least 1 and no larger than
        `len(files)`.

    Returns
    -------
    (mix, components):
        `mix` is the summed waveform. `components` holds one array per
        recording, each scaled and zero-padded exactly as it appears in the
        mix, so every component has the length of `mix` and the components
        sum to `mix`. (Previously the first component was the unscaled clip
        and components were not padded to the final length.)

    Raises
    ------
    ValueError
        If `n` is smaller than 1, or (from `random.sample`) larger than
        `len(files)`.
    """
    if n < 1:
        raise ValueError('n must be at least 1')

    chosen = random.sample(files, n)
    # Each excerpt gets its own random length; the per-source budget shrinks
    # as n grows and is capped at 150000.
    w_samples = [
        random_sampling(
            read_wav(f)[0],
            length = min(random.randint(20000 // n, 240000 // n), 150000),
        )
        for f in chosen
    ]

    left = w_samples[0].copy() * random.uniform(0.5, 1.0)
    # Track the scaled clip, not the raw one, so components match the mix.
    y = [left]
    for i in range(1, n):
        right = w_samples[i].copy() * random.uniform(0.5, 1.0)

        # Offset of the new clip relative to the current mix length.
        overlap = random.uniform(0.01, 1.25)
        left_len = int(overlap * len(left))

        padded_right = np.pad(right, (left_len, 0))

        # Zero-pad whichever signal is shorter so both can be summed.
        if len(left) > len(padded_right):
            padded_right = np.pad(
                padded_right, (0, len(left) - len(padded_right))
            )
        else:
            left = np.pad(left, (0, len(padded_right) - len(left)))

        y.append(padded_right)
        left = left + padded_right

    # The mix may have grown after a component was stored; align them all.
    total = len(left)
    y = [np.pad(part, (0, total - len(part))) for part in y]
    return left, y

In [7]:
# Smoke test: build one 5-source mix from the speech files and one 2-source
# mix from the noise pool, keeping only the summed waveform (element 0).
y = combine_speakers(files, 5)[0]
n = combine_speakers(noises, 2)[0]
# ipd.Audio(y, rate = 44100)

In [8]:
def calc(signal, seed, add_uniform = False):
    """
    Apply one randomly chosen sox-based effect to `signal`.

    The module-level `random` generator is re-seeded with `seed`, so the
    effect and all of its parameters are a pure function of `seed`; two
    calls with the same seed pick identical settings. Note the side effect:
    the global `random` state is overwritten.

    Parameters
    ----------
    signal:
        Waveform handed to the chosen `augmentation` function.
    seed:
        Seed passed to `random.seed`.
    add_uniform:
        When true, uniform noise with power in [0.005, 0.015] is added on
        top if a further random draw exceeds 0.7. The draw itself happens
        regardless of this flag.

    Returns
    -------
    The augmented signal as returned by the `augmentation` helpers.
    """
    random.seed(seed)
    effect = random.randint(0, 4)

    # Dict literals are evaluated left to right, which keeps the order of
    # the random draws fixed for a given seed.
    if effect == 0:
        fx = augmentation.sox_augment_high
        fx_kwargs = {
            'min_bass_gain': random.randint(25, 50),
            'reverberance': random.randint(0, 80),
            'hf_damping': 10,
            'room_scale': random.randint(0, 50),
            'negate': 1,
        }
    elif effect == 1:
        fx = augmentation.sox_augment_high
        fx_kwargs = {
            'min_bass_gain': random.randint(25, 70),
            'reverberance': random.randint(0, 80),
            'hf_damping': 10,
            'room_scale': random.randint(0, 50),
            'negate': 0,
        }
    elif effect == 2:
        fx = augmentation.sox_augment_low
        fx_kwargs = {
            'min_bass_gain': random.randint(5, 30),
            'reverberance': random.randint(0, 80),
            'hf_damping': 10,
            'room_scale': random.randint(0, 50),
            'negate': random.randint(0, 1),
        }
    elif effect == 3:
        fx = augmentation.sox_augment_combine
        fx_kwargs = {
            'min_bass_gain_high': random.randint(25, 70),
            'min_bass_gain_low': random.randint(5, 30),
            'reverberance': random.randint(0, 80),
            'hf_damping': 10,
            'room_scale': random.randint(0, 90),
        }
    elif effect == 4:
        fx = augmentation.sox_reverb
        fx_kwargs = {
            'reverberance': random.randint(10, 80),
            'hf_damping': 10,
            'room_scale': random.randint(10, 90),
        }

    augmented = fx(signal, **fx_kwargs)

    # Always consume this draw, even when add_uniform is off, so the global
    # generator ends in the same state either way.
    roll = random.random()
    if roll > 0.7 and add_uniform:
        noise_power = random.uniform(0.005, 0.015)
        augmented = augmentation.add_uniform_noise(augmented, power = noise_power)

    return augmented

In [9]:
# Both calls use seed 10, so speech and noise get the same effect with the
# same parameters; only the noise call may additionally receive uniform noise.
y = calc(y, 10)
n = calc(n, 10, True)

In [10]:
# Mix the noise into the speech with a random factor in [0.1, 0.7].
# With return_noise = True the call yields two values: the mixture and a
# second array bound to `noise` here.
combined, noise = augmentation.add_noise(y, n, factor = random.uniform(0.1, 0.7), return_noise = True)

In [11]:
# ipd.Audio(combined[:44100 * 10], rate = 44100)

In [12]:
# ipd.Audio(y[:44100 * 10], rate = 44100)

In [13]:
# ipd.Audio(noise[:44100 * 10], rate = 44100)

In [20]:
import mp

def parallel(f):
    """
    Build one (mixture, speech, noise) training example.

    With probability ~0.3 the speech is a mix of 2-6 files drawn from the
    global `files` list, in which case `f` is ignored; otherwise it is a
    random excerpt of `f`. The noise is a mix of 1-20 entries of the global
    `noises` list. Speech and noise are passed through `calc` with the same
    random seed (the noise may also get uniform noise) and then combined
    with a random factor in [0.1, 0.7].

    Parameters
    ----------
    f:
        Path of the speech file used on the single-speaker path.

    Returns
    -------
    (combined, y, noise):
        The mixture, the augmented speech, and the second value returned by
        `augmentation.add_noise(..., return_noise = True)`.
    """
    multi_speaker = random.random() > 0.7
    if multi_speaker:
        speakers = random.sample(files, random.randint(2, 6))
        # combine_speakers samples again internally; with n == len(speakers)
        # that only permutes the chosen files.
        speech = combine_speakers(speakers, len(speakers))[0]
    else:
        waveform = read_wav(f)[0]
        clip_length = random.randint(20000, 150000)
        speech = random_sampling(waveform, length = clip_length)

    noise_count = random.randint(1, 20)
    background = combine_speakers(noises, noise_count)[0]

    # One shared seed so both signals receive the same effect settings.
    seed = random.randint(0, 100000000)
    speech = calc(speech, seed)
    background = calc(background, seed, True)

    mix_factor = random.uniform(0.1, 0.7)
    combined, noise = augmentation.add_noise(
        speech, background, factor = mix_factor, return_noise = True
    )
    return combined, speech, noise

def loop(files):
    """
    Worker entry point: run `parallel` on every path of one chunk.

    Parameters
    ----------
    files:
        Indexable wrapper whose element 0 is the iterable of file paths to
        process (the form in which `mp.multiprocessing` passes work here --
        presumably an argument tuple; confirm against `mp`).

    Returns
    -------
    List with one `parallel(path)` result per path, in input order.
    """
    chunk = files[0]
    return [parallel(path) for path in chunk]

def generate(batch_size = 10, repeat = 2):
    """
    Endless generator of training examples.

    Every pass reshuffles the global `files` and `noises` lists in place,
    then walks `files` in slices of `batch_size`. Each slice is handed to
    `mp.multiprocessing` together with `loop`, using as many cores as the
    slice has files. Every returned example is yielded `repeat` times in a
    row before moving on.

    Parameters
    ----------
    batch_size:
        Number of files sent to `mp.multiprocessing` per call.
    repeat:
        How many consecutive times each example is yielded.

    Yields
    ------
    dict with keys 'combined', 'y' and 'noise', taken from positions 0, 1
    and 2 of each result.
    """
    while True:
        random.shuffle(files)
        random.shuffle(noises)
        total = len(files)
        for start in range(0, total, batch_size):
            batch = files[start: start + batch_size]
            examples = mp.multiprocessing(batch, loop, cores = len(batch))
            for example in examples:
                for _ in range(repeat):
                    yield {
                        'combined': example[0],
                        'y': example[1],
                        'noise': example[2],
                    }

In [15]:
# g = generate()
# next(g)

In [16]:
def get_dataset():
    """
    Return a zero-argument factory that builds the training dataset.

    The factory wraps the `generate` generator in
    `tf.data.Dataset.from_generator`. Each element is a dict with the keys
    'combined', 'y' and 'noise', all declared as float32 tensors of shape
    [None] (one dimension of unknown length).
    """
    keys = ('combined', 'y', 'noise')

    def get():
        output_types = {key: tf.float32 for key in keys}
        output_shapes = {key: tf.TensorShape([None]) for key in keys}
        return tf.data.Dataset.from_generator(
            generate, output_types, output_shapes = output_shapes
        )

    return get

In [17]:
class Model:
    """
    One U-Net per target, all fed from the same mixture.

    `tf_featurization.get_stft` is applied to the mixture `X` and to every
    target in `Y`; it returns a pair whose second element is what the
    networks consume and are trained against (presumably a magnitude
    spectrogram -- confirm in `tf_featurization`).

    Attributes set by the constructor:
      X, Y     -- the inputs as given.
      stft     -- list with the full `get_stft` pair for each target.
      outputs  -- list with one `apply_unet` output per target, built under
                  variable scopes 'model_0', 'model_1', ...
      loss     -- list with the mean absolute error between each output and
                  the second element of the matching target pair.
      cost     -- sum of all entries of `loss`.
    """

    def __init__(self, X, Y):
        self.X = X
        self.Y = Y

        # Only the second element of the mixture's pair is used.
        _, mixture_features = tf_featurization.get_stft(self.X)

        self.stft = [tf_featurization.get_stft(target) for target in self.Y]

        # A separate variable scope per target keeps the networks' weights
        # independent of each other.
        self.outputs = []
        for index in range(len(self.Y)):
            with tf.variable_scope(f'model_{index}'):
                self.outputs.append(unet.model.apply_unet(mixture_features))

        self.loss = [
            tf.reduce_mean(tf.abs(predicted - reference[1]))
            for predicted, reference in zip(self.outputs, self.stft)
        ]

        self.cost = tf.reduce_sum(self.loss)

In [18]:
def model_fn(features, labels, mode, params):
    """
    Estimator model function for the two-target U-Net.

    Builds a `Model` from `features['combined']` with `features['y']` and
    `features['noise']` as targets, and registers the total loss and each
    per-target loss both as named tensors ('total_loss', 'loss_0',
    'loss_1', ...) and as scalar summaries. The names let a
    `LoggingTensorHook` look the tensors up.

    Parameters
    ----------
    features:
        Dict with the keys 'combined', 'y' and 'noise'.
    labels, params:
        Unused; present because the Estimator API passes them.
    mode:
        A `tf.estimator.ModeKeys` value. Only TRAIN and EVAL are supported.

    Returns
    -------
    tf.estimator.EstimatorSpec
        TRAIN: loss plus an Adam (learning rate 1e-4) train op that
        increments the global step. EVAL: loss only.

    Raises
    ------
    ValueError
        For any other mode. Previously such a mode fell through both
        branches and failed with an UnboundLocalError on the return.
    """
    model = Model(features['combined'], [features['y'], features['noise']])
    loss = model.cost

    # tf.identity with a name makes the tensor addressable by that name.
    tf.identity(loss, 'total_loss')
    tf.summary.scalar('total_loss', loss)
    for i, source_loss in enumerate(model.loss):
        tf.identity(source_loss, f'loss_{i}')
        tf.summary.scalar(f'loss_{i}', source_loss)

    global_step = tf.train.get_or_create_global_step()

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer(learning_rate = 1e-4)
        train_op = optimizer.minimize(loss, global_step = global_step)
        return tf.estimator.EstimatorSpec(
            mode = mode, loss = loss, train_op = train_op
        )

    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(
            mode = tf.estimator.ModeKeys.EVAL, loss = loss
        )

    raise ValueError(
        f'model_fn supports only TRAIN and EVAL modes, got {mode!r}'
    )

In [None]:
# Log the total loss and both per-target losses on every iteration. The
# tensor names must match the ones registered via tf.identity in model_fn
# ('loss_0' and 'loss_1' exist because model_fn passes two targets).
train_hooks = [
    tf.train.LoggingTensorHook(
        ['total_loss', 'loss_0', 'loss_1'], every_n_iter = 1
    )
]
# get_dataset() returns a factory; run_training receives the callable, not
# an already-built dataset.
train_dataset = get_dataset()

# Passed as model_dir below, i.e. where checkpoints are written.
save_directory = 'noise-reduction-unet'

# Launch training via malaya_speech.train. num_gpus = 2 matches the two
# devices exposed through CUDA_VISIBLE_DEVICES in the first cell.
# NOTE(review): eval_step = 0 presumably disables periodic evaluation --
# confirm in run_training.
train.run_training(
    train_fn = train_dataset,
    model_fn = model_fn,
    model_dir = save_directory,
    num_gpus = 2,
    log_step = 1,
    save_checkpoint_step = 1000,
    max_steps = 100000,
    train_hooks = train_hooks,
    eval_step = 0,
)

INFO:tensorflow:Initializing RunConfig with distribution strategies.
INFO:tensorflow:Not using Distribute Coordinator.
INFO:tensorflow:Using config: {'_model_dir': 'noise-reduction-unet', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 1, '_train_distribute': <tensorflow.contrib.distribute.python.mirrored_strategy.MirroredStrategy object at 0x7f07ac4d09b0>, '_device_fn': None, '_protocol': None, '_eval_distribute': <tensorflow.contrib.distribute.python.mirrored_strategy.MirroredStrategy object at 0x7f07ac4d09b0>, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.