In [1]:
import tensorflow as tf
import malaya_speech
import malaya_speech.augmentation.waveform as augmentation
import malaya_speech.augmentation.spectrogram as mask_augmentation
import malaya_speech.train.model.ctc as ctc
import malaya_speech.train as train
import numpy as np






The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.




In [2]:
# !git pull

In [3]:
# Speech-to-text feature extractor used by the input pipeline below;
# constructed with per-feature normalisation switched on.
featurizer = malaya_speech.tf_featurization.STTFeaturizer(
    normalize_per_feature = True
)
# Width of each feature frame; the pipeline reshapes features to (-1, n_mels).
n_mels = featurizer.num_feature_bins

In [4]:
import random
from glob import glob

# Instrument stems, searched recursively under HHDS/Sources.
basses = glob('HHDS/Sources/**/*bass.wav', recursive = True)
drums = glob('HHDS/Sources/**/*drums.wav', recursive = True)
others = glob('HHDS/Sources/**/*other.wav', recursive = True)

# Pool of background audio: the two noise-44k folders followed by the stems.
noises = [
    *glob('../noise-44k/noise/*.wav'),
    *glob('../noise-44k/clean-wav/*.wav'),
    *basses,
    *drums,
    *others,
]
# Shuffle in place so the pool order does not follow directory order.
random.shuffle(noises)

In [5]:
def read_wav(f):
    """Load the audio file `f` through `malaya_speech.load` with `sr = 16000`.

    The result of `malaya_speech.load` is returned untouched; callers in this
    notebook take element ``[0]`` of it as the waveform.
    """
    loaded = malaya_speech.load(f, sr = 16000)
    return loaded

def random_sampling(s, length):
    """Delegate to `augmentation.random_sampling` with the sample rate fixed at 16000.

    Args:
        s: waveform to sample from.
        length: forwarded unchanged as the `length` keyword
            (units are defined by the library helper -- TODO confirm).
    """
    sampled = augmentation.random_sampling(s, sr = 16000, length = length)
    return sampled

def combine_speakers(files, n = 5):
    """Mix `n` randomly chosen audio files into a single waveform.

    Each chosen file is loaded with `read_wav`, cropped by `random_sampling`
    and scaled by a random gain in [0.5, 1.0]. Clips are added one after the
    other: every new clip is delayed by a random fraction (0.01 to 1.25) of
    the current mixture length, and whichever of the two arrays is shorter is
    zero-padded at the end before they are summed.

    Args:
        files: sequence of audio paths; `random.sample` needs at least `n` of them.
        n: number of files to mix.

    Returns:
        (mixture, tracks): the summed waveform, plus a list holding the first
        cropped clip as sampled (no gain applied) followed by each later clip
        as it was padded when it was added.
    """
    chosen = random.sample(files, n)

    clips = []
    for path in chosen:
        audio = read_wav(path)[0]
        # Requested crop length: a random value capped at 100_000 // n.
        crop = min(random.randint(20000 // n, 240_000 // n), 100_000 // n)
        clips.append(random_sampling(audio, length = crop))

    tracks = [clips[0]]
    mixture = clips[0].copy() * random.uniform(0.5, 1.0)

    for clip in clips[1:]:
        shifted = clip.copy() * random.uniform(0.5, 1.0)

        # Delay of the new clip, as a fraction of the current mixture length;
        # a fraction above 1 starts the clip after the mixture has ended.
        offset = int(random.uniform(0.01, 1.25) * len(mixture))
        shifted = np.pad(shifted, (offset, 0))

        # Zero-pad the tail of the shorter array so both have equal length.
        gap = len(mixture) - len(shifted)
        if gap > 0:
            shifted = np.pad(shifted, (0, gap))
        else:
            mixture = np.pad(mixture, (0, -gap))

        tracks.append(shifted)
        mixture = mixture + shifted

    return mixture, tracks

def random_amplitude_threshold(sample, low = 1, high = 2, threshold = 0.4):
    """Peak-normalise `sample` and boost the parts at or above `threshold`.

    The input is divided by its absolute peak (plus 1e-9 to avoid a division
    by zero), every value whose magnitude is >= `threshold` is multiplied by
    one gain drawn uniformly from [low, high), and the result is clipped to
    [-1, 1]. The input array itself is not modified.

    Args:
        sample: waveform array.
        low: lower bound of the random gain.
        high: upper bound of the random gain.
        threshold: magnitude (after normalisation) from which the gain applies.

    Returns:
        A new array with values in [-1, 1].
    """
    scaled = sample.copy()
    peak = np.max(np.abs(scaled))
    scaled = scaled / (peak + 1e-9)

    gain = np.random.uniform(low = low, high = high)

    # Compute the mask once; it selects the same elements for read and write.
    loud = np.abs(scaled) >= threshold
    scaled[loud] = scaled[loud] * gain

    return np.clip(scaled, -1, 1)

def calc(signal, seed, add_uniform = False):
    """Apply one randomly selected waveform augmentation to `signal`.

    The module-level `random` generator is re-seeded with `seed`, so two calls
    with the same seed select the same branch and draw the same parameters.
    Note that this re-seeding also affects later `random` calls made elsewhere.

    A value 0-8 picks the branch: 0-4 call sox-based helpers from
    `augmentation`, 5 applies `random_amplitude_threshold`, and 6-8 leave the
    signal as is. Afterwards an extra amplitude boost (skipped for branch 5)
    and, if `add_uniform` is true, uniform noise may each be applied when a
    Gaussian draw (mean 0.5, sigma 0.14) exceeds 0.6.

    Args:
        signal: waveform to augment.
        seed: seed passed to `random.seed`.
        add_uniform: allow the optional `add_uniform_noise` step.

    Returns:
        The augmented waveform.
    """
    random.seed(seed)
    choice = random.randint(0, 8)

    # Random parameters are drawn in the same order as the keyword arguments
    # they feed, so the seeded sequence is consumed identically.
    if choice == 0:
        bass_gain = random.randint(25, 50)
        reverb = random.randint(0, 30)
        room = random.randint(0, 30)
        augmented = augmentation.sox_augment_high(
            signal,
            min_bass_gain = bass_gain,
            reverberance = reverb,
            hf_damping = 10,
            room_scale = room,
            negate = 1,
        )
    elif choice == 1:
        bass_gain = random.randint(25, 70)
        reverb = random.randint(0, 30)
        room = random.randint(0, 30)
        augmented = augmentation.sox_augment_high(
            signal,
            min_bass_gain = bass_gain,
            reverberance = reverb,
            hf_damping = 10,
            room_scale = room,
            negate = 0,
        )
    elif choice == 2:
        bass_gain = random.randint(5, 30)
        reverb = random.randint(0, 30)
        room = random.randint(0, 30)
        flip = random.randint(0, 1)
        augmented = augmentation.sox_augment_low(
            signal,
            min_bass_gain = bass_gain,
            reverberance = reverb,
            hf_damping = 10,
            room_scale = room,
            negate = flip,
        )
    elif choice == 3:
        bass_gain_high = random.randint(25, 70)
        bass_gain_low = random.randint(5, 30)
        reverb = random.randint(0, 30)
        room = random.randint(0, 30)
        augmented = augmentation.sox_augment_combine(
            signal,
            min_bass_gain_high = bass_gain_high,
            min_bass_gain_low = bass_gain_low,
            reverberance = reverb,
            hf_damping = 10,
            room_scale = room,
        )
    elif choice == 4:
        reverb = random.randint(10, 30)
        room = random.randint(10, 30)
        augmented = augmentation.sox_reverb(
            signal,
            reverberance = reverb,
            hf_damping = 10,
            room_scale = room,
        )
    elif choice == 5:
        cutoff = random.uniform(0.35, 0.8)
        augmented = random_amplitude_threshold(signal, threshold = cutoff)
    else:
        # choice is 6, 7 or 8: keep the input unchanged.
        augmented = signal

    # Optional second amplitude boost; branch 5 has already applied one, and
    # in that case no Gaussian value is drawn here.
    if choice != 5 and random.gauss(0.5, 0.14) > 0.6:
        cutoff = random.uniform(0.7, 0.9)
        augmented = random_amplitude_threshold(
            augmented, low = 1.0, high = 2.0, threshold = cutoff
        )

    # The Gaussian value is drawn before `add_uniform` is checked, so the
    # generator advances whether or not noise is enabled.
    if random.gauss(0.5, 0.14) > 0.6 and add_uniform:
        noise_power = random.uniform(0.005, 0.015)
        augmented = augmentation.add_uniform_noise(augmented, power = noise_power)

    return augmented

def signal_augmentation(wav):
    """Augment one waveform and, some of the time, mix background noise into it.

    A fresh seed is drawn and passed to `calc` for the speech signal. When a
    Gaussian draw (mean 0.5, sigma 0.14) exceeds 0.6, between 1 and 20 files
    from the global `noises` pool are mixed by `combine_speakers`, processed
    by `calc` with the same seed (uniform noise allowed), and added to the
    speech with a random factor in [0.05, 0.3].

    Args:
        wav: input waveform array.

    Returns:
        The augmented waveform cast to float32.
    """
    seed = random.randint(0, 100_000_000)
    augmented = calc(wav, seed)

    # Guard clause: the draw did not pass, so no background is added.
    if not random.gauss(0.5, 0.14) > 0.6:
        return augmented.astype('float32')

    background = combine_speakers(noises, random.randint(1, 20))[0]
    # The same seed is reused, so `calc` takes the same branch for the background.
    background = calc(background, seed, True)
    mixed = augmentation.add_noise(
        augmented, background, factor = random.uniform(0.05, 0.3)
    )
    return mixed.astype('float32')

In [6]:
def mel_augmentation(features):
    """Mask the features along frequency, then along time.

    Calls `mask_augmentation.mask_frequency` followed by
    `mask_augmentation.mask_time` and returns the latter's result.
    """
    frequency_masked = mask_augmentation.mask_frequency(features)
    time_masked = mask_augmentation.mask_time(frequency_masked)
    return time_masked

In [7]:
def preprocess_inputs(example):
    """Add augmented audio features to one parsed example.

    The raw waveform goes through `signal_augmentation` (run as a numpy
    function), is featurised by the global `featurizer`, and the resulting
    frames go through `mel_augmentation`. The dict is updated in place.

    Args:
        example: dict with a 'waveforms' tensor.

    Returns:
        The same dict, with 'waveforms' replaced by the augmented waveform and
        'inputs' (frames reshaped to (-1, n_mels)) and 'inputs_length'
        (one-element int32 tensor holding the frame count) added.
    """
    augmented = tf.compat.v1.numpy_function(
        signal_augmentation, [example['waveforms']], tf.float32
    )
    # Flatten to a single row and take that row as the waveform tensor.
    waveform = tf.reshape(augmented, (1, -1))[0]

    frames = tf.reshape(featurizer.vectorize(waveform), (-1, n_mels))
    frames = tf.compat.v1.numpy_function(mel_augmentation, [frames], tf.float32)
    # Reshape again after the numpy function so the frame width is n_mels.
    frames = tf.reshape(frames, (-1, n_mels))

    frame_count = tf.cast(tf.shape(frames)[0], tf.int32)

    example['waveforms'] = waveform
    example['inputs'] = frames
    example['inputs_length'] = tf.expand_dims(frame_count, 0)
    return example

def parse(serialized_example):
    """Decode one serialized example and run it through `preprocess_inputs`.

    'waveforms' (float32) and 'targets' (int64) are read as variable-length
    features and reduced to their `.values` tensors. After preprocessing,
    only 'waveforms', 'inputs', 'inputs_length' and 'targets' are kept.

    Args:
        serialized_example: a scalar string tensor holding one record.

    Returns:
        Dict of tensors for a single training example.
    """
    schema = {
        'waveforms': tf.VarLenFeature(tf.float32),
        'targets': tf.VarLenFeature(tf.int64),
    }
    parsed = tf.parse_single_example(serialized_example, features = schema)

    # Keep only the values tensor of each sparse feature.
    dense = {name: sparse.values for name, sparse in parsed.items()}

    processed = preprocess_inputs(dense)

    wanted = ('waveforms', 'inputs', 'inputs_length', 'targets')
    unwanted = [name for name in processed if name not in wanted]
    for name in unwanted:
        processed.pop(name, None)

    return processed


def get_dataset(path, batch_size = 32, shuffle_size = 32, thread_count = 24):
    """Create a zero-argument function that builds the training dataset.

    Args:
        path: glob pattern matching the TFRecord files.
        batch_size: number of examples per padded batch.
        shuffle_size: buffer size passed to `Dataset.shuffle`.
        thread_count: `num_parallel_calls` used when mapping `parse`.

    Returns:
        A callable with no arguments. Each call builds and returns a dataset
        that reads the matching files, shuffles, parses, pads every field
        with zeros to the longest item in the batch, and repeats without end.
    """

    def get():
        # Everything is created inside this function, i.e. at the time the
        # dataset is actually built.
        records = tf.data.TFRecordDataset(glob(path))
        batches = (
            records
            .shuffle(shuffle_size)
            .map(parse, num_parallel_calls = thread_count)
            .padded_batch(
                batch_size,
                padded_shapes = {
                    'waveforms': tf.TensorShape([None]),
                    'inputs': tf.TensorShape([None, n_mels]),
                    'inputs_length': tf.TensorShape([None]),
                    'targets': tf.TensorShape([None]),
                },
                padding_values = {
                    'waveforms': tf.constant(0, dtype = tf.float32),
                    'inputs': tf.constant(0, dtype = tf.float32),
                    'inputs_length': tf.constant(0, dtype = tf.int32),
                    'targets': tf.constant(0, dtype = tf.int64),
                },
            )
            .repeat()
        )
        return batches

    return get

In [8]:
# Start from an empty graph so that re-running this cell does not pile up ops.
tf.reset_default_graph()
sess = tf.InteractiveSession()

# Build the training dataset from the TFRecord shards matching the pattern.
train_dataset = get_dataset('../speech-bahasa/bahasa-asr/data/bahasa-asr-train-*')()

# `Dataset.make_one_shot_iterator()` is deprecated and prints a warning; the
# compat.v1 helper is the replacement named in that warning.
# `iterator` holds the dict of next-batch tensors.
iterator = tf.compat.v1.data.make_one_shot_iterator(train_dataset).get_next()

# Fetch one batch as numpy arrays to sanity-check the pipeline.
r = sess.run(iterator)



Instructions for updating:
Use `for ... in dataset:` to iterate over a dataset. If using `tf.estimator`, return the `Dataset` object directly from your input function. As a last resort, you can use `tf.compat.v1.data.make_one_shot_iterator(dataset)`.


In [9]:
# Display the fetched batch (a dict of numpy arrays keyed by feature name).
r

{'targets': array([[109, 107, 112, ...,   0,   0,   0],
        [117,  99, 112, ...,   0,   0,   0],
        [ 77,  99, 111, ...,   0,   0,   0],
        ...,
        [117,  99, 111, ...,   0,   0,   0],
        [ 75,  41, 120, ...,   0,   0,   0],
        [121, 106,  99, ...,   0,   0,   0]]),
 'waveforms': array([[ 0.01258332,  0.01541021,  0.01594631, ...,  0.        ,
          0.        ,  0.        ],
        [-0.02102088, -0.02487239, -0.03220418, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        ...,
        [ 0.00532117,  0.00368615, -0.00168939, ...,  0.        ,
          0.        ,  0.        ],
        [-0.10775989, -0.04397684,  0.00240064, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.00651675,  0.00912982,  0.01100481, ...,  0.        ,
          0.        ,  0.        ]], dtype=float32),
 'inputs': array([[[-1.2212915 , -1.3845913 , -1.