In [1]:
from glob import glob
from collections import defaultdict

In [2]:
# Collect every LibriSpeech utterance. The pattern only matches .flac files
# that sit exactly three directory levels below `LibriSpeech/`.
librispeech = glob('LibriSpeech/*/*/*/*.flac')
len(librispeech)

33862

In [3]:
def get_speaker_librispeech(file):
    """Return the speaker id encoded in a LibriSpeech file path.

    The id is whatever precedes the first ``-`` in the last ``/``-separated
    component of `file`.
    """
    basename = file.rpartition('/')[2]
    speaker_id = basename.partition('-')[0]
    return speaker_id

# Group the LibriSpeech files by speaker id: speaker id -> list of file paths.
speakers = defaultdict(list)
for f in librispeech:
    speakers[get_speaker_librispeech(f)].append(f)
    
# Number of distinct speaker ids found.
len(speakers)

331

In [4]:
# Collect the VoxCeleb wav files.
# NOTE(review): `recursive = True` only changes how a `**` path component is
# matched; this pattern has none, so only files directly inside
# `voxceleb-wav/` are returned.
voxceleb = glob('voxceleb-wav/*.wav', recursive = True)
len(voxceleb)

1092009

In [5]:
def get_speaker_voxceleb(file):
    """Return the speaker id encoded in a VoxCeleb wav path.

    The id is the third ``-``-separated field of the last ``/``-separated
    component of `file`. Raises IndexError when the file name has fewer than
    three such fields.
    """
    basename = file.rpartition('/')[2]
    fields = basename.split('-')
    return fields[2]

# Group the VoxCeleb files by speaker id: speaker id -> list of file paths.
voxceleb_speakers = defaultdict(list)
for f in voxceleb:
    voxceleb_speakers[get_speaker_voxceleb(f)].append(f)
    
# Number of distinct speaker ids found.
len(voxceleb_speakers)

5994

In [6]:
import random

def combine_speakers(files, n = 5, sr = 16000):
    """Mix `n` randomly chosen recordings into one partially overlapping signal.

    Each chosen file is read at `sr`, cropped to a random segment, scaled by a
    random gain in [0.5, 1.0] and placed at a random offset relative to the
    mixture built so far.

    Parameters
    ----------
    files : sequence of str
        Candidate audio paths. `n` distinct entries are drawn with
        `random.sample`, which raises ValueError when `n > len(files)`.
    n : int
        Number of recordings to mix; must be at least 1.
    sr : int
        Sample rate used to read and crop the audio and to convert sample
        offsets into seconds.

    Returns
    -------
    mixture : np.ndarray
        Sum of all placed tracks.
    tracks : list of np.ndarray
        The gain-scaled tracks in placement order. The first one is unpadded;
        every later one is left-padded to its start offset and right-padded
        only up to the mixture length at the time it was added, so tracks may
        be shorter than the final mixture.
    timestamps : np.ndarray
        One ``(start, end)`` row per track, in seconds.

    Notes
    -----
    Assumes 1-D (mono) audio: the pad widths are given for a single axis.
    """
    chosen = random.sample(files, n)
    # `length` is in milliseconds (random_sampling multiplies it by sr / 1000).
    # The cap of 100_000 // n keeps the segments at ~100 s of audio in total.
    w_samples = [
        random_sampling(
            read_wav(f, sr)[0],
            sr = sr,
            length = min(random.randint(20000 // n, 240_000 // n), 100_000 // n),
        )
        for f in chosen
    ]

    left = w_samples[0].copy() * random.uniform(0.5, 1.0)
    # Keep the gain-scaled first track so that the returned tracks are the
    # signals that were actually summed into the mixture.
    y = [left]
    timestamps = [(0 / sr, len(left) / sr)]
    for i in range(1, n):
        right = w_samples[i].copy() * random.uniform(0.5, 1.0)

        # Start offset as a fraction of the current mixture length:
        # below 1 the new track overlaps the mixture, above 1 it leaves a gap.
        overlap = random.uniform(0.01, 1.25)
        left_len = int(overlap * len(left))

        padded_right = np.pad(right, (left_len, 0))
        timestamps.append((left_len / sr, (left_len + len(right)) / sr))

        # Bring both signals to a common length before summing.
        if len(left) > len(padded_right):
            padded_right = np.pad(
                padded_right, (0, len(left) - len(padded_right))
            )
        else:
            left = np.pad(left, (0, len(padded_right) - len(left)))

        y.append(padded_right)
        left = left + padded_right
    return left, y, np.array(timestamps)

In [7]:
import soundfile as sf
import numpy as np
import soundfile as sf
from scipy import interpolate
import numpy as np
import librosa
from scipy.special import expit

# Make NumPy raise FloatingPointError on every floating-point error category
# (divide, overflow, underflow, invalid) instead of warning or ignoring it.
np.seterr(all='raise')

def resample(data, old_samplerate, new_samplerate):
    """Resample `data` to `new_samplerate` by linear interpolation.

    Parameters
    ----------
    data : np.ndarray
        Audio whose first axis is time.
    old_samplerate, new_samplerate : number
        Source and target sample rates.

    Returns
    -------
    np.ndarray
        Array with ``int(len(data) * new_samplerate / old_samplerate)``
        samples along the first axis.
    """
    n_in = data.shape[0]
    n_out = int(n_in * new_samplerate / old_samplerate)
    total_seconds = n_in / old_samplerate

    # Both grids span [0, total_seconds], endpoints included.
    source_grid = np.linspace(0, total_seconds, n_in)
    target_grid = np.linspace(0, total_seconds, n_out)

    # interp1d interpolates along the last axis, hence the two transposes.
    interpolator = interpolate.interp1d(source_grid, data.T)
    return interpolator(target_grid).T

def read_wav(file, sample_rate = 16000):
    """Read an audio file with soundfile and return it at `sample_rate`.

    When the file's own rate differs from `sample_rate` the samples are passed
    through `resample`; otherwise they are returned as read.

    Returns
    -------
    tuple
        ``(samples, sample_rate)``.
    """
    audio, native_rate = sf.read(file)
    if native_rate == sample_rate:
        return audio, sample_rate
    return resample(audio, native_rate, sample_rate), sample_rate

def random_sampling(sample, sr = 16000, length = 500):
    """Return a random contiguous window of `length` milliseconds from `sample`.

    The window holds ``int(sr / 1000) * length`` samples. When `sample` is not
    longer than the window, the slice starts at 0 and may therefore be
    shorter than requested.
    """
    window = int(sr / 1000) * length
    slack = len(sample) - window
    # np.random.randint excludes its upper bound, so start is in [0, slack).
    start = np.random.randint(0, slack) if slack >= 1 else 0
    return sample[start : start + window]

def sox_reverb(
    y, reverberance = 1, hf_damping = 1, room_scale = 1, stereo_depth = 1
):
    """Apply a pysndfx reverb effect to `y` and return the processed signal.

    The four keyword arguments are forwarded to
    `AudioEffectsChain.reverb`; `pre_delay`, `wet_gain` and `wet_only` are
    fixed at 20, 0 and False.
    """
    from pysndfx import AudioEffectsChain

    reverb_settings = dict(
        reverberance = reverberance,
        hf_damping = hf_damping,
        room_scale = room_scale,
        stereo_depth = stereo_depth,
        pre_delay = 20,
        wet_gain = 0,
        wet_only = False,
    )
    effect = AudioEffectsChain().reverb(**reverb_settings)
    return effect(y)


def sox_augment_low(
    y,
    min_bass_gain = 5,
    reverberance = 1,
    hf_damping = 1,
    room_scale = 1,
    stereo_depth = 1,
    negate = 1,
):
    """Apply a low-shelf filter followed by reverb to `y` via pysndfx.

    The low shelf uses frequency 300 and slope 0.1. Its gain is
    `min_bass_gain`, sign-flipped when `negate` is truthy. The reverb uses the
    given settings with `pre_delay`, `wet_gain` and `wet_only` fixed at 20, 0
    and False.
    """
    from pysndfx import AudioEffectsChain

    shelf_gain = -min_bass_gain if negate else min_bass_gain
    reverb_settings = dict(
        reverberance = reverberance,
        hf_damping = hf_damping,
        room_scale = room_scale,
        stereo_depth = stereo_depth,
        pre_delay = 20,
        wet_gain = 0,
        wet_only = False,
    )

    chain = AudioEffectsChain()
    chain = chain.lowshelf(gain = shelf_gain, frequency = 300, slope = 0.1)
    chain = chain.reverb(**reverb_settings)
    return chain(y)


def sox_augment_high(
    y,
    min_bass_gain = 5,
    reverberance = 1,
    hf_damping = 1,
    room_scale = 1,
    stereo_depth = 1,
    negate = 1,
):
    """Apply a high-shelf filter followed by reverb to `y` via pysndfx.

    The high shelf uses frequency 300 and slope 0.1. Its gain is
    ``-g * (1 - expit(max(y)))`` where ``g`` is `min_bass_gain`, sign-flipped
    when `negate` is truthy. The reverb uses the given settings with
    `pre_delay`, `wet_gain` and `wet_only` fixed at 20, 0 and False.
    """
    from pysndfx import AudioEffectsChain

    signed_gain = -min_bass_gain if negate else min_bass_gain
    # The (1 - sigmoid(peak)) factor shrinks the gain as the largest sample
    # value of the input grows.
    shelf_gain = -signed_gain * (1 - expit(np.max(y)))
    reverb_settings = dict(
        reverberance = reverberance,
        hf_damping = hf_damping,
        room_scale = room_scale,
        stereo_depth = stereo_depth,
        pre_delay = 20,
        wet_gain = 0,
        wet_only = False,
    )

    chain = AudioEffectsChain()
    chain = chain.highshelf(gain = shelf_gain, frequency = 300, slope = 0.1)
    chain = chain.reverb(**reverb_settings)
    return chain(y)


def sox_augment_combine(
    y,
    min_bass_gain_low = 5,
    min_bass_gain_high = 5,
    reverberance = 1,
    hf_damping = 1,
    room_scale = 1,
    stereo_depth = 1,
):
    """Apply a low shelf, a high shelf and reverb to `y` via pysndfx.

    Both shelves use frequency 300 and slope 0.1. The low shelf gain is
    `min_bass_gain_low` and the high shelf gain is ``-min_bass_gain_high``.
    The reverb uses the given settings with `pre_delay`, `wet_gain` and
    `wet_only` fixed at 20, 0 and False.
    """
    from pysndfx import AudioEffectsChain

    reverb_settings = dict(
        reverberance = reverberance,
        hf_damping = hf_damping,
        room_scale = room_scale,
        stereo_depth = stereo_depth,
        pre_delay = 20,
        wet_gain = 0,
        wet_only = False,
    )

    chain = AudioEffectsChain()
    chain = chain.lowshelf(gain = min_bass_gain_low, frequency = 300, slope = 0.1)
    chain = chain.highshelf(gain = -min_bass_gain_high, frequency = 300, slope = 0.1)
    chain = chain.reverb(**reverb_settings)
    return chain(y)


def random_pitch(sample, low = 0.5, high = 1.0):
    """Return a copy of `sample` resampled at a random speed, same length.

    A factor is drawn uniformly from [low, high) and the signal is re-read by
    linear interpolation with a step of ``1 / factor`` samples. The result is
    written into a buffer of the original length: truncated if longer, with
    the remainder left at zero if shorter. `sample` itself is not modified.
    """
    out = sample.copy()
    step = 1.0 / np.random.uniform(low = low, high = high)
    resampled = np.interp(
        np.arange(0, len(out), step), np.arange(0, len(out)), out
    )
    keep = min(out.shape[0], resampled.shape[0])
    # Clear the buffer in place, then overwrite the leading part.
    out *= 0
    out[:keep] = resampled[:keep]
    return out


def random_amplitude(sample, low = 1.5, high = 3):
    """Return `sample` multiplied by a gain drawn uniformly from [low, high).

    The input is copied first, so `sample` is left untouched.
    """
    scaled = sample.copy()
    gain = np.random.uniform(low = low, high = high)
    scaled = scaled * gain
    return scaled


def random_stretch(sample, low = 0.5, high = 1.3):
    """Time-stretch `sample` by a rate drawn uniformly from [low, high).

    Parameters
    ----------
    sample : np.ndarray
        Input signal; it is converted to float before stretching and is not
        modified.
    low, high : float
        Bounds of the uniform distribution the stretch rate is drawn from.

    Returns
    -------
    np.ndarray
        Output of `librosa.effects.time_stretch`; its length generally
        differs from the input length.
    """
    rate = np.random.uniform(low = low, high = high)
    # Pass `rate` by keyword: it is keyword-only in librosa >= 0.10, and the
    # keyword form is accepted by older releases as well.
    return librosa.effects.time_stretch(sample.astype('float'), rate = rate)

def add_uniform_noise(sample, power = 0.01):
    """Return `sample` as float64 with additive Gaussian noise.

    The noise is standard-normal, scaled by
    ``power * U * max(sample)`` with ``U`` drawn uniformly from [0, 1).
    Despite the function name, only the scale is uniform; the noise itself is
    normally distributed.
    """
    signal = sample.copy()
    scale = power * np.random.uniform() * np.amax(signal)
    gaussian = np.random.normal(size = signal.shape[0])
    return signal.astype('float64') + scale * gaussian

def add_noise(sample, noise, random_sample = True, factor = 0.1):
    """Return `sample` plus `factor` times an equally long piece of `noise`.

    When `noise` is shorter than `sample` it is tiled until it is long enough
    and used from its start. Otherwise, with `random_sample` set, the piece
    begins at a random offset; without it, it begins at index 0.
    """
    signal = sample.copy()
    n_signal = len(signal)
    if n_signal > len(noise):
        repeats = int(np.ceil(n_signal / len(noise)))
        noise = np.tile(noise, repeats)
    elif random_sample:
        # Largest valid start is len(noise) - n_signal; randint's upper bound
        # is exclusive, hence the + 1.
        start = np.random.randint(0, len(noise) - n_signal + 1)
        noise = noise[start:]
    return signal + noise[:n_signal] * factor

def sampling(combined, frame_duration_ms = 700, sample_rate = 16000):
    """Yield consecutive, non-overlapping chunks of `combined`.

    Each chunk holds ``int(sample_rate * frame_duration_ms / 1000)`` samples;
    a shorter final chunk is yielded when samples are left over.
    """
    chunk_len = int(sample_rate * (frame_duration_ms / 1000.0))
    start = 0
    while len(combined) - start >= chunk_len:
        stop = start + chunk_len
        yield combined[start:stop]
        start = stop
    if len(combined) > start:
        yield combined[start:]



In [8]:
class FRAME:
    """Plain container for one slice of audio and its position in the signal."""

    def __init__(self, array, timestamp, duration):
        # array: the samples of this slice.
        # timestamp / duration: start and length of the slice
        # (generate_frames fills both in seconds).
        self.array, self.timestamp, self.duration = array, timestamp, duration
        
def generate_frames(
    audio,
    frame_duration_ms: int = 30,
    sample_rate: int = 16000,
    append_ending_trail: bool = True,
):
    """Cut `audio` into consecutive, non-overlapping FRAME objects.

    Parameters
    ----------
    audio : sequence
        Samples to split.
    frame_duration_ms : int
        Frame length in milliseconds.
    sample_rate : int
        Samples per second of `audio`.
    append_ending_trail : bool
        When True, leftover samples shorter than one frame are returned as a
        final, shorter FRAME.

    Returns
    -------
    list of FRAME
        Frames in order; `timestamp` and `duration` are in seconds.
    """
    frame_len = int(sample_rate * (frame_duration_ms / 1000.0))
    frame_seconds = float(frame_len) / sample_rate

    frames = []
    start, clock = 0, 0.0
    while start + frame_len <= len(audio):
        frames.append(FRAME(audio[start : start + frame_len], clock, frame_seconds))
        clock += frame_seconds
        start += frame_len

    if append_ending_trail and start < len(audio):
        # The trailing frame lasts from `clock` to the end of the audio.
        tail_seconds = len(audio) / sample_rate - clock
        frames.append(FRAME(audio[start:], clock, tail_seconds))
    return frames

In [9]:
def calc(signal):
    """Apply one randomly chosen SoX augmentation plus optional noise.

    One of five effect chains is picked uniformly (two high-shelf variants,
    low shelf, combined shelves, reverb only), each with randomly drawn
    parameters. Afterwards Gaussian noise is added with probability 1/2, and
    a random clip from the global `not_music` list is mixed in at a low level
    when ``random.random() > 0.75``.
    """
    choice = random.randint(0, 4)

    if choice == 0:
        # High shelf with negated gain.
        x = sox_augment_high(
            signal,
            min_bass_gain = random.randint(25, 50),
            reverberance = random.randint(0, 80),
            hf_damping = 10,
            room_scale = random.randint(0, 50),
            negate = 1,
        )
    elif choice == 1:
        # High shelf without negation, wider gain range.
        x = sox_augment_high(
            signal,
            min_bass_gain = random.randint(25, 70),
            reverberance = random.randint(0, 80),
            hf_damping = 10,
            room_scale = random.randint(0, 50),
            negate = 0,
        )
    elif choice == 2:
        # Low shelf, sign of the gain chosen at random.
        x = sox_augment_low(
            signal,
            min_bass_gain = random.randint(5, 30),
            reverberance = random.randint(0, 80),
            hf_damping = 10,
            room_scale = random.randint(0, 50),
            negate = random.randint(0, 1),
        )
    elif choice == 3:
        # Low and high shelf together.
        x = sox_augment_combine(
            signal,
            min_bass_gain_high = random.randint(25, 70),
            min_bass_gain_low = random.randint(5, 30),
            reverberance = random.randint(0, 80),
            hf_damping = 10,
            room_scale = random.randint(0, 90),
        )
    elif choice == 4:
        # Reverb only.
        x = sox_reverb(
            signal,
            reverberance = random.randint(10, 80),
            hf_damping = 10,
            room_scale = random.randint(10, 90),
        )

    with_gaussian = random.randint(0, 1)
    if with_gaussian:
        x = add_uniform_noise(x, power = random.uniform(0.005, 0.015))

    if random.random() > 0.75:
        background = read_wav(random.choice(not_music))[0]
        x = add_noise(x, background, factor = random.uniform(0.005, 0.01))

    return x

In [10]:
# Pool of non-speech audio: the wav files in `not-music/clean-wav` plus every
# wav file found recursively under `musan/music` and `musan/noise`.
not_music = glob('not-music/clean-wav/*.wav') + glob('musan/music/**/*.wav', recursive = True) \
+ glob('musan/noise/**/*.wav', recursive = True)
len(not_music)

2026

In [11]:
# Smoke test: build a clip from a single non-speech file and show its length
# in minutes, assuming 16000 samples per second.
y, _, _ = combine_speakers(not_music, 1)
len(y) / 16000 / 60

1.6666666666666667

In [12]:
def generate_noise():
    """Return a random-length array of synthetic Gaussian noise.

    The array has between 160000 and 1920000 samples (10 to 120 seconds at
    16 kHz). Samples are drawn from a normal distribution with mean in
    [-0.5, 0] and standard deviation in [0, 0.5], then scaled by a factor in
    [0.0001, 0.8]. Finally one random sample is set to 1 and one to -1.
    """
    mean = -random.uniform(0, 0.5)
    std = random.uniform(0, 0.5)
    n_samples = random.randint(16000 * 10, 16000 * 120)

    noise = np.random.normal(mean, std, n_samples)
    noise = noise * random.uniform(0.0001, 0.8)

    # Plant one sample at each extreme; the -1 is written last and wins if
    # both positions coincide.
    last = len(noise) - 1
    noise[random.randint(0, last)] = 1
    noise[random.randint(0, last)] = -1
    return noise

In [13]:
# import IPython.display as ipd

# ipd.Audio(generate_noise(), rate = 16000)

In [14]:
# Merge both corpora into one speaker id -> files mapping. If a speaker id
# occurs in both, the LibriSpeech entry (unpacked last) replaces the VoxCeleb
# one.
s = {**voxceleb_speakers, **speakers}
# Speaker ids as a list, so random.sample can draw from them.
keys = list(s.keys())

In [15]:
def random_speakers(n):
    """Return one random file for each of `n` distinct random speakers.

    Speakers are drawn without replacement from the global `keys`; for each
    one a file is picked from the global mapping `s`.
    """
    return [random.choice(s[speaker]) for speaker in random.sample(keys, n)]

In [16]:
# Example: mix one file from each of 20 distinct speakers into a single clip
# and cut it into frames (generate_frames defaults: 30 ms at 16000 Hz).
count = 20
y, _, timestamps = combine_speakers(random_speakers(count), count)
frames = generate_frames(y)

In [17]:
# Label every frame with 1 when the number of active speakers differs from the
# previous frame, else 0. The first frame is always 0.
Y = []
for frame in frames:
    # A speaker is active when the frame's start time lies inside that
    # speaker's (start, end) interval. The result is deliberately not called
    # `speakers`: that global is the speaker -> files dict that `s` is built
    # from, and rebinding it would break re-running the earlier cells.
    active = np.where((frame.timestamp >= timestamps[:,0]) & (frame.timestamp <= timestamps[:,1]))[0]
    Y.append(len(active))
    
# Turn the per-frame speaker counts into change flags.
Y = np.expand_dims(np.array(Y), -1)
Y = np.sum(np.abs(np.diff(Y, axis=0)), axis=1, keepdims=True)
Y = np.vstack(([[0]], Y > 0))
Y = Y[:,0]

In [18]:
# Sanity check: there should be exactly one label per frame.
len(frames), len(Y)

(1712, 1712)

In [19]:
import os
import tensorflow as tf

# Output directory for the generated TFRecord shards; created if missing.
DATA_DIR = os.path.expanduser('speaker-change/data')
tf.gfile.MakeDirs(DATA_DIR)

# Remove shards left over from a previous run. This replaces the former shell
# `rm`, which ran before the directory was guaranteed to exist and whose exit
# status was ignored. As with `rm dir/*`, sub-directories and dot-files are
# left alone.
for stale in glob(os.path.join(DATA_DIR, '*')):
    if os.path.isfile(stale) or os.path.islink(stale):
        os.remove(stale)

In [20]:
from tqdm import tqdm
from malaya_speech.train import prepare_data
from collections import defaultdict
import traceback

# Frame durations in milliseconds; `loop` passes each one to generate_frames
# as `frame_duration_ms` and writes examples for all of them.
selected_frames = [30, 90]

def _speaker_change_labels(frames, timestamps):
    """Return one 0/1 label per frame marking a change in active-speaker count.

    A speaker counts as active in a frame when the frame's start time lies
    inside that speaker's ``(start, end)`` row of `timestamps`. The first
    label is always 0.
    """
    active = np.array(
        [
            np.count_nonzero(
                (frame.timestamp >= timestamps[:, 0])
                & (frame.timestamp <= timestamps[:, 1])
            )
            for frame in frames
        ],
        dtype = int,
    )
    changed = np.diff(active) != 0
    return np.concatenate(([0], changed.astype(int)))


def loop(files, dupe_factor = 5000):
    """Generate random clips and write framed examples to one TFRecord shard.

    Parameters
    ----------
    files : pair
        Only the second item is used, as the shard number in the output file
        name ``{DATA_DIR}/part-{no}.tfrecords``.
        NOTE(review): presumably `mp.multiprocessing` passes
        ``(chunk, worker_index)`` here - confirm against that helper.
    dupe_factor : int
        Number of clips to generate.

    Returns
    -------
    list
        A one-element list holding a dict that maps label -> number of
        examples written.

    Notes
    -----
    About 70% of the clips mix 1 to 20 speakers; the rest contain no speech
    (non-speech audio or synthetic noise) and are labelled 0 throughout.
    A clip that raises is reported with its traceback and skipped.

    NOTE(review): NumPy's global random state is copied into forked child
    processes. If the workers are created by fork, they may draw identical
    `np.random` sequences - confirm and reseed per worker if so.
    """
    _, no = files
    fname = f'{DATA_DIR}/part-{no}.tfrecords'
    writer = tf.python_io.TFRecordWriter(fname)
    counts = defaultdict(int)
    try:
        for _ in tqdm(range(dupe_factor)):
            if random.random() > 0.3:
                count = random.randint(1, 20)
            else:
                count = 0
            try:
                if count == 0:
                    # Negative clip: augmented non-speech audio, or pure
                    # synthetic noise in roughly 20% of the cases.
                    if random.random() > 0.2:
                        y, _, _ = combine_speakers(not_music, random.randint(1, 20))
                        y = calc(y)
                    else:
                        y = generate_noise()

                else:
                    y, _, timestamps = combine_speakers(random_speakers(count), count)
                    if random.random() > 0.7:
                        y = calc(y)
                    if random.random() > 0.7:
                        s_, _, _ = combine_speakers(not_music, random.randint(1, 20))
                        y = add_noise(y, s_, factor = random.uniform(0.2, 0.8))

                for f in selected_frames:
                    frames = generate_frames(y, f)
                    if count == 0:
                        Y = [0] * len(frames)
                    else:
                        Y = _speaker_change_labels(frames, timestamps)

                    for frame, label in zip(frames, Y):
                        example = prepare_data.to_example({'inputs': frame.array.tolist(),
                                                           'targets': [label]})
                        writer.write(example.SerializeToString())
                        counts[label] += 1
            except Exception:
                # Best effort: report the failing clip and carry on.
                print(traceback.format_exc())
    finally:
        # Always close the shard, even if the run is interrupted.
        writer.close()
    return [counts]

In [None]:
import mp
# Run `loop` through the `mp` helper with cores = 12 over a 12-item list.
# NOTE(review): `mp` is not defined in this notebook; presumably it splits the
# list across the workers and concatenates the lists they return - confirm.
returned = mp.multiprocessing([10] * 12, loop, cores = 12)

 83%|████████▎ | 4143/5000 [1:28:18<20:22,  1.43s/it]  IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 94%|█████████▍| 4698/5000 [1:38:44<05:21,  1.07s/it]

In [23]:
# Sum the label counts over all dicts in `returned`
# (presumably one dict per worker - confirm against the `mp` helper).
combined_d = defaultdict(int)
for d in returned:
    for k, v in d.items():
        combined_d[k] += v
# Total number of examples written per label.
combined_d

defaultdict(int, {0: 92074635, 1: 1537795})