In [1]:
from glob import glob
from collections import defaultdict
import pandas as pd

In [2]:
# Collect every FLAC path matching LibriSpeech/*/*/*/*.flac and show how many were found.
librispeech = glob('LibriSpeech/*/*/*/*.flac')
len(librispeech)

33862

In [3]:
# Speaker id -> lowercase gender code, parsed from the '|'-separated table in SPEAKERS.TXT.
librispeech_speakers = {}

with open('LibriSpeech/SPEAKERS.TXT') as fopen:
    raw_text = fopen.read()

# Keep only what follows the last ';' in the file, then drop the rest of that line.
table_rows = raw_text.split(';')[-1].split('\n')[1:]
for row in table_rows:
    fields = row.split('|')
    # Rows with fewer than three fields (e.g. blank lines) carry no speaker entry.
    if len(fields) <= 2:
        continue
    speaker_id, gender = fields[0], fields[1]
    librispeech_speakers[speaker_id.strip()] = gender.strip().lower()

In [4]:
def get_speaker_librispeech(file):
    """Return the speaker id encoded in a LibriSpeech file path.

    The id is the part of the file name (the text after the last '/')
    that precedes the first '-'.
    """
    basename = file.rsplit('/', 1)[-1]
    speaker_id, _, _ = basename.partition('-')
    return speaker_id

In [5]:
# Sanity check: gender code recorded for the speaker of the first LibriSpeech file.
librispeech_speakers.get(get_speaker_librispeech(librispeech[0]))

'm'

In [6]:
# One-time download of the VoxCeleb2 metadata CSV read by the next cell (uncomment to run):
# !wget http://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/vox2_meta.csv

In [7]:
# Load the speaker metadata table and list the VoxCeleb WAV files.
df = pd.read_csv('vox2_meta.csv')
# NOTE(review): the pattern contains no '**', so `recursive = True` has no effect;
# only files directly inside voxceleb-wav/ are matched.
voxceleb = glob('voxceleb-wav/*.wav', recursive = True)
len(voxceleb)

1092009

In [8]:
def get_speaker_voxceleb(file):
    """Return the speaker id encoded in a VoxCeleb WAV file path.

    The file name (the text after the last '/') is split on '-' and the
    third field is returned. Raises IndexError when the name has fewer
    than three '-'-separated fields.
    """
    basename = file.rsplit('/', 1)[-1]
    fields = basename.split('-')
    return fields[2]

In [10]:
# Speaker id (first column of the metadata table) -> gender code (second-to-last column),
# with surrounding whitespace stripped from both.
voxceleb_speakers = {
    speaker_id.strip(): gender.strip()
    for speaker_id, gender in zip(df.iloc[:, 0], df.iloc[:, -2])
}

In [11]:
# Sanity check: speaker id extracted from the first VoxCeleb file name.
get_speaker_voxceleb(voxceleb[0])

'id03556'

In [12]:
# Sanity check: gender code recorded for the speaker of the first VoxCeleb file.
voxceleb_speakers.get(get_speaker_voxceleb(voxceleb[0]))

'f'

In [14]:
import soundfile as sf
from scipy import interpolate
import numpy as np
import librosa

def resample(data, old_samplerate, new_samplerate):
    """Resample `data` from `old_samplerate` to `new_samplerate` by linear interpolation.

    The first axis of `data` is treated as time. The output has
    int(len(data) * new_samplerate / old_samplerate) samples spread evenly
    over the same duration. No low-pass filtering is applied.
    """
    n_old = data.shape[0]
    duration = n_old / old_samplerate
    n_new = int(n_old * new_samplerate / old_samplerate)

    source_times = np.linspace(0, duration, n_old)
    target_times = np.linspace(0, duration, n_new)

    # interp1d interpolates along the last axis, hence the transposes around it.
    return interpolate.interp1d(source_times, data.T)(target_times).T

def read_wav(file, sample_rate = 16000):
    """Read an audio file with soundfile and return (samples, sample_rate).

    When the file's native rate differs from `sample_rate`, the samples are
    passed through `resample` first. The returned rate is always `sample_rate`.
    """
    audio, native_rate = sf.read(file)
    if native_rate == sample_rate:
        return audio, sample_rate
    return resample(audio, native_rate, sample_rate), sample_rate

def sampling(combined, frame_duration_ms = 700, sample_rate = 16000):
    """Yield consecutive frames of `combined`, each `frame_duration_ms` long.

    Every frame holds int(sample_rate * frame_duration_ms / 1000) samples,
    except the last one, which holds whatever remains (if anything).

    Raises:
        ValueError: on first iteration, when the frame length works out to
            zero or fewer samples. Without this check the offset would never
            advance and the generator would yield forever.
    """
    n = int(sample_rate * (frame_duration_ms / 1000.0))
    if n <= 0:
        raise ValueError(
            f'frame length must be at least one sample, got {n} '
            f'(frame_duration_ms={frame_duration_ms}, sample_rate={sample_rate})'
        )
    offset = 0
    while offset + n <= len(combined):
        yield combined[offset : offset + n]
        offset += n
    # Trailing partial frame.
    if offset < len(combined):
        yield combined[offset:]

In [17]:
from scipy.special import expit

def sox_reverb(
    y, reverberance = 1, hf_damping = 1, room_scale = 1, stereo_depth = 1
):
    """Apply a pysndfx reverb effect to `y` and return the processed signal.

    The four keyword parameters are forwarded to `AudioEffectsChain.reverb`;
    `pre_delay`, `wet_gain` and `wet_only` are fixed at 20, 0 and False.
    """
    from pysndfx import AudioEffectsChain

    reverb_settings = dict(
        reverberance = reverberance,
        hf_damping = hf_damping,
        room_scale = room_scale,
        stereo_depth = stereo_depth,
        pre_delay = 20,
        wet_gain = 0,
        wet_only = False,
    )
    effect_chain = AudioEffectsChain().reverb(**reverb_settings)
    return effect_chain(y)


def sox_augment_low(
    y,
    min_bass_gain = 5,
    reverberance = 1,
    hf_damping = 1,
    room_scale = 1,
    stereo_depth = 1,
    negate = 1,
):
    """Apply a pysndfx low-shelf filter followed by reverb to `y`.

    The low-shelf gain is `min_bass_gain`, sign-flipped when `negate` is
    truthy; its frequency and slope are fixed at 300 and 0.1. The reverb
    parameters are forwarded to `AudioEffectsChain.reverb`, with
    `pre_delay`, `wet_gain` and `wet_only` fixed at 20, 0 and False.
    """
    from pysndfx import AudioEffectsChain

    shelf_gain = -min_bass_gain if negate else min_bass_gain
    reverb_settings = dict(
        reverberance = reverberance,
        hf_damping = hf_damping,
        room_scale = room_scale,
        stereo_depth = stereo_depth,
        pre_delay = 20,
        wet_gain = 0,
        wet_only = False,
    )

    effect_chain = AudioEffectsChain().lowshelf(
        gain = shelf_gain, frequency = 300, slope = 0.1
    )
    effect_chain = effect_chain.reverb(**reverb_settings)
    return effect_chain(y)


def sox_augment_high(
    y,
    min_bass_gain = 5,
    reverberance = 1,
    hf_damping = 1,
    room_scale = 1,
    stereo_depth = 1,
    negate = 1,
):
    """Apply a pysndfx high-shelf filter followed by reverb to `y`.

    `min_bass_gain` is sign-flipped when `negate` is truthy; the shelf gain
    is then minus that value scaled by (1 - expit(max(y))). The shelf
    frequency and slope are fixed at 300 and 0.1. The reverb parameters are
    forwarded to `AudioEffectsChain.reverb`, with `pre_delay`, `wet_gain`
    and `wet_only` fixed at 20, 0 and False.
    """
    from pysndfx import AudioEffectsChain

    signed_gain = -min_bass_gain if negate else min_bass_gain
    shelf_gain = -signed_gain * (1 - expit(np.max(y)))
    reverb_settings = dict(
        reverberance = reverberance,
        hf_damping = hf_damping,
        room_scale = room_scale,
        stereo_depth = stereo_depth,
        pre_delay = 20,
        wet_gain = 0,
        wet_only = False,
    )

    effect_chain = AudioEffectsChain().highshelf(
        gain = shelf_gain, frequency = 300, slope = 0.1
    )
    effect_chain = effect_chain.reverb(**reverb_settings)
    return effect_chain(y)


def sox_augment_combine(
    y,
    min_bass_gain_low = 5,
    min_bass_gain_high = 5,
    reverberance = 1,
    hf_damping = 1,
    room_scale = 1,
    stereo_depth = 1,
):
    """Apply pysndfx low-shelf, high-shelf and reverb effects to `y`, in that order.

    The low shelf uses gain `min_bass_gain_low`, the high shelf uses gain
    `-min_bass_gain_high`; both use frequency 300 and slope 0.1. The reverb
    parameters are forwarded to `AudioEffectsChain.reverb`, with
    `pre_delay`, `wet_gain` and `wet_only` fixed at 20, 0 and False.
    """
    from pysndfx import AudioEffectsChain

    reverb_settings = dict(
        reverberance = reverberance,
        hf_damping = hf_damping,
        room_scale = room_scale,
        stereo_depth = stereo_depth,
        pre_delay = 20,
        wet_gain = 0,
        wet_only = False,
    )

    effect_chain = AudioEffectsChain()
    effect_chain = effect_chain.lowshelf(
        gain = min_bass_gain_low, frequency = 300, slope = 0.1
    )
    effect_chain = effect_chain.highshelf(
        gain = -min_bass_gain_high, frequency = 300, slope = 0.1
    )
    effect_chain = effect_chain.reverb(**reverb_settings)
    return effect_chain(y)


def random_pitch(sample, low = 0.5, high = 1.0):
    """Return a copy of `sample` re-read at a random speed, keeping the original length.

    A factor is drawn uniformly from [low, high); the signal is linearly
    interpolated at steps of 1 / factor samples. The result is written into
    a zeroed copy of `sample`, so a shorter result leaves trailing zeros and
    a longer one is truncated.
    """
    n_samples = len(sample)
    length_change = np.random.uniform(low = low, high = high)
    step = 1.0 / length_change

    warped = np.interp(
        np.arange(0, n_samples, step),
        np.arange(0, n_samples),
        sample,
    )

    # Output buffer with the input's shape and dtype, cleared in place.
    output = sample.copy()
    output *= 0
    keep = min(output.shape[0], warped.shape[0])
    output[:keep] = warped[:keep]
    return output


def random_amplitude(sample, low = 1.5, high = 3):
    """Return a copy of `sample` scaled by a gain drawn uniformly from [low, high)."""
    gain = np.random.uniform(low = low, high = high)
    return sample.copy() * gain


def random_stretch(sample, low = 0.5, high = 1.3):
    """Time-stretch `sample` by a rate drawn uniformly from [low, high).

    The input is converted to float before being handed to
    `librosa.effects.time_stretch`, and is not modified. The output length
    generally differs from the input length.
    """
    rate = np.random.uniform(low = low, high = high)
    # `rate` is passed by keyword: newer librosa releases reject it as a
    # positional argument, while the keyword form is accepted by older ones too.
    return librosa.effects.time_stretch(sample.astype('float'), rate = rate)

def add_uniform_noise(sample, power = 0.01):
    """Return `sample` as float64 with Gaussian noise of random amplitude added.

    The noise amplitude is `power` times a uniform draw from [0, 1) times the
    maximum value of `sample`; the noise itself is standard-normal, one value
    per element along the first axis. The input is not modified.
    """
    peak = np.amax(sample)
    noise_amp = power * np.random.uniform() * peak
    gaussian = np.random.normal(size = sample.shape[0])
    return sample.copy().astype('float64') + noise_amp * gaussian


def add_noise(sample, noise, random_sample = True, factor = 0.1):
    """Return a copy of `sample` with `noise`, scaled by `factor`, mixed in.

    When `noise` is shorter than `sample` it is tiled until it is long
    enough. Otherwise, if `random_sample` is truthy, a random starting
    offset into `noise` is chosen such that enough samples remain. Only the
    first len(sample) noise values are used.
    """
    target_len = len(sample)
    mixed = sample.copy()
    if target_len > len(noise):
        repeats = int(np.ceil(target_len / len(noise)))
        noise = np.tile(noise, repeats)
    elif random_sample:
        start = np.random.randint(0, len(noise) - target_len + 1)
        noise = noise[start:]
    return mixed + noise[:target_len] * factor

# NOTE(review): this repeats the `sampling` definition from an earlier cell; being the
# later one, it is the definition in effect once this cell has run.
def sampling(combined, frame_duration_ms = 700, sample_rate = 16000):
    """Yield consecutive frames of `combined`, each `frame_duration_ms` long.

    Every frame holds int(sample_rate * frame_duration_ms / 1000) samples,
    except the last one, which holds whatever remains (if anything).

    Raises:
        ValueError: on first iteration, when the frame length works out to
            zero or fewer samples. Without this check the offset would never
            advance and the generator would yield forever.
    """
    n = int(sample_rate * (frame_duration_ms / 1000.0))
    if n <= 0:
        raise ValueError(
            f'frame length must be at least one sample, got {n} '
            f'(frame_duration_ms={frame_duration_ms}, sample_rate={sample_rate})'
        )
    offset = 0
    while offset + n <= len(combined):
        yield combined[offset : offset + n]
        offset += n
    # Trailing partial frame.
    if offset < len(combined):
        yield combined[offset:]

In [18]:
def calc(signal):
    """Return a randomly augmented version of `signal`.

    One of five pysndfx-based effects is picked at random with random
    parameters. Then, with probability 1/2, Gaussian noise of random
    amplitude is added, and with probability 1/4 a random clip from
    `not_music` is mixed in at a low random factor.
    """
    effect = random.randint(0, 4)

    # Random draws are made in the same order as the keyword arguments they feed.
    if effect == 0:
        bass_gain = random.randint(25, 50)
        reverb_amount = random.randint(0, 80)
        room = random.randint(0, 50)
        augmented = sox_augment_high(
            signal,
            min_bass_gain = bass_gain,
            reverberance = reverb_amount,
            hf_damping = 10,
            room_scale = room,
            negate = 1,
        )
    elif effect == 1:
        bass_gain = random.randint(25, 70)
        reverb_amount = random.randint(0, 80)
        room = random.randint(0, 50)
        augmented = sox_augment_high(
            signal,
            min_bass_gain = bass_gain,
            reverberance = reverb_amount,
            hf_damping = 10,
            room_scale = room,
            negate = 0,
        )
    elif effect == 2:
        bass_gain = random.randint(5, 30)
        reverb_amount = random.randint(0, 80)
        room = random.randint(0, 50)
        flip_sign = random.randint(0, 1)
        augmented = sox_augment_low(
            signal,
            min_bass_gain = bass_gain,
            reverberance = reverb_amount,
            hf_damping = 10,
            room_scale = room,
            negate = flip_sign,
        )
    elif effect == 3:
        high_gain = random.randint(25, 70)
        low_gain = random.randint(5, 30)
        reverb_amount = random.randint(0, 80)
        room = random.randint(0, 90)
        augmented = sox_augment_combine(
            signal,
            min_bass_gain_high = high_gain,
            min_bass_gain_low = low_gain,
            reverberance = reverb_amount,
            hf_damping = 10,
            room_scale = room,
        )
    elif effect == 4:
        reverb_amount = random.randint(10, 80)
        room = random.randint(10, 90)
        augmented = sox_reverb(
            signal,
            reverberance = reverb_amount,
            hf_damping = 10,
            room_scale = room,
        )

    if random.randint(0, 1):
        noise_power = random.uniform(0.005, 0.015)
        augmented = add_uniform_noise(augmented, power = noise_power)

    if random.random() > 0.75:
        background_path = random.choice(not_music)
        background = read_wav(background_path)[0]
        mix_factor = random.uniform(0.005, 0.01)
        augmented = add_noise(augmented, background, factor = mix_factor)

    return augmented

In [39]:
# Class names; a label's position in this list is the integer target written to the TFRecords.
actual_labels = ['female', 'male', 'not a gender']
# Translates the single-letter gender codes from the speaker tables into class names.
mapping = {'f': 'female', 'm': 'male'}

In [40]:
import random

# Draw a random subset from each speech corpus and keep only the files whose
# speaker has a known gender code.
# NOTE(review): `random` is not seeded, so the selection differs between runs.
files, labels = [], []

corpora = (
    (librispeech, 10000, librispeech_speakers, get_speaker_librispeech),
    (voxceleb, 100000, voxceleb_speakers, get_speaker_voxceleb),
)
for pool, n_draw, gender_by_speaker, speaker_of in corpora:
    for path in random.sample(pool, n_draw):
        gender_code = gender_by_speaker.get(speaker_of(path))
        if not gender_code:
            continue
        labels.append(mapping[gender_code])
        files.append(path)

len(files), len(labels)

(110000, 110000)

In [41]:
# Clips for the 'not a gender' class. Despite the name, `not_music` also includes
# everything under musan/music. `calc` draws background noise from the same list.
not_music = (
    glob('not-music/clean-wav/*.wav')
    + glob('musan/music/**/*.wav', recursive = True)
    + glob('musan/noise/**/*.wav', recursive = True)
)
# NOTE(review): re-running this cell appends the clips to `files` / `labels` again.
files = [*files, *not_music]
labels = [*labels, *(['not a gender'] * len(not_music))]

In [42]:
from sklearn.utils import shuffle
# Shuffle paths and labels together so they stay aligned.
# NOTE(review): no random_state is passed, so the order differs between runs.
files, labels = shuffle(files, labels)

In [43]:
# Load one example with read_wav's default sample rate, keep at most the first
# sr * 10 samples, and display it together with its label.
y, sr = read_wav(files[-2])
y = y[:sr * 10]
y, sr, labels[-2]

(array([-0.0184021 ,  0.07171631,  0.16143799, ..., -0.02212524,
        -0.01934814, -0.01745605]),
 16000,
 'female')

In [44]:
# Smoke-test the random augmentation on the clip held in `y`.
calc(y)

array([-0.10992366,  0.44814456,  0.90128332, ..., -0.01737537,
        0.01271827,  0.03356281])

In [45]:
import os
import tensorflow as tf

# Directory that receives the TFRecord shards written by the workers.
DATA_DIR = os.path.expanduser('gender/data')
tf.gfile.MakeDirs(DATA_DIR)

# Clear files left over from a previous run. This is done in Python rather than
# by shelling out to `rm`, so it follows DATA_DIR, works once the directory
# exists, and does nothing (instead of printing a shell error) when it is empty.
for stale_path in glob(os.path.join(DATA_DIR, '*')):
    # Like `rm` without -r, leave real sub-directories alone.
    if os.path.isdir(stale_path) and not os.path.islink(stale_path):
        continue
    os.remove(stale_path)

In [46]:
from tqdm import tqdm
from malaya_speech.train import prepare_data
from collections import defaultdict

def _write_example(writer, counts, samples, label):
    """Serialize one labelled segment to `writer` and count it; segments of 50 samples or fewer are skipped."""
    if len(samples) > 50:
        example = prepare_data.to_example({'inputs': samples.tolist(),
                                           'targets': [actual_labels.index(label)]})
        writer.write(example.SerializeToString())
        counts[label] += 1


def loop(files, dupe_factor = 1):
    """Write one TFRecord shard from a chunk of (path, label) pairs.

    Args:
        files: a pair `(chunk, no)`; `chunk` is a sequence of (path, label)
            items and `no` is used in the shard file name
            `{DATA_DIR}/part-{no}.tfrecords`.
        dupe_factor: for files whose label is not 'not a gender', the number
            of passes over the file and also the number of augmented copies
            written per segment. 'not a gender' files get a single pass and
            no augmentation.

    Returns:
        A one-element list holding a dict of label -> number of examples written.

    Each pass cuts the audio into frames of a random duration (1000-2000 ms
    for gendered files, 200-2000 ms otherwise). Gendered segments are written
    `dupe_factor` times through `calc` and once unmodified. Errors are printed
    and skipped (best effort): an error inside a segment drops only the rest
    of that segment, any other error drops the rest of the file.
    """
    files, no = files
    fname = f'{DATA_DIR}/part-{no}.tfrecords'
    writer = tf.python_io.TFRecordWriter(fname)
    counts = defaultdict(int)
    try:
        for file in tqdm(files):
            try:
                wav = read_wav(file[0])[0]
                is_gender = file[1] != 'not a gender'
                passes = dupe_factor if is_gender else 1
                minimum = 1000 if is_gender else 200
                for _ in range(passes):
                    for s in sampling(wav, random.randint(minimum, 2000)):
                        try:
                            if is_gender:
                                for _ in range(dupe_factor):
                                    _write_example(writer, counts, calc(s), file[1])
                            # The unmodified segment is always written as well.
                            _write_example(writer, counts, s, file[1])
                        except Exception as e:
                            print(e)
            except Exception as e:
                print(e)
    finally:
        # Close the shard even if the run is interrupted (e.g. KeyboardInterrupt).
        writer.close()
    return [counts]

In [47]:
# Pair every file path with its label; `loop` reads each item as (path, label).
combined_all = list(zip(files, labels))

In [48]:
import mp
# Run `loop` over combined_all using 10 cores through the `mp` helper module.
# NOTE(review): presumably each worker receives (chunk, index) and the per-worker
# return lists are concatenated - confirm against mp.multiprocessing.
returned = mp.multiprocessing(combined_all, loop, cores = 10)

100%|██████████| 11202/11202 [1:44:11<00:00,  1.79it/s] 
100%|██████████| 6/6 [00:04<00:00,  1.37it/s] 2.23it/s]
100%|██████████| 11202/11202 [1:45:13<00:00,  1.77it/s]
100%|██████████| 11202/11202 [1:45:31<00:00,  1.77it/s]
100%|██████████| 11202/11202 [1:45:38<00:00,  1.77it/s]
100%|██████████| 11202/11202 [1:46:02<00:00,  1.76it/s]
100%|██████████| 11202/11202 [1:46:05<00:00,  1.76it/s]
100%|██████████| 11202/11202 [1:46:11<00:00,  1.76it/s]
100%|██████████| 11202/11202 [1:46:13<00:00,  1.76it/s]
100%|██████████| 11202/11202 [1:46:19<00:00,  1.76it/s]
100%|██████████| 11202/11202 [1:46:21<00:00,  1.76it/s]


In [49]:
# Sum the per-worker example counts into one label -> total mapping and display it.
combined_d = defaultdict(int)
for worker_counts in returned:
    for label, n_examples in worker_counts.items():
        combined_d[label] += n_examples
combined_d

defaultdict(int, {'male': 831810, 'not a gender': 503997, 'female': 525766})