In [277]:
import os
import random as _random
import numpy
from sklearn.model_selection import KFold

In [278]:
RND = 123
N_FOLDS = 10
N_HOLDOUT_SAMPLES = 4000
N_HOLDOUT_FILES = int(N_HOLDOUT_SAMPLES / 12. * 11)
print('N_HOLDOUT_FILES:', N_HOLDOUT_FILES)
N_TRAIN = 1000000
OUT_DIR = 'out'

N_HOLDOUT_FILES: 3666


In [279]:
_random.seed(RND)
numpy.random.seed(RND)

In [280]:
%run '../data-generator.ipynb'

In [221]:
# remove/create dirs
os.makedirs(OUT_DIR, exist_ok=True)
shutil.rmtree('%s/holdout' % OUT_DIR, ignore_errors=True)
os.makedirs('%s/holdout' % OUT_DIR, exist_ok=True)
shutil.rmtree('%s/val' % OUT_DIR, ignore_errors=True)
os.makedirs('%s/val' % OUT_DIR, exist_ok=True)

In [281]:
dg = DataGenerator('/d2/caches/tf-speech/train/audio')

# adjust mixing options
for k, v in dg.mix_with.items():
    if '_background_noise_' in k:
        v['probability'] = 0.1

In [223]:
# get some stats on input data
counts = {k: len(v) for k, v in dg.input_files.items()}
print(counts)
del counts['unknown']
print('total samples for known words:', sum(counts.values()))
print('avg per known label:', sum(counts.values()) / len(counts))

{'right': 2367, 'no': 2375, 'stop': 2380, 'left': 2353, 'off': 2357, 'on': 2367, 'down': 2359, 'yes': 2377, 'up': 2375, 'unknown': 41039, 'go': 2372}
total samples for known words: 23682
avg per known label: 2368.2


In [224]:
# create list of all input files
input_files = []
for label, files in dg.input_files.items():
    input_files.extend(files)

np.random.shuffle(input_files)

# balance number of 'unknown' labels
MAX_UNKNOWN = 2357
unknown = 0
for i, f in enumerate(input_files):
    if dg.get_label(f) == 'unknown':
        unknown += 1
        if unknown > MAX_UNKNOWN:
            input_files[i] = None

input_files = list(filter(lambda x: x is not None, input_files))
dg.input_files = input_files

In [225]:
print('numer of unknown label files:', len(dg.input_files['unknown']))

numer of unknown label files: 2357


In [233]:
# generate houldout set
np.random.shuffle(input_files)
holdout_files = input_files[:N_HOLDOUT_FILES]

dg.val_files = {file: dg.get_label(file) for file in holdout_files}

holdout_X, holdout_Y, holdout_files = dg.generate_val_set(
    n=N_HOLDOUT_SAMPLES, return_files_list=True)

np.save('%s/holdout/holdout_files.npy' % OUT_DIR, holdout_files)
np.save('%s/holdout/holdout_X.npy' % OUT_DIR, holdout_X)
np.save('%s/holdout/holdout_Y.npy' % OUT_DIR, holdout_Y)

In [235]:
# remove holdout files from input
input_files = list(set(input_files) ^ set(holdout_files))
dg.input_files = input_files

In [243]:
k = KFold(n_splits=N_FOLDS)
s = k.split(input_files)

i = 0

for train, test in s:
    print('fold %d/%d of size %d...' % (1 + i, N_FOLDS, len(test)))

    dg.val_files = {
        file: dg.get_label(file)
        for file in np.array(input_files)[test]
    }
    
    n_val_samples = int(len(test)/11.*12)
    print('n_val_samples:', n_val_samples)

    val_X, val_Y, val_files = dg.generate_val_set(
        n=N_VAL_SAMPLES, return_files_list=True)
    np.save('%s/val/val_files_%d.npy' % (OUT_DIR, i), val_files)
    np.save('%s/val/val_X_%d.npy' % (OUT_DIR, i), val_X)
    np.save('%s/val/val_Y_%d.npy' % (OUT_DIR, i), val_Y)

    i += 1
    
print('done')

fold 1/10 of size 2604...
n_val_samples: 2840
fold 2/10 of size 2604...
n_val_samples: 2840


KeyboardInterrupt: 

In [None]:
def _gen_train_set():
    dg.input_files = None
    dg.val_files = {file: dg.get_label(file) for file in holdout_files}

    dg.generate_train_set(
        n_total=N_TRAIN,
        n_per_job=1000,
        n_pools=16,
        X_file='%s/train_X.mem' % OUT_DIR,
        Y_file='%s/train_Y.mem' % OUT_DIR,
        files_file='%s/files.npy' % OUT_DIR,
        tmp_dir='%s/train_tmp' % OUT_DIR)

In [9]:
%time _gen_train_set()