In [1]:
import os
import shutil
import numpy as np

In [2]:
def copy_all_files(source_dir, dest_dir):
    '''Copy every regular file found directly inside source_dir into dest_dir.

    dest_dir is created when missing. Sub-directories of source_dir
    (for example a stray ".ipynb_checkpoints") are skipped, not copied.
    Returns None.
    '''
    # Without an existing destination directory shutil.copy would treat
    # dest_dir as a *file name* and overwrite it once per source file.
    os.makedirs(dest_dir, exist_ok=True)
    for filename in os.listdir(source_dir):
        full_filename = os.path.join(source_dir, filename)
        # shutil.copy raises on directories, so only regular files are copied.
        if os.path.isfile(full_filename):
            shutil.copy(full_filename, dest_dir)

In [3]:
def list_with_full_paths(source_dir):
    '''Return a list with one path per entry of source_dir, each prefixed with source_dir.'''
    full_paths = []
    for entry_name in os.listdir(source_dir):
        full_paths.append(os.path.join(source_dir, entry_name))
    return full_paths

In [4]:
def randomly_split_in_half(source_dir, seed=None):
    '''Shuffle the entries of source_dir and split them into two halves.

    Parameters:
        source_dir: directory whose entries are split.
        seed: optional seed for a private random generator. When None
            (the default) the global numpy random state is used.

    Returns:
        A (first_half, second_half) tuple of lists of full paths. When the
        number of entries is odd, the second half gets the extra entry.
    '''
    # os.listdir order is arbitrary; sorting first makes the outcome depend
    # only on the random state, so a seeded split is reproducible.
    files = sorted(list_with_full_paths(source_dir))
    if seed is None:
        np.random.shuffle(files)
    else:
        np.random.RandomState(seed).shuffle(files)
    split_point = len(files) // 2
    return files[:split_point], files[split_point:]

In [5]:
# Root of the generated dataset and the directory of each of its three splits
data_dir = 'data'
set_dirs = list(map(
    lambda set_dir: os.path.join(data_dir, set_dir),
    ('train-set', 'dev-set', 'test-set'),
))

In [6]:
# Create the dataset root; exist_ok keeps the cell re-runnable
# (plain os.mkdir raises FileExistsError on a second run).
os.makedirs(data_dir, exist_ok=True)

In [7]:
# Create one directory per split; exist_ok keeps the cell re-runnable
# (plain os.mkdir raises FileExistsError on a second run).
for dir_ in set_dirs:
    os.makedirs(dir_, exist_ok=True)

In [8]:
# Target command words, grouped as switches, directions and digits
commands = (
    ['on', 'off']
    + ['left', 'right', 'up', 'down']
    + ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'zero']
)

In [9]:
# Create one sub-directory per command inside every split directory.
# exist_ok keeps the cell re-runnable (os.mkdir raises on a second run).
for outer_dir in set_dirs:
    for inner_dir in commands:
        os.makedirs(os.path.join(outer_dir, inner_dir), exist_ok=True)

In [10]:
# Both raw datasets live in a directory nested inside one of the same name
data_source_dir, noisy_data_source_dir = (
    os.path.join('data-raw', dataset_name, dataset_name)
    for dataset_name in ('augmented_dataset', 'augmented_dataset_verynoisy')
)

In [11]:
# Named handles for the three split directories under data_dir
training_set_dir, dev_set_dir, test_set_dir = (
    os.path.join(data_dir, split_name)
    for split_name in ('train-set', 'dev-set', 'test-set')
)

In [12]:
# Valid commands, training set:
# each command directory of the source dataset is copied into the
# directory of the same name in the training set.
for command_name in commands:
    copy_all_files(
        os.path.join(data_source_dir, command_name),
        os.path.join(training_set_dir, command_name),
    )

In [16]:
# Other commands, training set
# Every source directory that is not one of the target commands is pooled
# into a single "other" class.
training_set_other_commands_dir = os.path.join(training_set_dir, 'other')
# exist_ok keeps the cell re-runnable (os.mkdir raises on a second run).
os.makedirs(training_set_other_commands_dir, exist_ok=True)

training_set_all_other_files = list()
for dir_ in os.listdir(data_source_dir):
    full_dir = os.path.join(data_source_dir, dir_)
    # Skip target commands, and skip stray non-directory entries that
    # would make the listing below raise.
    if dir_ in commands or not os.path.isdir(full_dir):
        continue
    training_set_all_other_files += list_with_full_paths(full_dir)

# Files from different source directories may share a name, so each copy
# gets a unique running number as its file name.
for n, file in enumerate(training_set_all_other_files):
    shutil.copy(file, os.path.join(training_set_other_commands_dir, f'{n}.wav'))

In [18]:
# Dev and test set (noisy)
# Each directory of the noisy dataset is split randomly in half between the
# dev set and the test set; non-command directories are pooled into "other".
dev_set_other_commands_dir = os.path.join(dev_set_dir, 'other')
test_set_other_commands_dir = os.path.join(test_set_dir, 'other')
# exist_ok keeps the cell re-runnable (os.mkdir raises on a second run).
os.makedirs(dev_set_other_commands_dir, exist_ok=True)
os.makedirs(test_set_other_commands_dir, exist_ok=True)

other_commands_files = list()
for dir_ in os.listdir(noisy_data_source_dir):
    full_dir = os.path.join(noisy_data_source_dir, dir_)
    # Stray non-directory entries cannot be listed; skip them.
    if not os.path.isdir(full_dir):
        continue
    # If valid command, split into dev and test examples
    if dir_ in commands:
        dev_files, test_files = randomly_split_in_half(full_dir)
        for n, file in enumerate(dev_files):
            shutil.copy(file, os.path.join(dev_set_dir, dir_, f'{n}.wav'))
        for n, file in enumerate(test_files):
            shutil.copy(file, os.path.join(test_set_dir, dir_, f'{n}.wav'))
    # Otherwise, add to "other" files list
    else:
        other_commands_files += list_with_full_paths(full_dir)

# Split "other" files into dev and test set
np.random.shuffle(other_commands_files)
split_point = len(other_commands_files) // 2
dev_others, test_others = other_commands_files[:split_point], other_commands_files[split_point:]

# Copy "other" examples for dev and test sets; files are renumbered because
# names from different source directories may collide.
for n, file in enumerate(dev_others):
    shutil.copy(file, os.path.join(dev_set_other_commands_dir, f'{n}.wav'))
for n, file in enumerate(test_others):
    shutil.copy(file, os.path.join(test_set_other_commands_dir, f'{n}.wav'))

In [19]:
# Training set summary: number of entries in each class directory
print('Training set summary')
for class_name in os.listdir(training_set_dir):
    class_dir = os.path.join(training_set_dir, class_name)
    print(f'{class_name}: {len(os.listdir(class_dir))}')

Training set summary
seven: 1411
nine: 1144
four: 2400
one: 1276
off: 2244
left: 1485
zero: 1306
six: 1485
up: 1187
right: 1276
on: 2228
three: 1188
other: 18924
two: 902
.ipynb_checkpoints: 0
five: 1092
down: 1188
eight: 1113


In [20]:
# Dev set summary: number of entries in each class directory
print('Dev set summary')
for class_name in os.listdir(dev_set_dir):
    class_dir = os.path.join(dev_set_dir, class_name)
    print(f'{class_name}: {len(os.listdir(class_dir))}')

Dev set summary
seven: 705
nine: 572
four: 1200
one: 638
off: 1122
left: 742
zero: 653
six: 742
up: 593
right: 638
on: 1114
three: 594
other: 9462
two: 451
five: 546
down: 594
eight: 556


In [21]:
# Test set summary: number of entries in each class directory of the test set.
# The heading and the directory being listed both refer to the test set
# (they previously said "Dev set" and listed dev_set_dir).
print('Test set summary')
for dir_ in os.listdir(test_set_dir):
    dir_examples_count = len(os.listdir(os.path.join(test_set_dir, dir_)))
    print(f'{dir_}: {dir_examples_count}')

Dev set summary
seven: 706
nine: 572
four: 1200
one: 638
off: 1122
left: 743
zero: 653
six: 743
up: 594
right: 638
on: 1114
three: 594
other: 9462
two: 451
five: 546
down: 594
eight: 557
