In [72]:
%load_ext autoreload
%autoreload 2
%pylab inline
import pandas
import os
os.sys.path.append('../')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


## Speech Command Dataset
Twenty core command words were recorded, with most speakers saying each
of them five times. 

The core words are "Yes", "No", "Up", "Down", "Left",
"Right", "On", "Off", "Stop", "Go", "Zero", "One", "Two", "Three", "Four",
"Five", "Six", "Seven", "Eight", and "Nine". To help distinguish unrecognized
words, there are also ten auxiliary words, which most speakers only said once.
These include "Bed", "Bird", "Cat", "Dog", "Happy", "House", "Marvin", "Sheila",
"Tree", and "Wow".

In [88]:
# Index every utterance below `root`:
#   wav_files[uniqID] = (speaker id, spoken word, file name)
# where the word is the name of the directory holding the file and the
# speaker id / sequence id are the first and last '_'-separated parts of the
# file stem.
cnt = 0
root = "/home/muncok/DL/dataset/speech_commands/"
wav_files = {}
all_spks = set()
for dirpath, _, filenames in os.walk(root):
    # Skip the dataset root itself and the background-noise recordings.
    if dirpath == root or '_background_noise_' in dirpath:
        continue
    sent = os.path.basename(dirpath)
    for name in filenames:
        # splitext removes the extension as a suffix; str.rstrip('.wav') would
        # instead strip any trailing run of the characters '.', 'w', 'a', 'v'.
        stem, ext = os.path.splitext(name)
        if ext != '.wav':
            # Stray non-audio files would otherwise break the unpacking below.
            continue
        spk, _, seqID = stem.split('_')
        uniqID = sent + spk + seqID
        all_spks.add(spk)
        wav_files[uniqID] = (spk, sent, name)
        cnt += 1
# Downstream cells expect a list of distinct speaker ids.
all_spks = list(all_spks)

In [155]:
# One row per utterance, indexed by uniqID. The columns are named at
# construction time so the frame has the expected schema even when
# `wav_files` is empty (an empty frame with no columns cannot be renamed to
# three columns afterwards).
df = pandas.DataFrame.from_dict(wav_files, orient='index', columns=['spk', 'sent', 'file'])

In [165]:
# Name the three tuple fields stored per utterance: speaker, word, file name.
df.columns = pandas.Index(['spk', 'sent', 'file'])

In [159]:
def _read_split_ids(list_path):
    """Return the uniqIDs listed in a split file.

    Every non-blank line is parsed as ``<sent>/<file>`` where the file stem
    has three '_'-separated parts; the first is the speaker id and the last
    the sequence id. The uniqID is ``sent + spk + seqID``, the same key used
    for the utterance table.

    Raises ValueError if a non-blank line does not have that shape.
    """
    ids = []
    with open(list_path, 'r') as f:
        for line in f:
            entry = line.strip()
            if not entry:
                # Tolerate blank / trailing empty lines.
                continue
            sent, file = entry.split('/')
            # Drop the extension as a suffix rather than via rstrip's
            # character-set semantics.
            stem, _ext = os.path.splitext(file)
            spk, _, seqID = stem.split('_')
            ids.append(sent + spk + seqID)
    return ids


# read set division lists
test_list = _read_split_ids("/home/muncok/DL/dataset/speech_commands/testing_list.txt")
val_list = _read_split_ids("/home/muncok/DL/dataset/speech_commands/validation_list.txt")

In [160]:
# Label every utterance with its split: ids found in the test list are 'test',
# ids found in the validation list are 'val', everything else is 'train'.
# Membership is tested against sets so the pass over `df.index` stays linear
# instead of rescanning the full lists for every row.
test_ids = set(test_list)
val_ids = set(val_list)
set_columns = [
    'test' if idx in test_ids else 'val' if idx in val_ids else 'train'
    for idx in df.index
]

In [161]:
# Attach the per-utterance split label as a new 'set' column.
df = df.assign(**{'set': set_columns})

In [167]:
# Check that, for every word, no speaker shows up in more than one of the
# train / val / test splits.
all_sents = [
             "Yes", "No", "Up", "Down", "Left",
             "Right", "On", "Off", "Stop", "Go", "Zero", "One", "Two", "Three", "Four",
             "Five", "Six", "Seven", "Eight", "Nine",
             "Bed", "Bird", "Cat", "Dog", "Happy", "House", "Marvin", "Sheila",
             "Tree","Wow"
            ]
# Lower-cased so the names can be compared against the `sent` column.
all_sents = [word.lower() for word in all_sents]

for sent in all_sents:
    sent_rows = df[df.sent == sent]
    train_sent = set(sent_rows[sent_rows.set == 'train'].spk)
    val_sent = set(sent_rows[sent_rows.set == 'val'].spk)
    test_sent = set(sent_rows[sent_rows.set == 'test'].spk)
    # A three-way intersection is already empty when a single pair is
    # disjoint, so it cannot prove disjointness; each pair is checked on its own.
    assert not train_sent & val_sent, "train/val share speakers for '{}'".format(sent)
    assert not train_sent & test_sent, "train/test share speakers for '{}'".format(sent)
    assert not val_sent & test_sent, "val/test share speakers for '{}'".format(sent)

In [181]:
# Report the dataset-wide totals: distinct speakers and distinct words.
for template, items in (
    ("number of speakers: {}", all_spks),
    ("kinds of sents: {}", all_sents),
):
    print(template.format(len(items)))

number of speakers: 1881
kinds of sents: 30


### KWS 

In [176]:
# Keyword of interest, and the ten speakers with the most recordings of it.
# These speakers are removed from the background pool in the next cell.
target_sent = 'on'
target_sent_rows = df[df.sent == target_sent]
excluded_spks = target_sent_rows.spk.value_counts().index[:10]

In [182]:
import random

# Background pool: every utterance from speakers other than the excluded ones.
remaining_spks = set(all_spks) - set(excluded_spks)
backg_spks = list(remaining_spks)
from_backg_spk = df.spk.isin(backg_spks)
backg_speechs = df[from_backg_spk]

In [183]:
# Split the background pool by the precomputed 'set' column.
backg_train, backg_val, backg_test = (
    backg_speechs[backg_speechs.set == split] for split in ('train', 'val', 'test')
)

In [196]:
import csv

# Write one keyword-spotting manifest per split. Each manifest line is
# "<wav path>,<label>": label 2 for the target keyword, label 1 for any other
# word. (The original note says labels 0 and 1 are kept for special purposes.)
# All target-keyword rows are kept; other words are subsampled to
# `unkown_prob` times the number of target rows.
valid_sent = all_sents.index(target_sent)
unkown_prob = 0.1
tags = ['train', 'val', 'test']
sets = {'train': backg_train, 'val': backg_val, 'test': backg_test}
for tag in tags:
    samples = []
    # Start from an empty pool for every split; a pool shared across splits
    # lets leftover train files end up in the val/test manifests.
    unkown_files = []
    with open('kws_command_{}_manifest.csv'.format(tag), 'w') as f:
        for index, row in sets[tag].iterrows():
            file_path = os.path.join(root, row.sent, row.file)
            label = all_sents.index(row.sent)
            if label != valid_sent:
                unkown_files.append(','.join([file_path, str(1)]))
            else:
                samples.append(','.join([file_path, str(2)]))
        # Never ask for more non-target rows than this split actually has.
        nb_unkowns = min(int(len(samples) * unkown_prob), len(unkown_files))
        random.shuffle(unkown_files)
        for _ in range(nb_unkowns):
            samples.append(unkown_files.pop())
        random.shuffle(samples)
        # With '\n' as the field delimiter, a single writerow call emits one
        # sample per line.
        writer = csv.writer(f, delimiter='\n', quoting=csv.QUOTE_NONE)
        writer.writerow(samples)

### SV

In [236]:
# Number of utterances per speaker in the background pool, largest first.
spk_column = backg_speechs['spk']
per_speaker = spk_column.groupby(backg_speechs.spk).agg(['count'])
speech_counts = per_speaker.sort_values('count', ascending=False)

In [202]:
# Random speaker-verification split of the background pool:
# shuffle all rows, then take 10% for test, 10% for val and the rest for train.
sv_speechs = backg_speechs.sample(frac=1.0)
length = len(sv_speechs)
bound = int(0.1 * length)
sv_test = sv_speechs[:bound]
sv_val = sv_speechs[bound:2 * bound]
sv_train = sv_speechs[2 * bound:]
# Convert to a plain list: the manifest cell looks speakers up with
# `speakers.index(...)`, which a NumPy array (the return type of `.unique()`)
# does not provide.
speakers = pandas.concat([sv_test.spk, sv_train.spk, sv_val.spk]).unique().tolist()
# Speakers whose position in `speakers` is >= valid_spk are written with the
# "unknown" label by the manifest cell.
valid_spk = 1000
unkown_prob = 0.1

In [None]:
# Alternative SV setup: keep only the 20 most frequent words and reuse the
# train/val/test assignment stored in the 'set' column.
sent_counts = df.sent.value_counts()
chosen_sent = sent_counts.index[:20].tolist()
in_chosen = df.sent.isin(chosen_sent)
sv_test, sv_val, sv_train = (
    df[in_chosen & (df.set == split)] for split in ('test', 'val', 'train')
)
speakers = pandas.concat([sv_test.spk, sv_train.spk, sv_val.spk]).unique().tolist()
# With an infinite threshold no speaker is treated as unknown, and no unknown
# samples are mixed in.
valid_spk = np.inf
unkown_prob = 0.0

In [313]:
import csv

# Write one speaker-verification manifest per split. Each manifest line is
# "<wav path>,<label>": a speaker whose position in `speakers` is below
# `valid_spk` gets label position+2, any other speaker gets label 1.
# (The original note says labels 0 and 1 are kept for special purposes.)
tags = ['train', 'val', 'test']
sets = {'train': sv_train, 'val': sv_val, 'test': sv_test}
# Position lookup built once; works whether `speakers` is a list or an array
# and avoids a linear search for every row.
spk_to_label = {spk: position for position, spk in enumerate(speakers)}
for tag in tags:
    samples = []
    # Start from an empty pool for every split; a pool shared across splits
    # lets leftover train files end up in the val/test manifests.
    unkown_files = []
    with open('sv_command_{}_manifest.csv'.format(tag), 'w') as f:
        for index, row in sets[tag].iterrows():
            file_path = os.path.join(root, row.sent, row.file)
            label = spk_to_label[row.spk]
            if label > valid_spk - 1:
                unkown_files.append(','.join([file_path, str(1)]))
            else:
                samples.append(','.join([file_path, str(label + 2)]))
        # Never ask for more unknown rows than this split actually has.
        nb_unkowns = min(int(len(samples) * unkown_prob), len(unkown_files))
        random.shuffle(unkown_files)
        for _ in range(nb_unkowns):
            samples.append(unkown_files.pop())
        random.shuffle(samples)
        # With '\n' as the field delimiter, a single writerow call emits one
        # sample per line.
        writer = csv.writer(f, delimiter='\n', quoting=csv.QUOTE_NONE)
        writer.writerow(samples)

### KWS+SV

In [258]:
# Target keyword and target speakers for the combined KWS+SV evaluation.
target_sent = "on"
target_sent_label = all_sents.index(target_sent)
# The target speakers are the ones that were left out of the background pool.
target_spks = list(excluded_spks)
is_target_spk = df.spk.isin(target_spks)
target_spks_speech = df[is_target_spk]

In [259]:
import csv

# Draw five enrollment utterances for each target speaker. The rows are
# filtered to the current speaker before sampling; sampling from the whole
# target pool would not give five utterances per speaker and could pick the
# same row in several iterations.
enroll_samples = pandas.concat(
    [target_spks_speech[target_spks_speech.spk == spk].sample(5) for spk in target_spks]
)

# Enrollment manifest: "<wav path>,<position of the speaker in target_spks>",
# one sample per line, in random order.
samples = []
with open('system_enroll_manifest.csv', 'w') as f:
    for index, row in enroll_samples.iterrows():
        file_path = os.path.join(root, row.sent, row.file)
        label = target_spks.index(row.spk)
        samples.append(','.join([file_path, str(label)]))
    random.shuffle(samples)
    # With '\n' as the field delimiter, a single writerow call emits one
    # sample per line.
    writer = csv.writer(f, delimiter='\n', quoting=csv.QUOTE_NONE)
    writer.writerow(samples)

In [267]:
# Positive trials: target speakers saying the target word, minus any utterance
# already used for enrollment.
says_target_word = target_spks_speech.sent == target_sent
not_enrolled = ~target_spks_speech.index.isin(enroll_samples.index)
pos_samples = target_spks_speech[says_target_word & not_enrolled]
nb_pos_samples = pos_samples.shape[0]

In [274]:
# Three kinds of negative trials, all drawn from the test split and each
# sampled to the same size as the positive set:
#   IC: other speakers, target word
#   IW: target speakers, other words
#   TW: other speakers, other words
in_test = df.set == 'test'
by_target_spk = df.spk.isin(target_spks)
is_target_word = df.sent == target_sent
IC_samples = df[in_test & ~by_target_spk & is_target_word].sample(nb_pos_samples)
IW_samples = df[in_test & by_target_spk & ~is_target_word].sample(nb_pos_samples)
TW_samples = df[in_test & ~by_target_spk & ~is_target_word].sample(nb_pos_samples)

In [275]:
# Sanity check: utterance ids that occur in more than one of the four trial
# groups. Intersecting all four at once is empty as soon as any single pair is
# disjoint, so overlaps are collected pair by pair. Expected result: set().
trial_groups = [
    set(pos_samples.index),
    set(IC_samples.index),
    set(IW_samples.index),
    set(TW_samples.index),
]
overlap = set()
for position, first in enumerate(trial_groups):
    for second in trial_groups[position + 1:]:
        overlap |= first & second
overlap

set()

In [276]:
# Pool the three negative trial groups into one frame.
negative_groups = [IC_samples, IW_samples, TW_samples]
neg_samples = pandas.concat(negative_groups)

In [277]:
import csv

# System test manifest: "<wav path>,<label>" with label 1 for positive trials
# and 0 for negative trials, one sample per line, in random order.
# The file name is a plain literal; it has no placeholder, so it must not
# depend on a loop variable left over from an earlier cell.
samples = []
with open('system_test_manifest.csv', 'w') as f:
    for label, trials in ((1, pos_samples), (0, neg_samples)):
        for index, row in trials.iterrows():
            file_path = os.path.join(root, row.sent, row.file)
            samples.append(','.join([file_path, str(label)]))
    random.shuffle(samples)
    # With '\n' as the field delimiter, a single writerow call emits one
    # sample per line.
    writer = csv.writer(f, delimiter='\n', quoting=csv.QUOTE_NONE)
    writer.writerow(samples)