In [98]:
import os, glob, re, math, random
import time
from scipy.io import wavfile
import cPickle as pickle
from python_speech_features import logfbank

#### load all the wav files

In [6]:
train_main_path = '/Users/matt.meng/data/speech_competition/train/audio'
wav_files = glob.glob(os.path.join(train_main_path, "*", "*.wav"))

In [24]:
wav_files[21]

'/Users/matt.meng/data/speech_competition/train/audio/bed/01b4757a_nohash_1.wav'

#### check the data format

In [22]:
hash_name = re.sub(r'_nohash_.*$', '', wav_files[20])

In [34]:
wav_files[26]

'/Users/matt.meng/data/speech_competition/train/audio/bed/035de8fe_nohash_0.wav'

In [42]:
hash_name = re.search('/([^/]+)sda', wav_files[26])

In [43]:
hash_name

#### categorize the file paths

In [61]:
def check_wav_files(wav_files, expected_sample_rate=16000, expected_sample_length=16000):
    word_dict = {}
    for wav_file in wav_files:
        sample_rate, samples = wavfile.read(wav_files[20])
        word = re.search('.*/([^/]+)/.*.wav', wav_file).group(1).lower()
        word_dict[word] = word_dict.get(word, 0) + 1
        if sample_rate != expected_sample_rate:
            print('for word {} at {}, the sample rate is different'.format(word, sample_rate))
        if len(samples) != expected_sample_length:
            print('for word {} at {}, the sample is different'.format(word, len(samples)))
    return word_dict


def categorize_wav_files_by_label(wav_files):
    '''
    categorize the wave file paths by label and hash.  `_background_noise_` does not have hash,
    use the file name.
    '''
    categorized_wav_files = {}
    categorized_sample_num = {}
    for wav_file in wav_files:
        label = re.search('.*/([^/]+)/.*.wav', wav_file).group(1).lower()
        categorized_sample_num[label] = categorized_sample_num.get(label, 0) + 1
        if label == '_background_noise_':
            hash_name = re.search('/([^/]+).wav', wav_file).group(1).lower()
        else:
            hash_name = re.search('/([^/]+)_nohash', wav_file).group(1).lower()

        if label not in categorized_wav_files:
            categorized_wav_files[label] = {}
            categorized_wav_files[label][hash_name] = [wav_file]
        else:
            if hash_name not in categorized_wav_files[label]:
                categorized_wav_files[label][hash_name] = [wav_file]
            else:
                categorized_wav_files[label][hash_name].append(wav_file)
    return categorized_wav_files, categorized_sample_num

In [62]:
#word_dict_ = check_wav_files(wav_files)
categorized_wav_files_, categorized_sample_num_ = categorize_wav_files_by_label(wav_files)

In [54]:
print categorized_wav_files_.keys()

['sheila', 'seven', 'right', 'up', 'house', 'one', 'four', 'zero', 'go', 'yes', 'down', 'no', 'wow', 'six', 'three', 'bird', 'happy', 'marvin', 'stop', 'eight', '_background_noise_', 'on', 'off', 'tree', 'dog', 'bed', 'cat', 'nine', 'five', 'two', 'left']


In [57]:
known_labels = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'silence']
unknown_label = 'unknown'
label2index, index2label = {}, {}
for i, label in enumerate(known_labels):
    label2index[label] = i
    index2label[i] = label
label2index[unknown_label] = len(known_labels)
index2label[len(known_labels)] = unknown_label

for label in categorized_wav_files_:
    if label in known_labels or label == unknown_label:
        continue
    label2index[label] = label2index[unknown_label]
    
training_percentage = 0.7
validate_percentage = 0.15
test_percentage = 0.15

assert (training_percentage + validate_percentage + test_percentage) == 1.

In [91]:
print label2index, '\n', index2label

{'sheila': 11, 'two': 11, 'seven': 11, 'right': 5, 'house': 11, 'one': 11, 'four': 11, 'zero': 11, 'go': 9, 'yes': 0, 'down': 3, 'no': 1, 'unknown': 11, 'wow': 11, 'six': 11, 'three': 11, 'bird': 11, 'happy': 11, 'marvin': 11, 'stop': 8, 'eight': 11, '_background_noise_': 11, 'on': 6, 'off': 7, 'dog': 11, 'tree': 11, 'up': 2, 'bed': 11, 'cat': 11, 'nine': 11, 'five': 11, 'silence': 10, 'left': 4} 
{0: 'yes', 1: 'no', 2: 'up', 3: 'down', 4: 'left', 5: 'right', 6: 'on', 7: 'off', 8: 'stop', 9: 'go', 10: 'silence', 11: 'unknown'}


In [93]:
categorized_sample_num_

{'_background_noise_': 6,
 'bed': 1713,
 'bird': 1731,
 'cat': 1733,
 'dog': 1746,
 'down': 2359,
 'eight': 2352,
 'five': 2357,
 'four': 2372,
 'go': 2372,
 'happy': 1742,
 'house': 1750,
 'left': 2353,
 'marvin': 1746,
 'nine': 2364,
 'no': 2375,
 'off': 2357,
 'on': 2367,
 'one': 2370,
 'right': 2367,
 'seven': 2377,
 'sheila': 1734,
 'six': 2369,
 'stop': 2380,
 'three': 2356,
 'tree': 1733,
 'two': 2373,
 'up': 2375,
 'wow': 1745,
 'yes': 2377,
 'zero': 2376}

#### split data into sets

In [74]:
def generate_proportional_data_sets(categorized_wav_files, categorized_sample_num, training_percentage=0.7, test_percentage=0.15, validate_percentage=0.15):
    excluded_category = ['_background_noise_']
    total_file_num = sum([categorized_sample_num[key] for key in categorized_sample_num if key not in excluded_category])
    training_samples, test_samples, validate_samles = [], [], []
    for category in categorized_wav_files.keys(): 
        if category in excluded_category:
            continue
        tot_training_samples = math.ceil(training_percentage * categorized_sample_num[category])
        tot_validate_samples = math.ceil(validate_percentage * categorized_sample_num[category])
        count = 0
        for hash_name in categorized_wav_files[category]:
            if count < tot_training_samples:
                for wave_file in categorized_wav_files[category][hash_name]:
                    training_samples.append((wave_file, label2index[category]))
                    count += 1
            elif count < (tot_training_samples + tot_validate_samples):
                for wave_file in categorized_wav_files[category][hash_name]:
                    validate_samles.append((wave_file, label2index[category]))
                    count += 1
            else:
                for wave_file in categorized_wav_files[category][hash_name]:
                    test_samples.append((wave_file, label2index[category]))
                    count += 1
    return training_samples, test_samples, validate_samles




In [75]:
training_samples_, test_samples_, validate_samles_ = generate_proportional_data_sets(categorized_wav_files_, categorized_sample_num_)

In [79]:
print len(training_samples_), len(test_samples_), len(validate_samles_)

45331 9659 9731


In [84]:
1. * (9659 - 9731) / 9659

-0.007454187804120509

In [None]:
os.path.join

In [115]:
def split_data_into_chunks(wav_files, chunk_size=2000, prefix='training', data_path='/Users/matt.meng/data/speech_competition/processed_data'):
    random.shuffle(wav_files)
    chunk_num = math.ceil(1. * len(wav_files) / chunk_size)
    chunk_data, chunk_counter, sample_counter = [], 0, 0
    start_time = time.time()
    chunk_start_time = start_time
    def dump_chunk_data(chunk_data_, chunk_counter, chunk_start_time):
        with open(os.path.join(data_path, 'speech_{}_{}.pkl'.format(prefix, chunk_counter)), 'wb') as f:
            pickle.dump(chunk_data_, f)
        print('finish processing {} raw audio waveforms using {:.2f} seconds'.format(i, (time.time()-chunk_start_time)))
    
    for i in range(len(wav_files)):
        if sample_counter >= chunk_size:
            dump_chunk_data(chunk_data, chunk_counter, chunk_start_time)
            chunk_data, sample_counter, chunk_start_time = [], 0, time.time()
            chunk_counter += 1
        sample_rate, samples = wavfile.read(wav_files[i][0])
        fbank_features = logfbank(samples, sample_rate)
        chunk_data.append((fbank_features, wav_files[i][1]))
        sample_counter += 1
    if len(chunk_data) > 0:
        dump_chunk_data(chunk_data, chunk_counter, chunk_start_time)

    print('processed all {} wav_files using {:.2f} seconds'.format(len(wav_files), (time.time()-start_time)))

In [116]:
split_data_into_chunks(training_samples_)

finish processing 2000 raw audio waveforms using 7.30 seconds
finish processing 4000 raw audio waveforms using 7.07 seconds
finish processing 6000 raw audio waveforms using 7.01 seconds
finish processing 8000 raw audio waveforms using 7.17 seconds
finish processing 10000 raw audio waveforms using 7.04 seconds
finish processing 12000 raw audio waveforms using 6.96 seconds
finish processing 14000 raw audio waveforms using 7.04 seconds
finish processing 16000 raw audio waveforms using 7.12 seconds
finish processing 18000 raw audio waveforms using 7.01 seconds
finish processing 20000 raw audio waveforms using 6.97 seconds
finish processing 22000 raw audio waveforms using 6.97 seconds
finish processing 24000 raw audio waveforms using 7.15 seconds
finish processing 26000 raw audio waveforms using 7.08 seconds
finish processing 28000 raw audio waveforms using 7.17 seconds
finish processing 30000 raw audio waveforms using 7.09 seconds
finish processing 32000 raw audio waveforms using 7.14 seco