In [1]:
import hashlib
import math
import os
import random
import re

import numpy as np
import tensorflow as tf
from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio
from tensorflow.python.ops import io_ops
from tensorflow.python.platform import gfile

# ---------------------------------------------------------------------------
# Configuration: everything a reader might want to tune lives here.
# ---------------------------------------------------------------------------

batch_size = 100

# Training schedule: run training_step_list[k] steps at learning_rate_list[k].
training_step_list = [1,1]
learning_rate_list = [0.001,0.0001]

data_dir = "train/audio"
summary_dir = "logs" # where to save summary logs for Tensorboard

# Label vocabulary. "silence" and "unknown" come first; every label is listed
# exactly once (a duplicated "silence" entry used to inflate len(wanted_words)).
wanted_words = ["silence","unknown","yes","no","up","down","left","right","on","off","stop","go"]

sample_rate = 16000 # samples per second
clip_duration_ms = 1000 # length of each training clip, in milliseconds

max_background_volume = 0.1 # how loud background noise should be, [0,1]
background_frequency = 0.8 # fraction of training samples that get background noise added

silence_percentage = 10.0 # what percent of training data should be silence
unknown_percentage = 10.0 # what percent of training data should be unknown words

max_time_shift_ms = 100.0 # max range to randomly shift the training audio by time

window_size_ms = 30.0 # millisec length of frequency analysis window
window_stride_ms = 10.0 # millisec hop between successive analysis windows

# Convert the window parameters from milliseconds to samples using the actual
# sample rate, so they stay correct if sample_rate is changed.
window_size_samples = int(sample_rate * window_size_ms / 1000)
window_stride_samples = int(sample_rate * window_stride_ms / 1000)

dct_coefficient_count = 40 # bins to use for MFCC fingerprint

percent_test = 10 # test set
percent_val = 10 # val set

def prepare_data_index():
    """Build the train/val/test file index and the word -> label-index mapping.

    Globs ``data_dir/*/*.wav``, assigns every file to a partition with
    ``which_set``, and separates wanted words from all other ("unknown") words.
    Each partition is then topped up with silence entries and a random sample
    of unknown-word entries, sized as ``silence_percentage`` /
    ``unknown_percentage`` of the number of wanted-word files in the partition,
    and finally shuffled.

    Returns:
        (data_index, word2index) where
        data_index is ``{"val": [...], "test": [...], "train": [...]}`` with
        each entry a dict ``{"label": word, "file": wav_path}``, and
        word2index maps every word seen on disk (plus "silence") to its integer
        label: 0 for silence, 1 for any word not in ``wanted_words``, and the
        position in the label list for wanted words.

    Raises:
        ValueError: if no wanted-word file landed in the training partition
            (usually a sign that ``data_dir`` is wrong).
    """
    random.seed(111) # fixed seed so sampling and shuffling are reproducible

    silence_label_index = 0
    unknown_label_index = 1

    # Label order: silence, unknown, then each wanted word once in listed order.
    # Building the list this way gives the same indices whether or not
    # wanted_words itself already contains "silence"/"unknown" (or duplicates).
    label_names = ["silence","unknown"]
    for wanted_word in wanted_words:
        if wanted_word not in label_names:
            label_names.append(wanted_word)
    wanted_words_index = {name: i for i, name in enumerate(label_names)}
    wanted = set(wanted_words)

    data_index = {"val":[],"test":[],"train":[]}
    unknown_index = {"val":[],"test":[],"train":[]}

    all_words = {}

    for wav_path in gfile.Glob(data_dir + "/*/*.wav"):
        # the label is the name of the directory that holds the file
        word = os.path.split(os.path.dirname(wav_path))[-1].lower()
        if word == "_background_noise_":
            continue # background clips are loaded separately

        all_words[word] = True

        set_name = which_set(wav_path)
        entry = {"label":word,"file":wav_path}

        if word in wanted:
            data_index[set_name].append(entry)
        else:
            unknown_index[set_name].append(entry)

    if not data_index["train"]:
        raise ValueError(
            "No training files for the wanted words were found under %r" % data_dir)

    # Any real file will do for silence entries: the foreground volume is
    # multiplied by zero for the "silence" label when the sample is loaded.
    silence_wav_path = data_index["train"][0]["file"]
    for set_name in ['val','test','train']:
        # size both additions from the wanted-word count, before anything is added
        set_size = len(data_index[set_name])

        # add silence to val, train, test
        silence_size = int(math.ceil(set_size * silence_percentage / 100))
        for _ in range(silence_size):
            data_index[set_name].append({"label":"silence","file":silence_wav_path})

        # add a random subset of unknown words to val, train, test
        random.shuffle(unknown_index[set_name])
        unknown_size = int(math.ceil(set_size * unknown_percentage / 100))
        data_index[set_name].extend(unknown_index[set_name][:unknown_size])

    # randomize each set
    for set_name in ['val','test','train']:
        random.shuffle(data_index[set_name])

    # get word2index mapping: wanted words keep their own index, the rest are "unknown"
    word2index = {}
    for word in all_words:
        word2index[word] = wanted_words_index.get(word, unknown_label_index)
    word2index["silence"] = silence_label_index

    return data_index, word2index

def prepare_test_data():
    """Return the list of wav file paths found directly under ``test/audio``.

    Returns:
        list of path strings matched by ``test/audio/*.wav`` (empty if none).
    """
    test_dir = "test/audio"
    # The original loop appended to a misspelled name (`text_index`), which
    # raised NameError on the first file; collecting the glob directly avoids it.
    test_index = list(gfile.Glob(test_dir + "/*.wav"))
    return test_index



def which_set(wav_path, val_percentage=None, test_percentage=None):
    """Deterministically assign a wav file to the 'val', 'test' or 'train' set.

    The decision depends only on the file's base name with everything from
    ``_nohash_`` onward removed, so all recordings that share that prefix
    always land in the same partition, regardless of directory or of what
    other files exist.

    Args:
        wav_path: path to the wav file; only its base name is used.
        val_percentage: percent of files for validation. Defaults to the
            module-level ``percent_val`` when None.
        test_percentage: percent of files for testing. Defaults to the
            module-level ``percent_test`` when None.

    Returns:
        One of the strings 'val', 'test' or 'train'.
    """
    if val_percentage is None:
        val_percentage = percent_val
    if test_percentage is None:
        test_percentage = percent_test

    # Strip the "_nohash_<n>.wav" suffix so that all samples from the same
    # user are grouped together.
    hash_name = re.sub(r'_nohash_.*$','',os.path.basename(wav_path))
    hash_name_hashed = hashlib.sha1(hash_name.encode("utf-8")).hexdigest()

    # Map the hash onto a percentage in [0, 100] and bucket it.
    MAX_NUM_WAVS_PER_CLASS = 2**27 - 1
    percentage_hash = ((int(hash_name_hashed, 16) %
                    (MAX_NUM_WAVS_PER_CLASS + 1)) *
                    (100.0 / MAX_NUM_WAVS_PER_CLASS))
    if percentage_hash < val_percentage:
        result = 'val'
    elif percentage_hash < (val_percentage + test_percentage):
        result = 'test'
    else:
        result = 'train'
    return result


def prepare_background_data():
    """Load every background-noise wav under ``data_dir`` into memory.

    Each file matching ``data_dir/_background_noise_/*.wav`` is decoded to a
    single channel in a throwaway graph/session and flattened.

    Returns:
        list of 1-D NumPy arrays, one per background file (empty if no file
        matched).
    """
    bg_data = []
    # Build the pattern from data_dir: the previous hard-coded pattern started
    # with "/" and therefore pointed at the filesystem root, not the dataset.
    bg_path = os.path.join(data_dir, "_background_noise_", "*.wav")
    # A private graph keeps these decoding ops out of the default graph.
    with tf.Session(graph=tf.Graph()) as sess:
        wav_filename_ph = tf.placeholder(tf.string,[])
        wav_loader = io_ops.read_file(wav_filename_ph)
        wav_decoder = contrib_audio.decode_wav(wav_loader,desired_channels=1)
        for wav_path in gfile.Glob(bg_path):
            wav_data = sess.run(wav_decoder,feed_dict={wav_filename_ph:wav_path}).audio.flatten()
            bg_data.append(wav_data)
    return bg_data

def main():
    """Build the audio preprocessing graph and generate MFCC training batches.

    Steps:
      1. Index the dataset and load the background-noise clips.
      2. Build a TF graph that loads one wav, scales it, time-shifts it, mixes
         in background noise, clamps it and turns it into an MFCC fingerprint.
      3. For every (step count, learning rate) pair in the training schedule,
         fill a batch of fingerprints and one-hot labels drawn at random from
         the training set.
      4. Collect the list of test files.

    The model/training step itself is not implemented yet, so the learning
    rate and the generated batches are not consumed by anything.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    sess = tf.InteractiveSession()

    data_index, word2index = prepare_data_index()
    bg_data = prepare_background_data()

    # Derived sizes, all expressed in samples.
    desired_samples = int(sample_rate * clip_duration_ms / 1000) # one full clip
    max_time_shift_samples = int(max_time_shift_ms * sample_rate / 1000)
    # Wide enough for every index in word2index, whatever its numbering scheme.
    num_labels = max(len(wanted_words), max(word2index.values()) + 1)

    # Preprocessing Graph parts
    wav_filename_ph = tf.placeholder(tf.string,[])
    wav_loader = io_ops.read_file(wav_filename_ph)
    wav_decoder = contrib_audio.decode_wav(
        wav_loader,desired_channels=1,desired_samples=desired_samples)

    volume_ph = tf.placeholder(tf.float32,[])

    # Plain multiplicative gain: it is fed 1 for normal samples and 0 to mute
    # the foreground entirely for "silence" samples.
    scaled_wav = tf.multiply(wav_decoder.audio, volume_ph)

    time_shift_padding_ph = tf.placeholder(tf.int32,[2,2])
    time_shift_offset_ph = tf.placeholder(tf.int32,[2])

    # Shift in time by zero-padding one end and slicing a clip-length window.
    padded_wav = tf.pad(scaled_wav,time_shift_padding_ph,mode="CONSTANT")
    sliced_wav = tf.slice(padded_wav,time_shift_offset_ph,[desired_samples,-1])

    bg_placeholder = tf.placeholder(tf.float32,[desired_samples,1])
    bg_volume_ph = tf.placeholder(tf.float32,[])

    scaled_bg = tf.multiply(bg_placeholder,bg_volume_ph)

    wav_with_bg = tf.add(sliced_wav,scaled_bg)

    clamped_wav = tf.clip_by_value(wav_with_bg,-1.0,1.0)

    spectrogram = contrib_audio.audio_spectrogram(
        clamped_wav,
        window_size = window_size_samples,
        stride = window_stride_samples,
        magnitude_squared = True
    )

    mfcc = contrib_audio.mfcc(
        spectrogram,
        wav_decoder.sample_rate,
        dct_coefficient_count = dct_coefficient_count
    )

    # Plain ints so the sizes can be used for NumPy allocation as well.
    mfcc_height = int(mfcc.shape[1])
    mfcc_width = int(mfcc.shape[2])

    fingerprint_ph = tf.placeholder(tf.float32,[None,mfcc_height,mfcc_width])

    train_samples = data_index["train"]
    # Only clips strictly longer than one training clip can be cropped at random.
    usable_bg = [clip for clip in bg_data if len(clip) > desired_samples]

    for steps, lrate in zip(training_step_list,learning_rate_list):
        # lrate is not used yet: the training op that would consume it is still missing.
        for step in range(steps):
            data = np.zeros((batch_size,mfcc_height,mfcc_width))
            labels = np.zeros((batch_size,num_labels))
            for j in range(batch_size):
                samp_index = np.random.randint(len(train_samples))
                samp_data = train_samples[samp_index]

                # Random shift in [-max, max) samples; randint needs a non-empty int range.
                if max_time_shift_samples > 0:
                    time_shift_amount = np.random.randint(
                        -max_time_shift_samples,max_time_shift_samples)
                else:
                    time_shift_amount = 0
                if time_shift_amount > 0:
                    # pad the front, keep the start: audio moves later in time
                    time_shift_padding = [[time_shift_amount,0],[0,0]]
                    time_shift_offset = [0,0]
                else:
                    # pad the back, skip the start: audio moves earlier in time
                    time_shift_padding = [[0,-time_shift_amount],[0,0]]
                    time_shift_offset = [-time_shift_amount,0]

                if usable_bg:
                    bg_samp = usable_bg[np.random.randint(len(usable_bg))]
                    bg_offset = np.random.randint(0,len(bg_samp) - desired_samples)
                    bg_sliced = bg_samp[bg_offset:(bg_offset + desired_samples)]
                    bg_sliced = bg_sliced.reshape(desired_samples,1)
                else:
                    # no usable background clip: mix in digital silence instead
                    bg_sliced = np.zeros((desired_samples,1),dtype=np.float32)

                if usable_bg and np.random.uniform(0,1) < background_frequency:
                    bg_volume = np.random.uniform(0,max_background_volume)
                else:
                    bg_volume = 0

                if samp_data["label"] == "silence":
                    volume = 0 # zero out the foreground for the silence labels
                else:
                    volume = 1

                fingerprint = sess.run(mfcc,feed_dict={
                    wav_filename_ph: samp_data["file"],
                    volume_ph: volume,
                    time_shift_padding_ph: time_shift_padding,
                    time_shift_offset_ph: time_shift_offset,
                    bg_placeholder: bg_sliced,
                    bg_volume_ph: bg_volume
                })
                # drop the leading channel axis so the result fits one batch slot
                data[j] = fingerprint.reshape(mfcc_height,mfcc_width)
                labels[j,word2index[samp_data["label"]]] = 1

            tf.logging.info(
                "learning rate %g, step %d: data %s, labels %s",
                lrate, step, data.shape, labels.shape)

            # TODO: run the convnet training step on (data, labels) here.

    # now here's where we run the test classification
    test_index = prepare_test_data()



In [2]:
# NOTE: `data` is only ever assigned as a local variable inside main(), and the
# cell above defines main() without calling it, so no `data` exists at notebook
# scope -- hence the NameError recorded below.
data

NameError: name 'data' is not defined