## Load the LibriSpeech Dataset
The LibriSpeech dataset is organized as folders nested two levels deep. Each inner folder contains a transcript text file, with one utterance per line, together with the corresponding utterance audio as FLAC files.

In [2]:
import os

# Root of the LibriSpeech split to load. Set the LIBRISPEECH_DIR environment
# variable to point at a different location; the default is the original
# local path.
librispeech_dir = os.environ.get("LIBRISPEECH_DIR",
                                 "/Users/Noah/Downloads/LibriSpeech/dev-clean")

if not os.path.exists(librispeech_dir):
    raise OSError("LibriSpeech directory not found")

# First-level folders under the dataset root. Sorted because os.listdir
# returns entries in arbitrary order, which would otherwise make the order of
# `audiofiles` differ between runs and machines.
directories = [os.path.join(librispeech_dir, d) for d in sorted(os.listdir(librispeech_dir))
               if os.path.isdir(os.path.join(librispeech_dir, d))]

# (path to .flac file, transcript text) pairs, one per utterance.
audiofiles = []

for directory in directories:
    sub_directories = sorted(d for d in os.listdir(directory)
                             if os.path.isdir(os.path.join(directory, d)))

    for sub_directory in sub_directories:
        inner_dir = os.path.join(directory, sub_directory)
        # The transcript is named "<outer folder>-<inner folder>.trans.txt".
        trans_path = os.path.join(
            inner_dir, os.path.basename(directory) + "-" + sub_directory + ".trans.txt")

        # Context manager so the file handle is closed even if parsing fails.
        with open(trans_path, 'r') as trans_file:
            for line in trans_file:
                # Strip only the line terminator; slicing off the last
                # character would eat real text when the final line has no
                # trailing newline.
                line = line.rstrip("\r\n")
                if not line.strip():
                    continue  # tolerate blank lines

                # Each line is "<utterance id> <transcript>".
                parts = line.split(" ", 1)
                if len(parts) != 2:
                    raise ValueError("Malformed transcript line in %s: %r" % (trans_path, line))

                filename = os.path.join(inner_dir, parts[0] + ".flac")
                text = parts[1]

                audiofiles.append((filename, text))

## Convert the FLAC files to visual representations
Use librosa to compute a mel-scaled spectrogram for each utterance. Open question: should the spectrogram be log-scaled?

In [12]:
#todo
import librosa
import numpy as np

# Characters that may appear in a label sequence. A character's position in
# this string is the integer index it is mapped to ('_' is 0, 'A' is 1, ...).
alphabet = "_ABCDEFGHIJKLMNOPQRSTUVWXYZ' "
letter_to_index = {letter: index for index, letter in enumerate(alphabet)}

# Pair every utterance's label sequence with its mel spectrogram.
index_and_spectrograms = []
for audiofile, text in audiofiles:
    # Decode the audio file, then compute its mel spectrogram. Both calls use
    # librosa's defaults apart from the arguments passed here.
    time_series, sampling_rate = librosa.load(audiofile)
    mfsc = librosa.feature.melspectrogram(y=time_series, sr=sampling_rate)

    # Translate each transcript character into its index in the alphabet.
    text_list = [letter_to_index[letter] for letter in text]

    index_and_spectrograms.append((text_list, mfsc))

## Build the Network Architecture
The network takes in audio spectrograms, applies a series of convolutional layers, and then feeds the result into a series of recurrent layers (either vanilla RNN or GRU). The last layers are fully connected. It uses CTC loss. Batch normalization and dropout are used throughout.

Current Plan: 1 conv, 3 recurrent, 1 fc

In [47]:
import tensorflow as tf

# Containers for the convolutional and recurrent parts of the network; both
# start out empty (two separate list objects).
conv_architecture, rnn_architecture = [], []

# These functions create weight and bias variables appropriate for CNNs using ReLU.
def cnn_weight_variable(shape):
    """Create a weight variable of the given shape.

    The initial values come from ``tf.truncated_normal`` with a standard
    deviation of 0.05.

    Args:
        shape: Shape forwarded to ``tf.truncated_normal``.

    Returns:
        A ``tf.Variable`` holding the initial tensor.
    """
    return tf.Variable(tf.truncated_normal(shape, stddev=0.05))

def cnn_bias_variable(shape):
    """Create a bias variable of the given shape.

    Every element is initialised to the constant 0.1.

    Args:
        shape: Shape forwarded to ``tf.constant``.

    Returns:
        A ``tf.Variable`` holding the constant initial tensor.
    """
    return tf.Variable(tf.constant(0.1, shape=shape))