Merge of pull requests #49, #50, and #52. Fixes issues #2, #4, #11, #12, #46, #47, and #48
mozilla · Oct 13, 2016 · 84c030a · 84c030a
2 parents aeb08b3 + a3abc9d
commit 84c030a
Show file tree

Hide file tree

Showing 13 changed files with 1,221 additions and 334 deletions.
diff --git a/.gitignore b/.gitignore
@@ -2,3 +2,5 @@
 *.pyc
 .DS_Store
 /logs
+/data/ted/TEDLIUM_release2
+/data/ted/TEDLIUM_release2.tar.gz
diff --git a/DeepSpeech.ipynb b/DeepSpeech.ipynb
diff --git a/data/ted/.gitkeep b/data/ted/.gitkeep
diff --git a/util/audio.py b/util/audio.py
@@ -0,0 +1,68 @@
+import numpy as np
+import scipy.io.wavfile as wav
+
+from python_speech_features import mfcc
+
+def audiofile_to_input_vector(audio_filename, numcep, numcontext):
+    """Turn a wav file into whitened MFCC features with past/future context.
+
+    Each frame's ``numcep`` MFCC coefficients are flanked by the coefficients
+    of the ``numcontext`` preceding and the ``numcontext`` following frames,
+    so every output row holds ``numcep + 2*numcep*numcontext`` values laid
+    out as ``[past..., current, future...]``. Context positions that fall
+    before the first or after the last frame are filled with zeros
+    (TODO: fill them with the MFCC of silence instead).
+
+    Args:
+        audio_filename: Path of the wav file to load.
+        numcep: Number of MFCC coefficients computed per frame.
+        numcontext: Number of context frames taken on each side of a frame.
+
+    Returns:
+        A float32 array of shape ``(frames, numcep + 2*numcep*numcontext)``,
+        normalised with the mean and standard deviation of the whole array.
+        If the MFCC step yields no frames, an empty array of that width is
+        returned instead.
+    """
+    # Load wav files
+    fs, audio = wav.read(audio_filename)
+
+    # Get mfcc coefficients
+    orig_inputs = mfcc(audio, samplerate=fs, numcep=numcep)
+    num_frames = orig_inputs.shape[0]
+
+    # Number of frames seen by each output row: numcontext past frames, the
+    # current frame, and numcontext future frames.
+    window = 2*numcontext + 1
+
+    # Without frames there is nothing to stack or whiten; return early so the
+    # mean/std below are never computed on an empty array.
+    if num_frames == 0:
+        return np.zeros((0, numcep*window), np.float32)
+
+    # Pad numcontext all-zero frames before and after the real frames so that
+    # every frame has a full window of neighbours.
+    padded = np.zeros((num_frames + 2*numcontext, numcep), np.float32)
+    padded[numcontext:numcontext + num_frames] = orig_inputs
+
+    # Row t of the result is padded[t], padded[t+1], ..., padded[t+2*numcontext]
+    # laid side by side. Concatenating the shifted views column-wise builds all
+    # rows at once instead of looping over the frames.
+    shifted = [padded[offset:offset + num_frames] for offset in range(window)]
+    train_inputs = np.concatenate(shifted, axis=1)
+
+    # Whiten inputs (TODO: Should we whiten)
+    train_inputs = (train_inputs - np.mean(train_inputs))/np.std(train_inputs)
+
+    # Return results
+    return train_inputs
diff --git a/util/audio/__init__.py b/util/audio/__init__.py
diff --git a/util/gpu/__init__.py → util/gpu.py b/util/gpu/__init__.py → util/gpu.py