In [1]:
# https://github.com/igormq/ctc_tensorflow_example/blob/master/ctc_tensorflow_example.py

#  Compatibility imports
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import time

import tensorflow as tf
import scipy.io.wavfile as wav
import numpy as np
import glob

from six.moves import xrange as range

import soundfile as sf

In [2]:
try:
    from python_speech_features import mfcc
except ImportError:
    print("Failed to import python_speech_features.\n Try pip install python_speech_features.")
    raise ImportError

from utils import maybe_download as maybe_download
from utils import sparse_tuple_from as sparse_tuple_from

from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [3]:
# Constants
SPACE_TOKEN = '<space>'
SPACE_INDEX = 0
FIRST_INDEX = ord('a') - 1  # 0 is reserved to space

# Some configs
num_features = 13
num_units=50 # Number of units in the LSTM cell
# Accounting the 0th indice +  space + blank label = 28 characters
num_classes = ord('z') - ord('a') + 1 + 1 + 1

# Hyper-parameters
num_epochs = 200
num_hidden = 50
num_layers = 2
batch_size = 1
initial_learning_rate = 1e-2
momentum = 0.9

num_examples = 1
num_batches_per_epoch = int(num_examples/batch_size)

In [None]:
ROOT_DIRECTORY = 'TIMIT_full/'

WAV_CLASS = '*.wav'
TXT_CLASS = '*.txt'

In [None]:
Clean

In [None]:
TRAIN_DIRECTORY = 'train_16k/'
VAL_DIRECTORY = 'val_16k/'
TEST_DIRECTORY = 'test_16k/'

In [None]:
Noisy

In [4]:
TRAIN_DIRECTORY = 'timit_train_16k/'
VAL_DIRECTORY = 'timit_val_16k/'
TEST_DIRECTORY = 'timit_test_16k/'

In [5]:
def load_dataset(wav_path,txt_path):

    x_s = []
    y_s = []
    text_s = []
    print(wav_path, txt_path)
    for e, (f1, f2) in enumerate( zip(sorted(glob.glob(wav_path)),sorted(glob.glob(txt_path))) ) :
#         print('FileName:',f1, f2)
        
        ##### Read Audio features #####
        ###############################
        audio, fs = sf.read(f1)
        inputs = mfcc(audio, samplerate=fs)
        
        # Tranform in 3D array
        train_inputs = np.asarray(inputs[np.newaxis, :])
        train_inputs = (train_inputs - np.mean(train_inputs))/np.std(train_inputs)
#         print('train_inputs.shape:', train_inputs.shape)
        
        train_seq_len = [train_inputs.shape[1]]
#         print('train_seq_len.len:', len(train_seq_len) )
        
        x_s.append(train_inputs)
#         print('x_s.len', len(x_s))
        
        ##### Read Labels features #####
        ###############################
        with open(f2, 'r') as txt_f:
            line = txt_f.readlines()[-1] #Only the last line is necessary

            targets = preprocess_line(line)
            text_s.append(targets)

            # Adding blank label
            targets = np.hstack([SPACE_TOKEN if x == '' else list(x) for x in targets])

            # Transform char into index
            targets = np.asarray([SPACE_INDEX if x == SPACE_TOKEN else ord(x) - FIRST_INDEX
                                  for x in targets])

            # Creating sparse representation to feed the placeholder
            train_targets = sparse_tuple_from([targets])
#             print('train_targets.shape:', len(train_targets))
            y_s.append(train_targets)
              
    
    print('x_s.len', len(x_s),', [-1]x_s.len', x_s[-1].shape)
#     x_s = pad_sequences(x_s, maxlen=500, dtype='float', padding='post', truncating='post')
#     print('x_s.shape', (x_s.shape))
        
    
    return x_s, y_s,text_s
        

In [6]:
def preprocess_line(line):
    
    original = ' '.join(line.strip().lower().split(' ')[2:])
    for c in set(original):
        if c != ' ' and not c.isalpha():
            original = original.replace(c,'')
        
    targets = original.replace(' ', '  ')
    targets = targets.split(' ')

    i = 0
    while i<len(targets)-1:
        if targets[i]==targets[i+1] and targets[i+1]=='':
            del targets[i+1]
        else:
            i +=1
            
    return targets
    
    # Get only the words between [a-z] and replace period for none
#     original = ' '.join(line.strip().lower().split(' ')[2:]).replace('.', '').replace('\'','')
#     targets = original.replace(' ', '  ')
#     return targets.split(' ')

In [56]:
 preprocess_line("343 343 I is't 45 number roll :D. ")

i is't 45 number roll :d.
e
l
d
m
b
'
:
t
4
u
s
i
n
.
o
r
 
5
i  ist    number  roll  d


['i', '', 'ist', '', 'number', '', 'roll', '', 'd']

In [7]:
x_, y_,text_ = load_dataset(ROOT_DIRECTORY+TRAIN_DIRECTORY+WAV_CLASS, ROOT_DIRECTORY+TRAIN_DIRECTORY+TXT_CLASS)  
print(len(x_), len(y_))

TIMIT_full/train_16k/*.wav TIMIT_full/train_16k/*.txt
x_s.len 4576 , [-1]x_s.len (1, 204, 13)
4576 4576


In [8]:
val_x_, val_y_, val_text_ = load_dataset(ROOT_DIRECTORY+VAL_DIRECTORY+WAV_CLASS, ROOT_DIRECTORY+VAL_DIRECTORY+TXT_CLASS)  
print(len(val_x_), len(val_y_))

TIMIT_full/val_16k/*.wav TIMIT_full/val_16k/*.txt
x_s.len 44 , [-1]x_s.len (1, 194, 13)
44 44


In [9]:
test_x_, test_y_, test_text_ = load_dataset(ROOT_DIRECTORY+TEST_DIRECTORY+WAV_CLASS, ROOT_DIRECTORY+TEST_DIRECTORY+TXT_CLASS)  
print(len(test_x_), len(test_y_))

TIMIT_full/test_16k/*.wav TIMIT_full/test_16k/*.txt
x_s.len 1680 , [-1]x_s.len (1, 315, 13)
1680 1680


In [10]:
# Loading the data

# audio_filename = maybe_download('LDC93S1.wav', 93638)
# target_filename = maybe_download('LDC93S1.txt', 62)

audio_filename = 'sa2.wav'
target_filename ='sa2.txt'

fs, audio = wav.read(audio_filename)

inputs = mfcc(audio, samplerate=fs)
# Tranform in 3D array
train_inputs = np.asarray(inputs[np.newaxis, :])
train_inputs = (train_inputs - np.mean(train_inputs))/np.std(train_inputs)
train_seq_len = [train_inputs.shape[1]]

In [11]:
# Readings targets
with open(target_filename, 'r') as f:

    #Only the last line is necessary
    line = f.readlines()[-1]

    # Get only the words between [a-z] and replace period for none
    original = ' '.join(line.strip().lower().split(' ')[2:]).replace('.', '')
    targets = original.replace(' ', '  ')
    targets = targets.split(' ')

In [60]:
targets

["don't",
 '',
 'ask',
 '',
 'me',
 '',
 'to',
 '',
 'carry',
 '',
 'an',
 '',
 'oily',
 '',
 'rag',
 '',
 'like',
 '',
 'that']

In [61]:
# Adding blank label
targets = np.hstack([SPACE_TOKEN if x == '' else list(x) for x in targets])
print('targets:', targets)

# Transform char into index
targets = np.asarray([SPACE_INDEX if x == SPACE_TOKEN else ord(x) - FIRST_INDEX
                      for x in targets])
print('targets:', targets)

# Creating sparse representation to feed the placeholder
train_targets = sparse_tuple_from([targets])

targets: ['d' 'o' 'n' "'" 't' '<space>' 'a' 's' 'k' '<space>' 'm' 'e' '<space>' 't'
 'o' '<space>' 'c' 'a' 'r' 'r' 'y' '<space>' 'a' 'n' '<space>' 'o' 'i' 'l'
 'y' '<space>' 'r' 'a' 'g' '<space>' 'l' 'i' 'k' 'e' '<space>' 't' 'h' 'a'
 't']
targets: [  4  15  14 -57  20   0   1  19  11   0  13   5   0  20  15   0   3   1
  18  18  25   0   1  14   0  15   9  12  25   0  18   1   7   0  12   9
  11   5   0  20   8   1  20]


In [57]:
targets

array([  4,  15,  14, -57,  20,   0,   1,  19,  11,   0,  13,   5,   0,
        20,  15,   0,   3,   1,  18,  18,  25,   0,   1,  14,   0,  15,
         9,  12,  25,   0,  18,   1,   7,   0,  12,   9,  11,   5,   0,
        20,   8,   1,  20])

In [35]:
# We don't have a validation dataset :(
val_inputs, val_targets, val_seq_len = train_inputs, train_targets, \
                                       train_seq_len

In [10]:
# THE MAIN CODE! LSTM

graph = tf.Graph()
with graph.as_default():
    # e.g: log filter bank or MFCC features
    # Has size [batch_size, max_stepsize, num_features], but the
    # batch_size and max_stepsize can vary along each step
    inputs = tf.placeholder(tf.float32, [None, None, num_features])

    # Here we use sparse_placeholder that will generate a
    # SparseTensor required by ctc_loss op.
    targets = tf.sparse_placeholder(tf.int32)

    # 1d array of size [batch_size]
    seq_len = tf.placeholder(tf.int32, [None])

    # Defining the cell
    # Can be:
    #   tf.nn.rnn_cell.RNNCell
    #   tf.nn.rnn_cell.GRUCell 
    cells = []
    for _ in range(num_layers):
        cell = tf.contrib.rnn.LSTMCell(num_units)  # Or LSTMCell(num_units)
        cells.append(cell)
    stack = tf.contrib.rnn.MultiRNNCell(cells)

    # The second output is the last state and we will no use that
    outputs, _ = tf.nn.dynamic_rnn(stack, inputs, seq_len, dtype=tf.float32)

    shape = tf.shape(inputs)
    batch_s, max_timesteps = shape[0], shape[1]

    # Reshaping to apply the same weights over the timesteps
    outputs = tf.reshape(outputs, [-1, num_hidden])

    # Truncated normal with mean 0 and stdev=0.1
    # Tip: Try another initialization
    # see https://www.tensorflow.org/versions/r0.9/api_docs/python/contrib.layers.html#initializers
    W = tf.Variable(tf.truncated_normal([num_hidden,
                                         num_classes],
                                        stddev=0.1))
    # Zero initialization
    # Tip: Is tf.zeros_initializer the same?
    b = tf.Variable(tf.constant(0., shape=[num_classes]))

    # Doing the affine projection
    logits = tf.matmul(outputs, W) + b

    # Reshaping back to the original shape
    logits = tf.reshape(logits, [batch_s, -1, num_classes])

    # Time major
    logits = tf.transpose(logits, (1, 0, 2))

    loss = tf.nn.ctc_loss(targets, logits, seq_len)
    cost = tf.reduce_mean(loss)

    optimizer = tf.train.MomentumOptimizer(initial_learning_rate,
                                           0.9).minimize(cost)

    # Option 2: tf.nn.ctc_beam_search_decoder
    # (it's slower but you'll get better results)
    decoded, log_prob = tf.nn.ctc_greedy_decoder(logits, seq_len)

    # Inaccuracy: label error rate
    ler = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32),
                                          targets))

In [15]:
# THE MAIN CODE! BiLSTM

graph = tf.Graph()
with graph.as_default():
    # e.g: log filter bank or MFCC features
    # Has size [batch_size, max_stepsize, num_features], but the
    # batch_size and max_stepsize can vary along each step
    inputs = tf.placeholder(tf.float32, [None, None, num_features])

    # Here we use sparse_placeholder that will generate a
    # SparseTensor required by ctc_loss op.
    targets = tf.sparse_placeholder(tf.int32)

    # 1d array of size [batch_size]
    seq_len = tf.placeholder(tf.int32, [None])

    # Defining the cell
    # Can be:
    #   tf.nn.rnn_cell.RNNCell
    #   tf.nn.rnn_cell.GRUCell 
    cells = []
    for _ in range(num_layers):
        cell = tf.contrib.rnn.GRUCell(num_units)  # Or LSTMCell(num_units)
        cells.append(cell)
    stack = tf.contrib.rnn.MultiRNNCell(cells)
    
    cells_bw = []
    for _ in range(num_layers):
        cell_bw = tf.contrib.rnn.GRUCell(num_units)  # Or LSTMCell(num_units)
        cells_bw.append(cell_bw)
    stack_bw = tf.contrib.rnn.MultiRNNCell(cells)

    # The second output is the last state and we will no use that
    outputs, _ = tf.nn.bidirectional_dynamic_rnn(stack,stack_bw, inputs, seq_len, dtype=tf.float32)

    shape = tf.shape(inputs)
    batch_s, max_timesteps = shape[0], shape[1]

    # Reshaping to apply the same weights over the timesteps
    outputs = tf.reshape(outputs, [-1, num_hidden])

    # Truncated normal with mean 0 and stdev=0.1
    # Tip: Try another initialization
    # see https://www.tensorflow.org/versions/r0.9/api_docs/python/contrib.layers.html#initializers
    W = tf.Variable(tf.truncated_normal([num_hidden,
                                         num_classes],
                                        stddev=0.1))
    # Zero initialization
    # Tip: Is tf.zeros_initializer the same?
    b = tf.Variable(tf.constant(0., shape=[num_classes]))

    # Doing the affine projection
    logits = tf.matmul(outputs, W) + b

    # Reshaping back to the original shape
    logits = tf.reshape(logits, [batch_s, -1, num_classes])

    # Time major
    logits = tf.transpose(logits, (1, 0, 2))

    loss = tf.nn.ctc_loss(targets, logits, seq_len)
    cost = tf.reduce_mean(loss)

    optimizer = tf.train.MomentumOptimizer(initial_learning_rate,
                                           0.9).minimize(cost)

    # Option 2: tf.nn.ctc_beam_search_decoder
    # (it's slower but you'll get better results)
    decoded, log_prob = tf.nn.ctc_greedy_decoder(logits, seq_len)

    # Inaccuracy: label error rate
    ler = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32),
                                          targets))

In [16]:
# Configuration to control GPU use
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.per_process_gpu_memory_fraction = 0.33
# sess = tf.InteractiveSession(config=config)

num_batches_per_epoch = int(len(x_)/batch_size)

with tf.Session(graph=graph,config=config) as session:
    # Initializate the weights and biases
    tf.global_variables_initializer().run()


    for curr_epoch in range(num_epochs):
        train_cost = train_ler = 0
        start = time.time()

        for batch in range(num_batches_per_epoch//20):
            b = int(np.random.randint(len(x_), size=1))
            if not batch%100:
                print('batch:', batch, b, x_[b].shape)
                
            feed = {inputs: x_[b],
                    targets: y_[b],
                    seq_len: [ x_[b].shape[1] ]}

            batch_cost, _ = session.run([cost, optimizer], feed)
            train_cost += batch_cost*batch_size
            train_ler += session.run(ler, feed_dict=feed)*batch_size

        train_cost /= num_examples
        train_ler /= num_examples
        
        print(num_epochs, train_cost, train_ler,time.time() - start)

        val_cost, val_ler = 0, 0
        for batch in range(len(val_x_)//5):
            b = int(np.random.randint(len(val_x_), size=1) )
            val_feed = {inputs: val_x_[b],
                        targets: val_y_[b],
                        seq_len: [val_x_[b].shape[1]]}

            val_c, val_l = session.run([cost, ler], feed_dict=val_feed)
            val_cost += val_c
            val_ler += val_l 

        log = "Epoch {}/{}, train_cost = {:.3f}, train_ler = {:.3f}, val_cost = {:.3f}, val_ler = {:.3f}, time = {:.3f}"
        print(log.format(curr_epoch+1, num_epochs, train_cost, train_ler,
                         val_cost, val_ler, time.time() - start))
    
    
    
    for batch in range(len(test_x_)//100):
        b = int(batch)
        test_feed = {inputs: test_x_[b],
                        targets: test_y_[b],
                        seq_len: [test_x_[b].shape[1]]}
            
        # Decoding
        d = session.run(decoded[0], feed_dict=test_feed)
        str_decoded = ''.join([chr(x) for x in np.asarray(d[1]) + FIRST_INDEX])
        # Replacing blank label to none
        str_decoded = str_decoded.replace(chr(ord('z') + 1), '')
        # Replacing space label to space
        str_decoded = str_decoded.replace(chr(ord('a') - 1), ' ')

        print('Original:', test_text_[b])
        print('Decoded:' , str_decoded)

batch: 0 2459 (1, 316, 13)
batch: 100 4393 (1, 383, 13)
batch: 200 1367 (1, 159, 13)
10 inf 422.82654052972794 254.10722303390503
Epoch 1/10, train_cost = inf, train_ler = 422.827, val_cost = inf, val_ler = 15.096, time = 256.191
batch: 0 847 (1, 235, 13)
batch: 100 1760 (1, 136, 13)
batch: 200 1602 (1, 159, 13)
10 inf 323.74467927217484 254.82341027259827
Epoch 2/10, train_cost = inf, train_ler = 323.745, val_cost = inf, val_ler = 8.102, time = 256.877
batch: 0 3585 (1, 322, 13)
batch: 100 3500 (1, 364, 13)
batch: 200 2869 (1, 262, 13)
10 inf 281.2521094083786 261.80276584625244
Epoch 3/10, train_cost = inf, train_ler = 281.252, val_cost = inf, val_ler = 9.907, time = 263.860
batch: 0 521 (1, 220, 13)
batch: 100 56 (1, 371, 13)
batch: 200 3480 (1, 242, 13)
10 inf 271.51636749505997 262.4949402809143
Epoch 4/10, train_cost = inf, train_ler = 271.516, val_cost = inf, val_ler = 8.978, time = 264.502
batch: 0 1802 (1, 473, 13)
batch: 100 3639 (1, 338, 13)
batch: 200 2296 (1, 232, 13)
10 i