In [1]:
import os
import pandas
import IPython
import numpy as np
import scipy.io.wavfile as wav


wavdir = 'vctk-p225/wav48/p225'
txtdir = 'vctk-p225/txt/p225'

In [2]:
# train, test split
allwav = os.listdir(wavdir)

test_size = int(len(allwav)*.1)

test_files = np.random.choice(allwav, size=test_size)
train_files = list(set(allwav).difference(set( test_files)))
len(test_files), len(train_files)

(23, 210)

In [3]:
from python_speech_features import mfcc
f = os.path.join(wavdir,'p225_176.wav')
IPython.display.Audio(f)

In [4]:
def sparse_tuple_from(sequences, dtype=np.int32):
    indices = []
    values = []

    for n, seq in enumerate(sequences):
        indices.extend(zip([n] * len(seq), range(len(seq))))
        values.extend(seq)

    indices = np.asarray(indices, dtype=np.int64)
    values = np.asarray(values, dtype=dtype)
    shape = np.asarray([len(sequences), np.asarray(indices).max(0)[1] + 1], dtype=np.int64)

    return indices, values, shape

In [5]:
audio_filename = os.path.join(wavdir,'p225_176.wav')
target_filename = os.path.join(txtdir,'p225_176.txt')
# Constants
SPACE_TOKEN = '<space>'
SPACE_INDEX = 0
FIRST_INDEX = ord('a') - 1  # 0 is reserved to space

def get_batch(audio_filename, target_filename):

    fs, audio = wav.read(audio_filename)

    inputs = mfcc(audio, samplerate=8000)

    train_inputs = np.asarray(inputs[np.newaxis, :])
    train_inputs = (train_inputs - np.mean(train_inputs))/np.std(train_inputs)
    train_seq_len = [train_inputs.shape[1]]

    # Readings targets
    with open(target_filename, 'r') as f:

        line = f.read()

        original = ' '.join(line.strip().lower().split(' ')).replace(",","")\
                      .replace(".","").replace("?","").replace("'","").replace("-", '')
        targets = original.replace(' ', '  ')
        targets = targets.split(' ')

    # Adding blank label
    targets = np.hstack([SPACE_TOKEN if x == '' else list(x) for x in targets])

    # Transform char into index
    targets = np.asarray([SPACE_INDEX if x == SPACE_TOKEN else ord(x) - FIRST_INDEX
                          for x in targets])

    # Creating sparse representation to feed the placeholder
    train_targets = sparse_tuple_from([targets])

    return  train_inputs, train_targets,   train_seq_len, original

In [6]:
def get_data(train_files):
    data = []
    for i, f in enumerate(train_files):

        audio_filename = os.path.join(wavdir,f)
        target_filename = os.path.join(txtdir,f.replace(".wav", '.txt'))
        train_inputs, train_targets, train_seq_len, original = get_batch(audio_filename, target_filename)
        data.append((train_inputs, train_targets, train_seq_len, original))
    return data

train = get_data(train_files)
test = get_data(test_files)

# Model

In [7]:
import tensorflow as tf
num_features = 13
# Accounting the 0th index +  space + blank label = 28 characters
num_classes = ord('z') - ord('a') + 1 + 1 + 1
print(num_classes)

# Hyper-parameters
num_epochs = 100
num_hidden = 100
num_layers = 1
batch_size = 1

num_examples = 1

inputs = tf.placeholder(tf.float32, [None, None, num_features])

# Here we use sparse_placeholder that will generate a
# SparseTensor required by ctc_loss op.
targets = tf.sparse_placeholder(tf.int32)

# 1d array of size [batch_size]
seq_len = tf.placeholder(tf.int32, [None])

# Defining the cell
cell = tf.contrib.rnn.LSTMCell(num_hidden, state_is_tuple=True)

# Stacking rnn cells
stack = tf.contrib.rnn.MultiRNNCell([cell] * num_layers,
                                    state_is_tuple=True)

# The second output is the last state and we will no use that
outputs, _ = tf.nn.dynamic_rnn(stack, inputs, seq_len, dtype=tf.float32)

shape = tf.shape(inputs)
batch_s, max_time_steps = shape[0], shape[1]

# Reshaping to apply the same weights over the timesteps
outputs = tf.reshape(outputs, [-1, num_hidden])


# fully connected layer
logits = tf.contrib.layers.fully_connected(outputs, num_classes, activation_fn = None)

# Reshaping back to the original shape
logits = tf.reshape(logits, [batch_s, -1, num_classes])

# Time major
logits = tf.transpose(logits, (1, 0, 2))

loss = tf.nn.ctc_loss(targets, logits, seq_len)
cost = tf.reduce_mean(loss)


optimizer = tf.train.MomentumOptimizer(learning_rate=0.001, momentum=0.9)#.minimize(cost)

# Gradient Clipping
gradients = optimizer.compute_gradients(cost)
capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
training_op = optimizer.apply_gradients(capped_gradients)


# Option 2: tf.contrib.ctc.ctc_beam_search_decoder
decoded, log_prob = tf.nn.ctc_greedy_decoder(logits, seq_len)

# Inaccuracy: label error rate
ler = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32),
                                      targets))

  from ._conv import register_converters as _register_converters


28


In [8]:
import random
import time
from tqdm import tqdm

saver = tf.train.Saver()
with tf.Session() as sess:

    tf.global_variables_initializer().run()
#     saver.restore(sess, "./ctc_model.ckpt")

    for curr_epoch in range(num_epochs):
        train_cost = train_ler = 0
        start = time.time()
        
        idx = list(range(len(train_files)))
        random.shuffle(idx)
    
        for i, f in  enumerate(idx):
            train_inputs, train_targets, train_seq_len, original = train[f]
        
            feed = {inputs: train_inputs,
                        targets: train_targets,
                        seq_len: train_seq_len}

            _ = sess.run([training_op], feed)
            batch_cost, batch_ler = sess.run([cost, ler], feed_dict=feed)

            train_cost += batch_cost
            train_ler += batch_ler


        train_cost /= len(idx)
        train_ler /= len(idx)
        
        f = np.random.choice(list(range(len(test_files))))
        
        val_inputs, val_targets, val_seq_len, val_original = test[f]
        val_feed = {inputs: val_inputs,
                        targets: val_targets,
                        seq_len: val_seq_len}

        val_cost, val_ler = sess.run([cost, ler], feed_dict=val_feed)

        # Decoding
        d = sess.run(decoded[0], feed_dict=val_feed)
        str_decoded = ''.join([chr(x) for x in np.asarray(d[1]) + FIRST_INDEX])
        # Replacing blank label to none
        str_decoded = str_decoded.replace(chr(ord('z') + 1), '')
        # Replacing space label to space
        str_decoded = str_decoded.replace(chr(ord('a') - 1), ' ')

        print('Original val: %s' % val_original)
        print('Decoded val: %s' % str_decoded)
        
        log = "Epoch {}/{}, train_cost = {:.3f}, train_ler = {:.3f}, val_cost = {:.3f}, val_ler = {:.3f}, time = {:.3f}"
        print(log.format(curr_epoch+1, num_epochs, train_cost, train_ler,
                         val_cost, val_ler, time.time() - start))
        saver.save(sess, './ctc_model.ckpt')

Original val: many complicated ideas about the rainbow have been formed
Decoded val: e 
Epoch 1/100, train_cost = 947.238, train_ler = 1.096, val_cost = 265.664, val_ler = 0.965, time = 194.685
Original val: thats as it should be
Decoded val: ht  
Epoch 2/100, train_cost = 200.294, train_ler = 0.938, val_cost = 76.011, val_ler = 0.810, time = 200.406
Original val: but the commission is on a collision course with the government
Decoded val: t e 
Epoch 3/100, train_cost = 178.141, train_ler = 0.904, val_cost = 271.058, val_ler = 0.937, time = 196.208
Original val: we would hope to make progress on that next year
Decoded val: te   
Epoch 4/100, train_cost = 176.955, train_ler = 0.902, val_cost = 174.496, val_ler = 0.896, time = 198.971
Original val: it isnt a happy memory
Decoded val: t
Epoch 5/100, train_cost = 157.776, train_ler = 0.947, val_cost = 66.002, val_ler = 0.955, time = 198.703
Original val: six spoons of fresh snow peas five thick slabs of blue cheese and maybe a snack for he

Original val: he is delighted and he has every right to be
Decoded val: the 
Epoch 46/100, train_cost = 145.502, train_ler = 0.958, val_cost = 142.490, val_ler = 0.909, time = 188.323
Original val: we have a clean bill of health
Decoded val: i 
Epoch 47/100, train_cost = 130.418, train_ler = 0.951, val_cost = 88.472, val_ler = 0.933, time = 188.813
Original val: we would hope to make progress on that next year
Decoded val: i
Epoch 48/100, train_cost = 173.081, train_ler = 0.976, val_cost = 158.659, val_ler = 1.000, time = 189.295
Original val: thats as it should be
Decoded val: 
Epoch 49/100, train_cost = 140.242, train_ler = 0.987, val_cost = 79.820, val_ler = 1.000, time = 186.541
Original val: i think hes the favourite
Decoded val: t
Epoch 50/100, train_cost = 132.548, train_ler = 0.989, val_cost = 75.780, val_ler = 0.960, time = 191.956
Original val: six spoons of fresh snow peas five thick slabs of blue cheese and maybe a snack for her brother bob
Decoded val: i
Epoch 51/100, trai

Original val: she said the charity would continue to monitor the case
Decoded val: it 
Epoch 91/100, train_cost = 131.750, train_ler = 0.890, val_cost = 157.403, val_ler = 0.945, time = 192.316
Original val: he is delighted and he has every right to be
Decoded val: io 
Epoch 92/100, train_cost = 131.677, train_ler = 0.895, val_cost = 136.297, val_ler = 0.932, time = 185.620
Original val: he added that he was very sad about o neill
Decoded val: t o
Epoch 93/100, train_cost = 129.659, train_ler = 0.891, val_cost = 128.457, val_ler = 0.930, time = 190.531
Original val: it isnt a happy memory
Decoded val: ie o 
Epoch 94/100, train_cost = 127.647, train_ler = 0.885, val_cost = 74.005, val_ler = 0.864, time = 188.043
Original val: at that point the structure of the board was changed
Decoded val: it a 
Epoch 95/100, train_cost = 125.769, train_ler = 0.882, val_cost = 148.245, val_ler = 0.904, time = 188.491
Original val: but the commission is on a collision course with the government
Decoded 