In [None]:
import nengo
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from IPython.display import Audio

import nengo_deeplearning.backends.theano.optimizers as opt
import nengo_deeplearning.processes as proc
from nengo_deeplearning.backends.theano.networks import RNN
from nengo_deeplearning.backends.theano.layers import Generic, GatedRecurrent, Dense
from skspeech.datasets import timit

## Train an RNN to decode VTGs from TIMIT

Notes:

- [Lasagne](http://lasagne.readthedocs.org/en/latest/user/tutorial.html)
- [HF2Nengo](https://github.com/nengo/nengo_deeplearning/pull/2)

CTC (Connectionist Temporal Classification) notes:

- [Blog post](http://andrew.gibiansky.com/blog/machine-learning/speech-recognition-neural-networks/)
- [pure Theano](https://github.com/mohammadpz/CTC-Connectionist-Temporal-Classification)
- [Lasagne](https://github.com/skaae/Lasagne-CTC/blob/master/ctc_cost.py)

In [None]:
# Load the TIMIT corpus and build the training inputs (trX) and targets (trY).
# NOTE(review): the corpus path is a user-specific Dropbox location; point it
# at your own TIMIT copy before running.
data = timit.TIMIT("~/Dropbox/reading-auditory/data/timit")
# Presumably restricts the corpus to sentence number 2 -- confirm against
# skspeech's file filter.
data.filefilt.sent_number = (2,)
words = ['me', 'like', 'that']

# Frame length and frame advance in samples: 0.01 * fs and 0.005 * fs
# (i.e. 10 ms / 5 ms if data.fs is in Hz -- TODO confirm).
n_frames = int(0.01 * data.fs)
frame_advance = int(0.005 * data.fs)
# rfftfreq starts at 0 (DC); [1::2] skips it and keeps every second bin.
freqs = np.fft.rfftfreq(n_frames, 1. / data.fs)[1::2]
# Parenthesised single-argument print: same output on Python 2 and 3.
print(freqs)

trX = data.in_spectrogram(freqs, n_frames, frame_advance, words=words)
trY = data.out_vtg(words, n_frames, frame_advance)
# Insert a singleton axis at position 1; later cells index it as trX[:, 0, :].
trX = trX[:, np.newaxis, :]
# Formatted explicitly so the output matches the old `print a, b` on both
# Python 2 and Python 3.
print("%s %s" % (trX.shape, trY.shape))
# Bound the input values to the range [0, 3].
trX = np.clip(trX, 0, 3)

In [None]:
# Heat map of the first 500 rows of trX with the singleton middle axis
# dropped. The slice is transposed so that the last axis of trX runs along
# the vertical axis of the plot.
plt.figure(figsize=(14, 6))
plt.pcolormesh(np.transpose(trX[:500, 0]))
plt.colorbar()

In [None]:
# Layer stack: first layer sized to the last axis of trX, one gated recurrent
# layer of 512 units, and a tanh output layer sized to the columns of trY.
layers = [
    Generic(size=trX.shape[2]),
    GatedRecurrent(size=512, p_drop=0.2),
    Dense(size=trY.shape[1], activation='tanh', p_drop=0.5),
]

# A bit of l2 helps with generalization, higher momentum helps convergence
optimizer = opt.NAG(momentum=0.95, regularizer=opt.Regularizer(l2=1e-4))

# Linear iterator for real-valued data; mse cost to go with the tanh output
# layer (cce would be the choice for a softmax output).
model = RNN(layers=layers, optimizer=optimizer, iterator='linear', cost='mse')
model.fit(trX, trY, n_epochs=5)

# TODO: evaluate on held-out data (teX / teY) as well; only the fit on the
# training set is measured here.
tr_preds = model.predict(trX)

# Show both shapes so a mismatch between targets and predictions is visible
# before they are subtracted. Same output on Python 2 and 3.
print("%s %s" % (trY.shape, tr_preds.shape))

# Root-mean-square error over all elements. This is an error measure (lower
# is better), not an accuracy.
tr_rmse = np.sqrt(np.mean((trY - tr_preds) ** 2))
# Backward-compatible alias: this value was previously stored as `tr_acc`.
tr_acc = tr_rmse

print("  ====== Results ======")
print("Train RMSE %s" % tr_rmse)
model.save('trained_rnn.pkl')

In [None]:
# Plot targets, predictions and their difference over the same window of rows,
# one figure each. Titles identify each figure, since all three share the same
# size and layout.
st, end = 5000, 6000
panels = [
    ("Target (trY), rows %d-%d" % (st, end), trY[st:end]),
    ("Prediction (tr_preds), rows %d-%d" % (st, end), tr_preds[st:end]),
    ("Error (tr_preds - trY), rows %d-%d" % (st, end),
     tr_preds[st:end] - trY[st:end]),
]
for title, series in panels:
    plt.figure(figsize=(12, 4))
    plt.plot(series)
    plt.title(title)