In [None]:
import os
import random
from zipfile import ZipFile

import nengo
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Audio

import nengo_deeplearning.backends.theano.optimizers as opt
import nengo_deeplearning.processes as proc
from nengo_deeplearning.backends.theano.networks import RNN
from nengo_deeplearning.backends.theano.layers import Generic, GatedRecurrent, LstmRecurrent, Dense
from skspeech import vtl
from skspeech import audio as skaudio
import skspeech

# Start from the archive of German gesture scores (all CV gestures) and the
# default VTL speaker.
zippath = skspeech.data_path('ges-de-v.zip')
speaker = vtl.VTL.default_speaker

# Working directories, all resolved against the current working directory.
root = os.getcwd()
gesdir, wavdir, txtdir = (
    os.path.join(root, subdir) for subdir in ('ges', 'wav', 'txt')
)

In [None]:
# Unpack the gesture-score archive into `gesdir`, but only when that
# directory does not exist yet. This lets the notebook run top to bottom on
# a fresh checkout (the loading cell lists `gesdir`) without unpacking the
# archive again on every re-run.
if not os.path.isdir(gesdir):
    with ZipFile(zippath) as zf:
        zf.extractall(path=gesdir)

## Train an RNN to decode VTGs from synthesized utterances

Notes:

- [Lasagne](http://lasagne.readthedocs.org/en/latest/user/tutorial.html)
- [HF2Nengo](https://github.com/nengo/nengo_deeplearning/pull/2)

Connectionist Temporal Classification (CTC) notes:

- [Blog post](http://andrew.gibiansky.com/blog/machine-learning/speech-recognition-neural-networks/)
- [pure Theano](https://github.com/mohammadpz/CTC-Connectionist-Temporal-Classification)
- [Lasagne](https://github.com/skaae/Lasagne-CTC/blob/master/ctc_cost.py)

In [None]:
# Frame step handed to the feature extractor and to `vtl.get_traindata`
# (presumably in seconds -- TODO confirm against skspeech).
dt = 0.02
audio_f = skaudio.mfcc
audio_fargs = {
    'maxfreq': 2000,
    'preemph': 0,
    'energy': False,
    'remove_0': True,
}

# Collect one (features, targets) pair per gesture file.
trX, trY = [], []
# os.listdir returns entries in arbitrary order; sorting keeps the example
# order (and therefore the concatenated training arrays) stable across runs.
for gesfile in sorted(os.listdir(gesdir)):
    gespath = os.path.join(gesdir, gesfile)
    # The wav file shares the gesture file's base name.
    wavpath = os.path.join(wavdir, "%s.wav" % os.path.splitext(gesfile)[0])
    # `fs` keeps the value from the last file; a later cell reads it when
    # computing the target delay.
    x, y, fs = vtl.get_traindata(gespath, audio_f, dt, audio_fargs, wavpath)
    trX.append(x)
    trY.append(y)

In [None]:
# Plot the feature trajectories of two randomly chosen examples.
# `ix` holds the shuffled example indices; the next plotting cell reuses it
# so that both figures show the same two examples.
# list(...) guarantees a mutable sequence for random.shuffle even where
# range() does not return a list.
ix = list(range(len(trX)))
random.shuffle(ix)

plt.figure(figsize=(14, 4))
# Slicing to the first two indices also copes with fewer than two examples.
for panel, example in enumerate(ix[:2], 1):
    plt.subplot(1, 2, panel)
    # .T puts features on the vertical axis; [1:] leaves out the first
    # feature row.
    plt.pcolormesh(trX[example].T[1:])
    plt.title("trX[%d]" % example)
    plt.colorbar()

In [None]:
# Show the gesture (target) trajectories of the same two examples that were
# picked for the feature plots (`ix[0]` and `ix[1]`).
fig = plt.figure(figsize=(14, 4))

left = fig.add_subplot(1, 2, 1)
left_mesh = left.pcolormesh(trY[ix[0]].T)
left.set_title("trY[%d]" % ix[0])
# NOTE(review): only this panel gets a flipped, hard-coded y-range (49 -> 0);
# the right panel keeps the default limits -- confirm this is intentional.
left.set_ylim(49, 0)
fig.colorbar(left_mesh, ax=left)

right = fig.add_subplot(1, 2, 2)
right_mesh = right.pcolormesh(trY[ix[1]].T)
right.set_title("trY[%d]" % ix[1])
fig.colorbar(right_mesh, ax=right)

In [None]:
# Stack the per-example arrays into single training arrays.
# The feature count is read first, because `trX` is rebound just below.
n_cepstrum = trX[0].shape[1]
# Concatenate along the frame axis, then insert a singleton middle axis.
trX = np.concatenate(trX)[:, np.newaxis, :]
print(trX.shape)

# Shift the targets later in time by `delay` frames.
# NOTE(review): the delay is computed as 0.04 * fs * dt; confirm the units of
# `fs` -- a 40 ms delay expressed in frames would be 0.04 / dt.
delay = int(0.04 * fs * dt)
print("delay frames: %d" % delay)
trY = np.roll(np.concatenate(trY), delay, axis=0)
# np.roll wraps the tail around to the front; blank those wrapped frames.
trY[:delay] = 0.
print(trY.shape)

# Features and targets must have the same number of frames.
assert trX.shape[0] == trY.shape[0]

In [None]:
# Load the first gesture score and sample its trajectory.
# `trajs` is (re)started here; the following cell appends to it.
trajs = []

dt = 0.02
gs = vtl.parse_ges('ges-de-cvc/das.ges')
print(gs.t_end)
traj = gs.trajectory(dt=dt)
# For dot products the encoding is changed slightly: positive entries are set
# to 2 and then 1 is subtracted everywhere, so gestures end up at 1 and
# zero-valued (non-gesture) entries at -1.
traj[traj > 0] = 2.
traj -= 1.
trajs.append(traj)

plt.pcolormesh(traj.T)
plt.colorbar()
traj.shape

In [None]:
# Load a second gesture score and add its trajectory to `trajs`.
# Note that re-running this cell appends the same trajectory again.
dt = 0.02
gs = vtl.parse_ges('ges-de-cvc/bak.ges')
print(gs.t_end)
traj = gs.trajectory(dt=dt)
# For dot products the encoding is changed slightly: positive entries are set
# to 2 and then 1 is subtracted everywhere, so gestures end up at 1 and
# zero-valued (non-gesture) entries at -1.
traj[traj > 0] = 2.
traj -= 1.
trajs.append(traj)

plt.pcolormesh(traj.T)
plt.colorbar()
traj.shape

In [None]:
# Use the gesture trajectories themselves as network input: join them along
# the frame axis and insert a singleton middle axis.
# This rebinds `trX`, replacing the audio-feature array built earlier.
trX = np.concatenate(trajs)[:, np.newaxis, :]

In [None]:
# Display the shape of the input array as a quick sanity check.
trX.shape

In [None]:
# Build one-hot targets: one column per trajectory in `trajs`, set to 1 for
# exactly the frames that trajectory occupies in the concatenated `trX`.
trY = np.zeros((trX.shape[0], len(trajs)))
# A dedicated offset name is used so the shuffled index list `ix` that the
# plotting cells rely on is not overwritten with an integer.
start = 0
for label, segment in enumerate(trajs):
    stop = start + segment.shape[0]
    trY[start:stop, label] = 1.
    start = stop
# Every input frame must have been assigned to exactly one trajectory.
assert start == trX.shape[0], "trX and trajs are out of sync"
trY.shape

In [None]:
# Network: an input layer sized to the last axis of trX, one gated recurrent
# layer, and a softmax output with one unit per target column.
n_inputs = trX.shape[2]
n_outputs = trY.shape[1]
layers = [
    Generic(size=n_inputs),
    GatedRecurrent(size=256, p_drop=0.2),
    Dense(size=n_outputs, activation='softmax', p_drop=0.5),
]

# A bit of l2 helps with generalization, higher momentum helps convergence
regularizer = opt.Regularizer(l2=1e-4)
optimizer = opt.NAG(momentum=0.95, regularizer=regularizer)

# Linear iterator for real valued data, cce cost for softmax
model = RNN(layers=layers, optimizer=optimizer, iterator='linear', cost='cce')
model.fit(trX, trY, n_epochs=500)

# Only the training data is evaluated here; there is no held-out set in this
# cell, and the trained model is not saved.
tr_preds = model.predict(trX)

# Frame-wise accuracy: fraction of frames whose arg-max prediction matches
# the arg-max of the one-hot target.
true_labels = np.argmax(trY, axis=1)
pred_labels = np.argmax(tr_preds, axis=1)
tr_acc = np.mean(true_labels == pred_labels)

print("  ====== Results ======")
print("Train accuracy %s" % tr_acc)

In [None]:
# Compare targets and predictions over the frame window [st, end).
# end=None runs through the final frame; a stop of -1 would silently leave
# the last frame out of all three plots.
st, end = 0, None
window = slice(st, end)

plt.figure(figsize=(12, 4))
plt.plot(trY[window])
plt.title("Targets (trY)")

plt.figure(figsize=(12, 4))
plt.plot(tr_preds[window])
plt.title("Predictions (tr_preds)")

plt.figure(figsize=(12, 4))
plt.plot(tr_preds[window] - trY[window])
plt.title("Prediction error (tr_preds - trY)");