In [2]:
import theano
from theano import tensor

import numpy

In [134]:
def create_examples(N=50, L=10, rng=None):
    """Generate a toy dataset of random binary sequences and their sums.

    For every example a bias ``pp`` is drawn uniformly from [0, 1); each of
    the ``L`` symbols of the sequence is then 0 with probability ``pp`` and
    1 otherwise.  The label of a sequence is the number of ones it contains.

    Parameters
    ----------
    N : int
        Number of sequences to generate.
    L : int
        Length of every sequence.
    rng : object with a ``rand()`` method, optional
        Source of uniform random numbers, e.g. ``numpy.random.RandomState``.
        Defaults to the global ``numpy.random`` module, so existing callers
        see the same behaviour as before.

    Returns
    -------
    examples : list
        ``N`` lists, each holding ``L`` ints in {0, 1}.
    labels : list
        ``N`` int64 counts, one per sequence.
    """
    if rng is None:
        rng = numpy.random
    examples = []
    labels = []
    # `range` instead of `xrange` so the function runs on Python 2 and 3.
    for _ in range(N):
        # The bias is drawn first, then the L symbols, one draw per symbol.
        pp = rng.rand()
        ex = [0 if rng.rand() < pp else 1 for _ in range(L)]
        # Pin the dtype so the label is an int64 count even when L == 0.
        labels.append(numpy.sum(ex, dtype='int64'))
        examples.append(ex)
    return examples, labels

In [135]:
# Draw 400 toy sequences of length 10, then cut them into
# train (first 300) / validation (next 50) / test (remaining) portions.
all_x, all_y = create_examples(N=400, L=10)

# inputs
train_x = all_x[:300]
valid_x = all_x[300:350]
test_x = all_x[350:]

# labels, split at the same positions as the inputs
train_y = all_y[:300]
valid_y = all_y[300:350]
test_y = all_y[350:]

In [121]:
# define the recurrent step function
def f_rec(x_, h_, U):
    """Return the next memory state, tanh(x_ + h_ . U).

    x_ : input term for the current step
    h_ : memory state from the previous step
    U  : weights applied to the previous memory state
    """
    recurrent_term = tensor.dot(h_, U)
    return tensor.tanh(x_ + recurrent_term)

In [122]:
# sizes: two input symbols (0 and 1) and five recurrent memory units
d_input = 2
d_memory = 5

# parameter nodes, held as shared variables and filled with random values
# W: one row per input symbol, looked up by index
W = theano.shared(0.1 * numpy.random.randn(d_input, d_memory).astype('float32'))
# U: weights applied to the previous memory state inside f_rec
U = theano.shared(0.1 * numpy.random.randn(d_memory, d_memory).astype('float32'))
# V: readout vector applied to the final memory state
V = theano.shared(numpy.random.randn(d_memory).astype('float32'))

# symbolic input (a vector of symbol indices) and symbolic target (a scalar)
x = tensor.vector(dtype='int64')
y = tensor.scalar(dtype='int64')

# computational graph: look up a row of W for every symbol, iterate f_rec
# over those rows starting from an all-zero memory, and read the prediction
# off the last memory state through a softplus
x_emb = W[x]
h, updates = theano.scan(fn=f_rec,
                         sequences=x_emb,
                         outputs_info=[tensor.alloc(0., d_memory)],
                         non_sequences=[U])
y_pred = tensor.nnet.softplus(tensor.dot(h[-1], V))

# empirical cost: squared difference between target and prediction
cost = ((y - y_pred) ** 2).mean()

In [123]:
# compiled prediction function: input sequence -> y_pred
f_pred = theano.function(inputs=[x], outputs=y_pred)

In [124]:
# compiled cost function: (input sequence, target) -> cost
f_cost = theano.function(inputs=[x, y], outputs=cost)

In [125]:
# symbolic gradients of the cost with respect to W, U and V (in that order),
# obtained by reverse-mode automatic differentiation
grad0 = theano.grad(cost, wrt=[W, U, V])
# compiled function: (input sequence, target) -> the three gradients
f_grad = theano.function(inputs=[x, y], outputs=grad0)

In [126]:
# gradient clipping: rescale all gradients jointly when their overall norm
# exceeds `clip`
clip = 1.
# note: grad_norm holds the *squared* norm summed over all three gradients,
# hence the comparison against clip ** 2 and the square root below
grad_norm = tensor.sum([tensor.sum(g ** 2) for g in grad0])
grad = []
for g in grad0:
    grad.append(tensor.switch(grad_norm > clip ** 2,
                              g / tensor.sqrt(grad_norm) * clip,
                              g))

In [127]:
# update rule: one gradient-descent step along the clipped gradients.
# f_update takes (input sequence, target, learning rate), returns [cost]
# and, as a side effect, overwrites the shared parameters W, U and V.
lrate = tensor.scalar(dtype='float32')
# The updates are given as a list of (shared variable, new value) pairs.
# The original used a set literal of tuples, whose iteration order is
# arbitrary and which is not one of the containers documented for `updates`.
f_update = theano.function([x, y, lrate], [cost],
                           updates=[(W, W - lrate * grad[0]),
                                    (U, U - lrate * grad[1]),
                                    (V, V - lrate * grad[2])])

In [140]:
# online stochastic gradient descent with early stopping
n_epochs = 1000
lrate0 = 0.01
ui = 0  # number of parameter updates performed so far

# re-draw the parameters so that re-running this cell restarts training
W.set_value(0.01 * numpy.random.randn(d_input, d_memory).astype('float32'))
U.set_value(0.01 * numpy.random.randn(d_memory, d_memory).astype('float32'))
V.set_value(0.01 * numpy.random.randn(d_memory).astype('float32'))

verrs = []      # validation costs recorded at the previous checks
patience = 5    # number of consecutive bad checks tolerated before stopping
violations = 0  # current count of consecutive bad checks

train_cost = 0  # exponential moving average of the per-example training cost

for ei in range(n_epochs):
    for ti in range(len(train_x)):
        # Constant step size; an annealed alternative would be
        # lrate0 / (1. + .0001 * ui).
        # A separate name is used so that the symbolic `lrate` variable
        # fed to f_update is not overwritten by a plain number.
        step_size = numpy.float32(lrate0)
        xx = numpy.array(train_x[ti])
        yy = train_y[ti]
        cc = f_update(xx, yy, step_size)
        if ui == 0:
            train_cost = cc[0]
        else:
            train_cost = 0.9 * train_cost + 0.1 * cc[0]
        ui += 1
    # early stopping based on validation cost, checked every 10th epoch
    if ei % 10 == 0:
        vcc = 0
        verr = 0
        for ti in range(len(valid_x)):
            xx = numpy.array(valid_x[ti])
            yy = valid_y[ti]
            vcc += f_cost(xx, yy)
            yp = f_pred(xx)
            # an example counts as an error when the rounded prediction
            # differs from the true count
            if yy != numpy.round(yp):
                verr += 1
        vcc = vcc / len(valid_x)
        # Single-string print: same output under Python 2 and Python 3.
        print('Epoch %s Train Cost %s Validation Cost %s Validation Error %s'
              % (ei, train_cost, vcc, float(verr) / len(valid_x)))
        # A check is "bad" when the cost is more than 10% above the best
        # cost seen so far; the inf sentinel makes the first check pass.
        if vcc > 1.1 * numpy.min(verrs + [numpy.inf]):
            violations += 1
            if violations > patience:
                print('Early Stop!')
                break
        else:
            violations = 0
        verrs.append(vcc)

Epoch 0 Train Cost 27.020450751 Validation Cost 24.7816032554 Validation Error 0.92
Epoch 10 Train Cost 2.44860515717 Validation Cost 2.32291115537 Validation Error 0.76
Epoch 20 Train Cost 0.202131001106 Validation Cost 0.353881515281 Validation Error 0.48
Epoch 30 Train Cost 0.0989434779367 Validation Cost 0.167754565449 Validation Error 0.32
Epoch 40 Train Cost 0.107455457035 Validation Cost 0.0922564752643 Validation Error 0.08
Epoch 50 Train Cost 0.0876225755292 Validation Cost 0.10500823707 Validation Error 0.14
Epoch 60 Train Cost 0.0687496820036 Validation Cost 0.0648581382768 Validation Error 0.1
Epoch 70 Train Cost 0.0770556380988 Validation Cost 0.13337330856 Validation Error 0.18
Epoch 80 Train Cost 0.0744899941375 Validation Cost 0.0563583147246 Validation Error 0.08
Epoch 90 Train Cost 0.0554657230202 Validation Cost 0.0833183467301 Validation Error 0.08
Epoch 100 Train Cost 0.0626825154783 Validation Cost 0.150683822951 Validation Error 0.22
Epoch 110 Train Cost 0.054090

In [141]:
# test the trained model on the test set
tcc = 0   # accumulated test cost (averaged after the loop)
terr = 0  # number of test examples whose rounded prediction is wrong
for ti in range(len(test_x)):
    xx = numpy.array(test_x[ti])
    yy = test_y[ti]
    tcc += f_cost(xx, yy)
    yp = f_pred(xx)
    if yy != numpy.round(yp):
        terr += 1
tcc = tcc / len(test_x)
# Built-in float replaces the removed numpy.float alias; the single-string
# print gives the same output under Python 2 and Python 3.
print('Test Cost %s Test Error %s' % (tcc, float(terr) / len(test_x)))

Test Cost 0.126750723303 Test Error 0.18
