# RNN experiments

In [2]:
import tensorflow as tf
import numpy as np
import random

## Defining toy problems to solve with RNNs

To start experimenting with RNNs, let's start by defining some simple sequence-to-sequence and sequence-to-scalar supervised-learning tasks.

We'll constrain the problem by saying that, for each task, all input sequences must be the same length, and so must output sequences. All sequences will be composed of integers (30 'symbols') from 0-29, inclusive.

Each problem will take the form of a Python generator that yields a pair of sequences as numpy arrays (input and output).

In [33]:
n_symbols = 30

In [31]:
def sequence_repetition(length=10, num_vals=4, reverse=False, append_zeros=0):
    while True:
        vals = range(num_vals)
        seq = [random.choice(vals) for _ in range(length)]
        output = list(reversed(seq)) if reverse else seq
        seq = seq + [0] * append_zeros
        yield seq, output

In [87]:
years_vs_twos = [
    [[1,9,9,4], [2,0,0,7], [1,0,6,6], [0], [1,9,3,4], [1,9,5,6], [1,9,4,2], [1,7,7,6], [2,0,1,7], [2,0,0,0], [3,2,1]],
    [[2], [1,0,2,4], [2,0,4,8], [5,1,2], [4], [8], [6,4], [2,5,6], [0]]
]

short_vs_long = [
    [[1,1,1], [0,0,0], [0,0,0,0,0], [1,1,1,1,1]],
    [[0], [1]]
]

def sequence_classification(vocabularies=years_vs_twos):
    while True:
        vocab = random.choice(vocabularies)
        model = vocabularies.index(vocab)
        seq = []
        while len(seq) < 16:
            seq += random.choice(vocab)
        seq = seq[:16]
        yield seq, [model]

In [114]:
for x, y in sequence_classification():
    print x, y
    break

[1, 0, 2, 4, 6, 4, 6, 4, 2, 5, 6, 4, 2, 0, 4, 8] [1]


In [178]:
def sequence_extraction(length=8, sequence_max_length=2):
    # this model outputs nonsense that's occasionally wrapped between two zeors.
    # it should output only the nonsense between the zeros
    while True:
        seq = [random.randint(1, 9) for _ in xrange(length)]
        start_idx = random.randint(0, len(seq)-3)
        end_idx = random.randint(start_idx+2, min(start_idx+sequence_max_length+1, len(seq)-1))
        seq[start_idx] = 0
        seq[end_idx] = 0
        sub = seq[start_idx+1:end_idx]
        sub = sub + [0] * (sequence_max_length - len(sub))
        yield seq, sub

In [184]:
for x, y in sequence_extraction():
    print x, y
    break
    

[1, 8, 9, 5, 1, 0, 7, 0] [7, 0]


## Create a baseline model for solving this problem,

using a Tensorflow LSTM, to get a feel for how well LSTMs can do on this type of problem

In [169]:
def lrelu(x, leak=0.2, name="lrelu"): return tf.maximum(x, leak*x)

class SequenceModel(object):
    def __init__(self, sequence_generator, n_symbols, name='seq', cell_type=tf.contrib.rnn.BasicLSTMCell, cell_size=256, layers=1):
        self.sequence_generator = sequence_generator
        self.n_symbols = n_symbols
        self.name = name
        self.cell_type = cell_type
        self.cell_size = cell_size
        self.layers = layers
        
        self.epoch = 0
        
        
        for x, y in sequence_generator():
            self.input_len = len(x)
            self.output_len = len(y)
            break
        
        self.global_step = tf.contrib.framework.get_or_create_global_step()
        self.session = None
        
        with tf.variable_scope(self.name):
            self.build()
    
    def setup(self):
        if self.session: self.session.close()
        self.session = tf.InteractiveSession()
        self.session.run(tf.global_variables_initializer())
    
    def build(self):
        self.input = tf.placeholder(tf.int64, [None, self.input_len])
        self.target = tf.placeholder(tf.int64, [None, self.output_len])
        with tf.variable_scope(self.name):
            logits = self.model(self.input, self.output_len)
            self.loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.target, logits=logits)
            self.outputs = tf.argmax(logits, axis=-1)
            self.train_op = tf.train.AdamOptimizer(1e-4).minimize(self.loss)
            correct = tf.equal(self.outputs, self.target)
            self.accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    
    def model(self, sequence, output_len):
        # return the predicted sequence
        # encoder:
        x = tf.one_hot(sequence, self.n_symbols, dtype=tf.float32)
        
        single_cell = self.create_cell(self.cell_size)
        enc_cell = tf.contrib.rnn.MultiRNNCell([single_cell] * self.layers)
        
        outputs, state = tf.nn.dynamic_rnn(enc_cell, x, dtype=tf.float32, scope='enc')
        x = outputs[:, -1]
        # decoder:
        
        layer_sizes = ([self.cell_size] * self.layers)[:-1] + [self.n_symbols]
        dec_cell = tf.contrib.rnn.MultiRNNCell([self.create_cell(size) for size in layer_sizes])
        
        x = tf.tile(tf.expand_dims(x, 1), [1, output_len, 1])
        outputs, state = tf.nn.dynamic_rnn(dec_cell, x, dtype=tf.float32, scope='dec')
        return outputs
    
    def create_cell(self, size):
        return self.cell_type(size)
    
    def train(self, epochs):
        for x in xrange(epochs):
            for i, (input, output) in enumerate(self.iterate_batches()):
                self.train_on_batch(input, output)
            acc = self.eval_accuracy()
            print "Epoch {}: accuracy: {}%".format(self.epoch, acc * 100)
            self.epoch += 1
    
    def iterate_batches(self, batch_size=16, batches_per_epoch=100):
        for i in xrange(batches_per_epoch):
            inputs = []
            outputs = []
            for x, y in self.sequence_generator():
                inputs.append(x)
                outputs.append(y)
                if len(inputs) == batch_size:
                    break
            yield inputs, outputs
    
    def train_on_batch(self, input, output):
        feed = {
            self.input: input,
            self.target: output
        }
        self.session.run(self.train_op, feed_dict=feed)
    
    def eval_accuracy(self):
        num = 0
        denom = 0
        for x, y in self.iterate_batches():
            denom += len(x)
            num += len(x) * self.accuracy_for_batch(x, y)
        return num * 1.0 / denom
    
    def accuracy_for_batch(self, input, output):
        feed = {
            self.input: input,
            self.target: output
        }
        return self.session.run(self.accuracy, feed_dict=feed)
    
    def show_example(self):
        for inputs, outputs in self.iterate_batches(batch_size=1):
            feed = {
                self.input: inputs,
                self.target: outputs
            }
            results = self.session.run(self.outputs, feed_dict=feed)
            print "sequence input:", inputs[0]
            print "model responded with:", results[0]
            print " (expected {})".format(outputs[0])
            break


## Results

on `sequence_classification(vocabularies=years_vs_twos)`

- **LSTMCell**: Epoch 21: accuracy: 96.9375%
- **BasicRNNCell**: Epoch 24: accuracy: 100.625%


In [100]:
m = SequenceModel(sequence_classification, 10, name='lstm21')
m.setup()
print 'built model'
m.train(100)

built model
Epoch 0: accuracy: 49.8125%
Epoch 1: accuracy: 48.625%
Epoch 2: accuracy: 50.6875%
Epoch 3: accuracy: 58.5625%
Epoch 4: accuracy: 49.0625%
Epoch 5: accuracy: 60.125%
Epoch 6: accuracy: 50.6875%


KeyboardInterrupt: 

In [130]:
b1 = SequenceModel(sequence_classification, 10, name='rnn24', cell_type=tf.contrib.rnn.BasicRNNCell)
b2.setup()
print 'built model'
b3.train(100)

NameError: name 'b2' is not defined

In [109]:
m3 = SequenceModel(sequence_classification, 10, name='rnn24', cell_type=tf.contrib.rnn.GRUCell)
m3.setup()
print 'built model'
m3.train(100)

built model
Epoch 0: accuracy: 50.4375%
Epoch 1: accuracy: 51.125%
Epoch 2: accuracy: 47.9375%
Epoch 3: accuracy: 49.125%
Epoch 4: accuracy: 50.0625%
Epoch 5: accuracy: 49.5625%
Epoch 6: accuracy: 51.9375%
Epoch 7: accuracy: 51.6875%
Epoch 8: accuracy: 50.875%
Epoch 9: accuracy: 51.3125%
Epoch 10: accuracy: 49.8125%


KeyboardInterrupt: 

## Baseline without using recurrent networks

Just to make sure the recurrent stuff is working, build a classifier that only looks at the last timestep of the sequence, and see how well it does.

In [111]:
class NonRNNModel(SequenceModel):
    def model(self, sequence, output_len):
        x = tf.one_hot(sequence, self.n_symbols, dtype=tf.float32)
        x = x[:, -1, :]
        output_size = output_len * self.n_symbols
        for i, size in [128, output_size]:
            x = tf.layers.dense(x, size, tf.nn.relu, name='fc'+str(i))
        sequence = tf.reshape(x, [-1, output_len, self.n_symbols])
        return sequence

m4 = SequenceModel(sequence_classification, 10, name='non_rnn1')
m3.setup()
print 'built model'
m3.train(100)

built model
Epoch 0: accuracy: 49.3125%
Epoch 1: accuracy: 51.5625%
Epoch 2: accuracy: 50.9375%
Epoch 3: accuracy: 52.4375%
Epoch 4: accuracy: 49.0%
Epoch 5: accuracy: 95.0625%
Epoch 6: accuracy: 88.5625%
Epoch 7: accuracy: 52.1875%
Epoch 8: accuracy: 52.0625%
Epoch 9: accuracy: 51.8125%
Epoch 10: accuracy: 48.875%
Epoch 11: accuracy: 48.9375%
Epoch 12: accuracy: 52.0%
Epoch 13: accuracy: 49.0625%
Epoch 14: accuracy: 50.875%
Epoch 15: accuracy: 51.5%
Epoch 16: accuracy: 51.125%
Epoch 17: accuracy: 49.125%
Epoch 18: accuracy: 47.75%
Epoch 19: accuracy: 49.875%
Epoch 20: accuracy: 50.6875%
Epoch 21: accuracy: 48.8125%


KeyboardInterrupt: 

# RNN performance on sequence-extractor task

## Baseline: basic LSTM

Epoch 167: accuracy: 98.09375%

In [None]:
m = SequenceModel(sequence_extraction, 10, name='lstm38', cell_size=128, layers=1)
m.setup()
print 'built model'
for i in xrange(300):
    m.train(1)
    m.show_example()

built model
Epoch 0: accuracy: 30.28125%
sequence input: [8, 0, 4, 7, 0, 3, 4, 5]
model responded with: [0 0]
 (expected [4, 7])
Epoch 1: accuracy: 29.09375%
sequence input: [7, 5, 9, 5, 0, 9, 0, 8]
model responded with: [0 0]
 (expected [9, 0])
Epoch 2: accuracy: 28.25%
sequence input: [0, 9, 7, 0, 2, 4, 1, 1]
model responded with: [0 0]
 (expected [9, 7])
Epoch 3: accuracy: 29.65625%
sequence input: [9, 6, 8, 3, 0, 3, 1, 0]
model responded with: [0 0]
 (expected [3, 1])
Epoch 4: accuracy: 29.59375%
sequence input: [9, 0, 2, 0, 5, 2, 9, 5]
model responded with: [0 0]
 (expected [2, 0])
Epoch 5: accuracy: 28.78125%
sequence input: [6, 2, 9, 2, 0, 9, 0, 4]
model responded with: [0 0]
 (expected [9, 0])
Epoch 6: accuracy: 28.65625%
sequence input: [2, 8, 0, 7, 5, 0, 1, 2]
model responded with: [0 0]
 (expected [7, 5])
Epoch 7: accuracy: 31.21875%
sequence input: [0, 7, 0, 2, 6, 8, 4, 7]
model responded with: [0 0]
 (expected [7, 0])
Epoch 8: accuracy: 30.03125%
sequence input: [9, 0, 3, 

## Homemade GRU implementation

In [None]:
def gru_step(input, prev_state, in_size, out_size):
    # returns (output, new_state)
#             w = tf.get_variable('w', 
#                             shape=[patch_size, patch_size, in_channels, out_channels], 
#                             initializer=xavier_initializer())
    weights = tf.get_variable('w', shape=[3, size, size])
    biases = tf.get_variable('b', shape=[2, size], dtype=tf.float32)

def gru(inputs, out_size=128):
    in_size = inputs.get_shape()[-1].value
    state = tf.zeros(size, dtype=tf.float32)
    timesteps = inputs.get_shape()[1].value
    outputs = []
    for step in xrange(timesteps):
        with tf.variable_scope('gru', reuse=(step > 0)):
            output, state = gru_step(inputs[:, i, :], state, in_size, out_size)
            outputs.append(output)
    return tf.stack(outputs, axis=1)

class CustomGRUModel(SequenceModel):
    def model(self, sequence, output_len):
        # TODO

m4 = SequenceModel(sequence_classification, 10, name='non_rnn1')
m3.setup()
print 'built model'
m3.train(100)