Recurrent Neural Network model for regression in sequences
====

In this example we will use the RNNModel to set up an experiment on one of the datasets of the Spice (http://spice.lif.univ-mrs.fr/index.php) competition for sequence prediction, held in 2016. We will start by downloading and preprocessing the dataset.

In [3]:
# Remote location of the Spice training file fetched by maybe_download().
ORIGIN_URL = 'http://spice.lif.univ-mrs.fr/data/1.spice.train'
# Local directory and file name under which the downloaded dataset is stored.
DATASET_DIR = 'downloads'
DATASET_FILENAME = 'spice_dataset.txt'

In [4]:
# add parent directory to python path
import sys
sys.path.append('../')

In [5]:
import numpy
import os
import urllib

In [6]:
import utils
# NOTE(review): presumably creates DATASET_DIR when it does not exist yet -- confirm in utils.py.
utils.safe_mkdir(DATASET_DIR)

In [7]:
def maybe_download():
    """Download the dataset unless it is already stored locally.

    The file is fetched from ORIGIN_URL and saved as
    DATASET_DIR/DATASET_FILENAME. When that file already exists nothing is
    downloaded.

    The transfer is written to a temporary ``.part`` file first and only
    renamed to its final name once it has completed, so an interrupted
    download never leaves a truncated file that later calls would mistake
    for the complete dataset.

    Returns:
        None.

    Raises:
        Whatever ``urlretrieve`` raises when the download fails; the
        temporary file is removed before the error propagates.
    """
    filename = os.path.join(DATASET_DIR, DATASET_FILENAME)
    if os.path.exists(filename):
        return

    # urlretrieve moved between Python 2 and Python 3; support both.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    partial_filename = filename + '.part'
    try:
        urlretrieve(ORIGIN_URL, partial_filename)
        os.rename(partial_filename, filename)
    finally:
        # After a successful rename the temporary file is gone; it only
        # still exists here when the download or the rename failed.
        if os.path.exists(partial_filename):
            os.remove(partial_filename)

# Fetch the dataset file now (returns immediately when it is already on disk).
maybe_download()

The dataset file consists of a series of numerical sequences, one per line, preceded by a header line that we will ignore. We will try to predict the last element of each sequence.

In [8]:
def _as_sequence_array(rows):
    """Pack a list of sequences into a numpy array, tolerating ragged input."""
    if len(set(len(row) for row in rows)) <= 1:
        # All rows have the same length (or there are no rows), so a regular
        # array is well defined.
        return numpy.array(rows)
    # Rows of different lengths: recent NumPy releases refuse to build such
    # an array implicitly, so fill a 1-D object array with one Python list
    # per sequence.
    packed = numpy.empty(len(rows), dtype=object)
    for index, row in enumerate(rows):
        packed[index] = row
    return packed


def read_dataset():
    """Read the dataset stored in DATASET_DIR/DATASET_FILENAME.

    The first line of the file (the header) is ignored and blank lines are
    skipped. Every other line is split on whitespace and each value is
    converted to int and wrapped in a one-element list.

    Returns:
        A tuple ``(instances, labels)`` of numpy arrays. ``instances`` holds
        every sequence without its last element; when the sequences differ
        in length it is a 1-D object array with one list per sequence.
        ``labels`` holds the last element of every sequence, each one a
        one-element list.

    Raises:
        ValueError: if a value in the file cannot be converted to int.
    """
    with open(os.path.join(DATASET_DIR, DATASET_FILENAME), 'r') as input_file:
        lines = input_file.readlines()[1:]  # Ignore the header
    # Split lines and convert numbers to int. Blank lines (for instance a
    # trailing empty line) carry no sequence and would have no last element
    # to use as label, so they are dropped here.
    sequences = [[[int(value)] for value in line.split()]
                 for line in lines if line.strip()]
    instances = [sequence[:-1] for sequence in sequences]
    labels = [sequence[-1] for sequence in sequences]
    return _as_sequence_array(instances), numpy.array(labels)

In [9]:
# Parse the downloaded file into the input sequences and their targets
# (the last element of each sequence).
instances, labels = read_dataset()

We can now create the dataset using the extracted instances and labels.

In [10]:
import dataset
# reload() re-executes the module so recent edits to dataset.py take effect
# in this session.
dataset = reload(dataset)

# NOTE(review): passed straight to create_samples(); presumably the number of
# sampled partitions to build -- confirm in dataset.py.
samples = 4
# Fraction of the data assigned to each partition; the fractions add up to 1.
partition_sizes = {'train': 0.7, 'test': 0.2, 'validation': 0.1}

splice_dataset = dataset.SequenceDataset()
splice_dataset.create_samples(instances, labels, samples, partition_sizes, use_numeric_labels=True)

In [11]:
import experiment
# reload() re-executes the module so recent edits to experiment.py take
# effect in this session.
experiment = reload(experiment)



In [12]:
import tensorflow as tf
# NOTE(review): presumably makes sample 0 the active partition set of the
# dataset -- confirm in dataset.py.
splice_dataset.set_current_sample(0)
batch_size = 5

In [69]:
import random
# Toy batch of 5 sequences; each one is range(1, k) with k drawn from 3..8,
# so their lengths usually differ.
a = numpy.array([range(1, random.randint(3, 8))
                       for _ in range(5)])
print a, a.shape
# Fails with the ValueError recorded below: `a` has shape (5,), so there is
# no axis 1 to apply the function along.
print numpy.apply_along_axis(lambda x: len(x), 1, a)

[[1, 2, 3] [1, 2, 3, 4, 5, 6, 7] [1, 2, 3, 4, 5, 6, 7] [1, 2, 3, 4, 5]
 [1, 2, 3, 4]] (5,)


ValueError: axis must be less than arr.ndim; axis=1, rank=1.

In [74]:
# numpy.vectorize applies len to every element of `a`, giving one length per
# sequence.
vf = numpy.vectorize(lambda x: len(x))
lengths = vf(a)
print a, lengths

[[1, 2, 3] [1, 2, 3, 4, 5, 6, 7] [1, 2, 3, 4, 5, 6, 7] [1, 2, 3, 4, 5]
 [1, 2, 3, 4]] [3 7 7 5 4]


In [80]:
# Zero matrix with 5 rows (one per sequence in `a`) and as many columns as
# the longest sequence.
pad = numpy.zeros((5, max(lengths)))
print [range(x) for x in lengths]
# First 5 columns of row 3; still all zeros because nothing has been copied
# into `pad` yet.
pad[3,:5]

[[0, 1, 2], [0, 1, 2, 3, 4, 5, 6], [0, 1, 2, 3, 4, 5, 6], [0, 1, 2, 3, 4], [0, 1, 2, 3]]


array([ 0.,  0.,  0.,  0.,  0.])

In [42]:
# Check that numpy.zeros returns a numpy.ndarray instance.
isinstance(numpy.zeros(10), numpy.ndarray)

True

In [83]:
# Attempt to locate [1, 2, 3] inside `a`; the recorded output is an empty
# index array, so this comparison does not find the list.
numpy.where(a == [1,2,3])

  if __name__ == '__main__':


(array([], dtype=int64),)

In [84]:
# Membership test for the list [1, 2, 3] in `a`; the recorded output is False.
[1,2,3] in a

  if __name__ == '__main__':


False

In [88]:
# Draw 5 distinct values from 0..9 (sampling without replacement).
random.sample(k=5, population=range(10))

[0, 7, 1, 6, 5]