Recurrent Neural Network model for regression in sequences
====

In this example we will use the RNNModel to set up an experiment over one of the datasets of the SPiCe (http://spice.lif.univ-mrs.fr/index.php) sequence prediction competition, held in 2016. We will start by downloading and preprocessing the dataset.

In [None]:
%matplotlib inline

# Remote location of the training file (named 2.spice.train on the server).
ORIGIN_URL = 'http://spice.lif.univ-mrs.fr/data/2.spice.train'
# Local directory where the downloaded file is stored.
DATASET_DIR = 'downloads'
# Name given to the downloaded file inside DATASET_DIR.
DATASET_FILENAME = 'spice_dataset2.txt'

In [None]:
# add parent directory to python path
import sys
sys.path.append('../')

In [None]:
import numpy
import os
import urllib
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import utils
utils.safe_mkdir(DATASET_DIR)

In [None]:
def maybe_download():
    """Download the dataset file unless it already exists locally.

    The file is fetched from ORIGIN_URL and stored as
    DATASET_DIR/DATASET_FILENAME. The download is written to a temporary
    '.part' file first and only renamed to its final name once it has
    finished, so an interrupted transfer is never mistaken for a complete
    dataset on the next run.
    """
    filename = os.path.join(DATASET_DIR, DATASET_FILENAME)
    if os.path.exists(filename):
        return
    # urlretrieve lives in a different module depending on the Python version.
    try:
        from urllib import urlretrieve  # Python 2
    except ImportError:
        from urllib.request import urlretrieve  # Python 3
    partial_filename = filename + '.part'
    try:
        urlretrieve(ORIGIN_URL, partial_filename)
        os.rename(partial_filename, filename)
    finally:
        # After a successful rename the partial file is gone; it only
        # remains (and is removed here) when the download failed midway.
        if os.path.exists(partial_filename):
            os.remove(partial_filename)

maybe_download()

The dataset file consists of a series of numerical sequences, one per line, preceded by a header line that we will ignore. We will try to predict the last element of each sequence.

In [None]:
from sklearn.preprocessing import OneHotEncoder



def get_instances(elements, sequences):
    """One-hot encode all the elements and slice them into per-sequence chunks.

    Args:
        elements: list of single-item lists, one per symbol of the dataset.
        sequences: list of (start, end) offsets into elements, one pair per
            sequence.

    Returns:
        A list with one encoded matrix per sequence.
    """
    one_hot = OneHotEncoder().fit_transform(elements)
    return [one_hot[first:last] for first, last in sequences]


def read_dataset():
    """Read the downloaded dataset, using the last token of each line as label.

    Every line after the header holds whitespace-separated integers: the
    sequence length, the symbols of the sequence and, last, the symbol that
    is used as label. Lines with fewer than two tokens (blank lines, or a
    length with no symbols) have no label and are skipped; counting them
    would shift the offsets of every following sequence.

    Returns:
        A 2-tuple (instances, labels). instances is a numpy array with the
        one-hot encoded matrix of each sequence, as built by get_instances.
        labels is a numpy array with the last token of each line, kept as
        the string read from the file.
    """
    sequences = []  # (start, end) offsets of each sequence inside elements.
    elements = []
    labels = []
    current_start = 0
    with open(os.path.join(DATASET_DIR, DATASET_FILENAME), 'r') as input_file:
        next(input_file, None)  # Ignore the header
        for line in input_file:
            values = line.split()
            if len(values) < 2:
                continue
            # We discard the first element (sequence length) and the last
            # one (sequence label).
            symbols = values[1:-1]
            sequences.append((current_start, current_start + len(symbols)))
            current_start += len(symbols)
            # The encoder expects one row per symbol, hence the nested lists.
            elements.extend([int(value)] for value in symbols)
            labels.append(values[-1])
    instances = get_instances(elements, sequences)
    return numpy.array(instances), numpy.array(labels)

instances, labels = read_dataset()

We can now create the dataset using the extracted instances and labels

In [None]:
import dataset
# Reload the module so that changes made to it are picked up without
# restarting the notebook kernel.
dataset = reload(dataset)

# Number of samples to create and the size of each partition
# (presumably fractions of the dataset; they add up to 1).
samples = 1
partition_sizes = {'train': 0.7, 'test': 0.2, 'validation': 0.1}

splice_dataset = dataset.SequenceDataset()
splice_dataset.create_samples(instances, labels, samples, partition_sizes, use_numeric_labels=True)
# Directory passed to the model as 'logs_dirname' in the experiment config.
logs_dirname = '../../results/examples/splice/'

In [None]:
# Remove the logs directory left by a previous run, if any.
import shutil
try:
    shutil.rmtree(logs_dirname)
except OSError:
    # Nothing to clean up (e.g. the directory does not exist yet).
    pass

In [None]:
import experiment
# Reload the project modules so that changes made to them are picked up
# without restarting the notebook kernel.
experiment = reload(experiment)
from models import lstm, mlp
mlp = reload(mlp)
lstm = reload(lstm)

# Recreate the logs directory before the experiment uses it.
utils.safe_mkdir(logs_dirname)

# Model class to use and the keyword arguments given to it.
# NOTE(review): mlp is imported and reloaded but not used in this config.
config = {
    'model': lstm.LSTMModel,
    'model_arguments': {'hidden_layer_size': 20, 'batch_size': 500,
                        'logs_dirname': logs_dirname,
                        'log_values': 100, 'training_epochs': 1000, 'max_num_steps': 10}
}
splice_experiment = experiment.SampledExperiment(splice_dataset, config=config)

In [None]:
# Start from a clean TensorFlow default graph so that re-running this cell
# does not build on top of a graph left by a previous run.
tf.reset_default_graph()
splice_experiment.run()

We are obtaining a very low accuracy, let's see what's going on...

In [None]:
predictions, true = splice_experiment.model.predict('test')

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_true=true, y_pred=predictions)

In [None]:
# Compare how often each label appears in the true labels and in the
# predictions.
print numpy.unique(true, return_counts=True)
print predictions, numpy.unique(predictions, return_counts=True)

We don't see much difference between the validation and test accuracy, and the predicted labels are "evenly" distributed. This tells us that the problem is that the network does not have enough information to learn.

Seq2Seq prediction
==

To overcome the problem above, we are now going to train a model that predicts the next element of the sequence. We need to re-process the dataset.

In [61]:
from sklearn.preprocessing import OneHotEncoder


def get_instances(elements, sequences):
    """Encode the elements as one-hot vectors and group them by sequence.

    Args:
        elements: list of single-item lists, one per symbol of the dataset.
        sequences: list of (start, end) offsets into elements, one pair per
            sequence.

    Returns:
        A list with the encoded rows of each sequence, in the given order.
    """
    encoded = OneHotEncoder().fit_transform(elements)
    return [encoded[begin:finish] for begin, finish in sequences]


def read_dataset():
    """Read the downloaded dataset keeping every symbol of each sequence.

    Every line after the header holds whitespace-separated integers: the
    sequence length followed by the symbols of the sequence. Blank lines
    are skipped; counting them would shift the offsets of every following
    sequence.

    Returns:
        A numpy array with the one-hot encoded matrix of each sequence, as
        built by get_instances. No labels are returned.
    """
    sequences = []  # (start, end) offsets of each sequence inside elements.
    elements = []
    current_start = 0
    with open(os.path.join(DATASET_DIR, DATASET_FILENAME), 'r') as input_file:
        next(input_file, None)  # Ignore the header
        for line in input_file:
            values = line.split()
            if not values:
                continue
            # We discard the first element (sequence length)
            symbols = values[1:]
            sequences.append((current_start, current_start + len(symbols)))
            current_start += len(symbols)
            # The encoder expects one row per symbol, hence the nested lists.
            elements.extend([int(value)] for value in symbols)
    instances = get_instances(elements, sequences)
    return numpy.array(instances)

instances = read_dataset()

In [62]:
instances

array([ <44x10 sparse matrix of type '<type 'numpy.float64'>'
	with 44 stored elements in Compressed Sparse Row format>,
       <23x10 sparse matrix of type '<type 'numpy.float64'>'
	with 23 stored elements in Compressed Sparse Row format>,
       <13x10 sparse matrix of type '<type 'numpy.float64'>'
	with 13 stored elements in Compressed Sparse Row format>,
       ...,
       <21x10 sparse matrix of type '<type 'numpy.float64'>'
	with 21 stored elements in Compressed Sparse Row format>,
       <25x10 sparse matrix of type '<type 'numpy.float64'>'
	with 25 stored elements in Compressed Sparse Row format>,
       <37x10 sparse matrix of type '<type 'numpy.float64'>'
	with 37 stored elements in Compressed Sparse Row format>], dtype=object)

In [63]:
import dataset
# Reload the module so that changes made to it are picked up without
# restarting the notebook kernel.
dataset = reload(dataset)

# Number of samples to create and the size of each partition
# (presumably fractions of the dataset; they add up to 1).
samples = 1
partition_sizes = {'train': 0.7, 'test': 0.2, 'validation': 0.1}

unlabeled_splice_dataset = dataset.UnlabeledSequenceDataset()
# No explicit labels are given here (None is passed in their place).
unlabeled_splice_dataset.create_samples(instances, None, samples, partition_sizes)

In [66]:
import experiment
experiment = reload(experiment)
from models import lstm
lstm = reload(lstm)

In [67]:
# Logs directory for the sequence prediction experiment; created if missing.
# NOTE(review): the config built below passes 'logs_dirname': None, so this
# directory does not appear to be used by the model - confirm.
logs_dirname = '../../results/examples/splice/sequence'
utils.safe_mkdir(logs_dirname)

In [84]:
from sklearn import metrics
import logging

class SequenceExperiment(experiment.SampledExperiment):
    """SampledExperiment that reports micro-averaged metrics over sequences."""

    def _get_metrics(self, predictions):
        """Logs the mean and standard deviation of the metrics.

        Precision, recall and F1 score are computed once for every element
        of predictions and then aggregated.

        Args:
            predictions: iterable of (true, prediction) pairs (presumably
                one pair per sample of the dataset). Each member of a pair
                is a sequence of 1-D label arrays, which are concatenated
                into a single flat array before scoring.
        """
        metric_values = numpy.array([
            # The last value returned is the support, which is not reported.
            metrics.precision_recall_fscore_support(
                numpy.concatenate(true), numpy.concatenate(prediction),
                average='micro'
            )[:-1]
            for true, prediction in predictions
        ])
        if metric_values.size == 0:
            # Without any pair the mean is a scalar NaN and building the
            # report below would fail.
            logging.warning('No predictions to compute metrics from.')
            return
        means = '\t'.join([str(x) for x in metric_values.mean(axis=0)])
        deviations = '\t'.join([str(x) for x in metric_values.std(axis=0)])
        report = ('\n\tPrecision\tRecall\tF1 Score\n' + 'mean\t' + means +
                  '\nstd\t' + deviations)
        logging.info(report)

In [89]:
# Model class to use and the keyword arguments given to it.
# 'logs_dirname' is None here, unlike in the previous experiment.
config = {
    'model': lstm.SeqPredictionModel,
    'model_arguments': {'hidden_layer_size': 20, 'batch_size': 500,
                        'logs_dirname': None,
                        'log_values': 100, 'training_epochs': 1000, 'max_num_steps': 20}
}
sequence_splice_experiment = SequenceExperiment(unlabeled_splice_dataset, config=config)

In [90]:
# Start from a clean TensorFlow default graph so that the graph built for
# the previous experiment is not reused.
tf.reset_default_graph()
sequence_splice_experiment.run()

Tensor("Mean:0", shape=(), dtype=float32)


INFO:root:Classifier loss at step 50000: 1.44864821434
INFO:root:Validation accuracy 0.0855234563351
INFO:root:Classifier loss at step 100000: 1.49653470516
INFO:root:Validation accuracy 0.0924467816949
INFO:root:Classifier loss at step 150000: 1.4352465868
INFO:root:Validation accuracy 0.103144004941
INFO:root:Classifier loss at step 200000: 1.53812026978
INFO:root:Validation accuracy 0.117533668876
INFO:root:Classifier loss at step 250000: 1.4862190485
INFO:root:Validation accuracy 0.131190270185
INFO:root:Classifier loss at step 300000: 1.46599245071
INFO:root:Validation accuracy 0.139905512333
INFO:root:Classifier loss at step 350000: 1.39895987511
INFO:root:Validation accuracy 0.144276723266
INFO:root:Classifier loss at step 400000: 1.47981023788
INFO:root:Validation accuracy 0.148104906082
INFO:root:Classifier loss at step 450000: 1.50725269318
INFO:root:Validation accuracy 0.152394652367
INFO:root:
	Precision	Recall	F1 Score
mean	0.155376909113	0.155376909113	0.155376909113
std	

In [92]:
predictions, true = sequence_splice_experiment.model.predict('test')

In [94]:
predictions[:10]

array([ array([ 2,  6,  1, 10,  3,  7,  1,  6,  9,  8,  3,  3,  9, 10,  8, 10,  4]),
       array([ 6,  6,  1,  3,  6,  9,  2,  8,  9,  8,  8,  6, 10, 10,  7,  6,  7,
        6,  5, 10]),
       array([ 6,  6,  2,  4, 10,  9,  5,  2,  9,  9,  3,  6, 10,  6,  8,  6,  4,
        5,  6, 10]),
       array([ 8,  6,  3,  4,  3,  5,  8,  0,  5, 10,  8,  4,  4,  2,  8,  3,  1,
        7,  4, 10]),
       array([6, 6, 1, 4, 6, 8, 6, 2]),
       array([ 6,  6,  3,  6,  3,  5,  1,  8,  9,  8,  3,  6,  9, 10,  7,  6,  4,
        6,  6, 10]),
       array([ 8,  4,  2,  4,  6,  9,  5,  8,  5,  3,  8, 10, 10,  0,  8,  0,  2,
        5,  6, 10]),
       array([6, 6, 3, 4, 3, 8, 8, 1, 9, 8]),
       array([ 7,  6,  7,  3, 10,  9,  2,  8,  5,  3,  3,  6,  0,  5,  8, 10,  4,
        6,  1, 10]),
       array([ 8,  0,  3, 10,  3,  0,  6,  6,  5])], dtype=object)

In [95]:
true[:10]

array([ array([  8.,   6.,   8.,   0.,   2.,   6.,   6.,   6.,   6.,   6.,   6.,
         8.,   3.,   7.,   6.,   7.,  10.]),
       array([  5.,   7.,   4.,   6.,   5.,   9.,   9.,   6.,   9.,   5.,   3.,
         5.,   2.,   0.,   3.,   2.,   0.,   6.,   5.,  10.]),
       array([  1.,   5.,   5.,   8.,   5.,   9.,   6.,   8.,   8.,   3.,   1.,
         6.,   7.,   2.,   3.,   9.,   4.,   6.,   5.,  10.]),
       array([  3.,   3.,   5.,   6.,   3.,   1.,   4.,   7.,   1.,   8.,   5.,
         7.,   6.,   2.,   8.,   8.,   2.,   6.,   6.,  10.]),
       array([  8.,   7.,   4.,   6.,   3.,   1.,   5.,  10.]),
       array([  3.,   8.,   2.,   8.,   0.,   6.,   6.,   6.,   8.,   0.,   6.,
         6.,   3.,   5.,   2.,   6.,   8.,   0.,   8.,  10.]),
       array([  8.,   5.,   2.,   8.,   5.,   0.,   0.,   9.,   1.,   9.,   1.,
         5.,   7.,   7.,   5.,   1.,   6.,   4.,   2.,  10.]),
       array([  0.,   3.,   4.,   6.,   1.,   9.,   2.,   6.,   5.,  10.]),
       array([  7.,

In [96]:
unlabeled_splice_dataset._labels[:10]

array([ array([ 6,  4,  0,  8,  8,  8,  5,  8,  9,  4,  0,  3,  9,  6,  2,  5,  3,
        9,  8,  4,  5,  0,  7,  4,  4,  8,  4,  2,  8,  7,  6,  3,  7,  0,
        1,  3,  7,  6,  4,  8,  1,  4,  7, 10]),
       array([ 6,  0,  1,  6,  4,  3,  9,  1,  2,  8,  8,  3,  6,  4,  4,  9,  5,
        0,  6,  3,  2,  0, 10]),
       array([ 1,  8,  4,  6,  7,  4,  1,  5,  7,  1,  9,  9, 10]),
       array([ 9,  0,  0,  8,  1,  6,  4,  4,  0,  4,  6,  8,  2,  2,  2,  8,  4,
        8,  9,  7,  0,  6,  3,  0,  5,  8,  1,  3,  3,  4,  6,  1,  5,  4,
        4,  1,  5,  6,  6,  6,  8,  8,  3,  8,  5,  3,  5,  6,  6,  7,  0,
        5,  2,  7,  5,  5,  3,  6,  9,  0,  6,  5,  2,  4,  5,  2,  6,  3,
        6,  1,  3,  2,  6,  3,  4,  6,  7,  6,  8,  8,  6,  4,  7,  3,  4,
        7,  8,  8,  7,  6,  0,  6,  3, 10]),
       array([ 0,  9,  0,  6,  4,  3,  8,  2,  7,  4,  8,  8,  6,  0,  5,  8, 10]),
       array([ 6,  6,  9,  8,  6,  8,  9,  0,  1,  6,  6,  8,  5, 10]),
       array([ 8,  3,  9,  