Recurrent Neural Network model for regression in sequences
====

In this example we will use the RNNModel to set up an experiment over one of the datasets of the SPiCe (http://spice.lif.univ-mrs.fr/index.php) competition for sequence prediction, held in 2016. We will start by downloading and preprocessing the dataset.

In [1]:
%matplotlib inline

# URL the dataset file is downloaded from.
ORIGIN_URL = 'http://spice.lif.univ-mrs.fr/data/2.spice.train'
# Local directory and file name under which the downloaded dataset is stored.
DATASET_DIR = 'downloads'
DATASET_FILENAME = 'spice_dataset2.txt'

In [2]:
# add parent directory to python path
import sys
sys.path.append('../')

In [3]:
import numpy
import os
import urllib
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
from quick_experiment import utils
utils.safe_mkdir(DATASET_DIR)

In [5]:
def maybe_download():
    """Download the dataset into DATASET_DIR unless the file already exists.

    The file is first retrieved under a temporary name and only renamed to
    its final name once the transfer has finished, so an interrupted download
    is never mistaken for a complete dataset on the next call.
    """
    filename = os.path.join(DATASET_DIR, DATASET_FILENAME)
    if os.path.exists(filename):
        return
    # urlretrieve lives in different modules in Python 2 and Python 3.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve
    partial_filename = filename + '.part'
    try:
        urlretrieve(ORIGIN_URL, partial_filename)
        os.rename(partial_filename, filename)
    finally:
        # Only true when the download or the rename failed: drop the leftover.
        if os.path.exists(partial_filename):
            os.remove(partial_filename)

maybe_download()

The dataset file consists of a series of numerical sequences, one per line, plus a header line that we will ignore. We will try to predict the last element of each sequence.

In [6]:
from sklearn.preprocessing import OneHotEncoder


def get_instances(elements, sequences):
    """One-hot encode all the elements and cut the result into one slice per sequence.

    Args:
        elements: the rows handed to OneHotEncoder.fit_transform.
        sequences: iterable of (start, end) pairs used to slice the encoded rows.

    Returns:
        A list with the encoded rows [start:end] of every pair in sequences.
    """
    one_hot = OneHotEncoder().fit_transform(elements)
    return [one_hot[first:last] for first, last in sequences]


def read_dataset():
    """Read the downloaded dataset file into one-hot instances and labels.

    The first line of the file is a header and is ignored. Every other line
    is split on whitespace: the first value (the sequence length) is dropped,
    the last value is kept as the label of the sequence and the values in
    between are the elements of the sequence.

    Lines with fewer than two values (for instance blank lines) are skipped:
    they have no label, and they would otherwise shift the offsets of every
    following sequence.

    Returns:
        A 2-tuple (instances, labels) of numpy arrays. instances is built
        from the output of get_instances, one entry per sequence; labels
        holds the last value of each line, as the string read from the file.
    """
    with open(os.path.join(DATASET_DIR, DATASET_FILENAME), 'r') as input_file:
        lines = input_file.readlines()[1:]  # Ignore the header
    sequences = []  # (start, end) offsets of each sequence inside elements.
    elements = []
    labels = []
    current_start = 0
    for line in lines:
        values = line.split()
        if len(values) < 2:
            continue
        # We discard the first element (sequence length) and the last one (sequence label)
        sequence_values = values[1:-1]
        current_end = current_start + len(sequence_values)
        sequences.append((current_start, current_end))
        current_start = current_end
        # One single-value row per element, the layout passed on to get_instances.
        elements.extend([int(value)] for value in sequence_values)
        labels.append(values[-1])
    instances = get_instances(elements, sequences)
    return numpy.array(instances), numpy.array(labels)

instances, labels = read_dataset()

We can now create the dataset using the extracted instances and labels

In [295]:
from quick_experiment import dataset
# Re-import the module so that changes made to its source are picked up in this session.
dataset = reload(dataset)

# Arguments for create_samples below. partition_sizes presumably gives the
# fraction of instances assigned to each partition (the values add up to 1)
# -- TODO confirm against SequenceDataset.
samples = 1
partition_sizes = {'train': 0.7, 'test': 0.2, 'validation': 0.1}

splice_dataset = dataset.SequenceDataset()
splice_dataset.create_samples(instances, labels, samples, partition_sizes, use_numeric_labels=True)
# Directory for the experiment logs.
logs_dirname = '../../results/examples/splice/'

In [296]:
# Remove previous directory
import shutil
try:
    shutil.rmtree(logs_dirname)
except OSError:
    # Best effort: the removal failed (e.g. the directory does not exist yet),
    # so there is nothing to clean up.
    pass

In [297]:
from quick_experiment import experiment
# Re-import the modules so that changes made to their source are picked up in this session.
experiment = reload(experiment)
from quick_experiment.models import lstm, mlp
mlp = reload(mlp)
lstm = reload(lstm)

utils.safe_mkdir(logs_dirname)

# Experiment configuration: the model class to use and a dictionary of
# arguments for it (presumably forwarded to the model constructor -- TODO
# confirm in experiment.SampledExperiment).
config = {
    'model': lstm.LSTMModel,
    'model_arguments': {'hidden_layer_size': 20, 'batch_size': 5,
                        'logs_dirname': logs_dirname,
                        'log_values': 100, 'training_epochs': 1000, 'max_num_steps': 10}
}
splice_experiment = experiment.SampledExperiment(splice_dataset, config=config)

In [298]:
# Reset TensorFlow's default graph before running the experiment.
tf.reset_default_graph()
splice_experiment.run()

INFO:root:Creating model for sample 0
INFO:root:Training model


Fitting


INFO:root:Model trained
INFO:root:
	Precision	Recall	F1 Score
mean	0.2215	0.2215	0.2215
std	0.0	0.0	0.0


We are obtaining a very low accuracy, let's see what's going on...

In [299]:
predictions, true = splice_experiment.model.predict('test')

In [300]:
from sklearn.metrics import accuracy_score
accuracy_score(y_true=true, y_pred=predictions)

0.2215

In [301]:
# Show how often each label appears among the true labels...
print numpy.unique(true, return_counts=True)
# ...and the predictions together with how often each label is predicted.
print predictions, numpy.unique(predictions, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), array([370, 336, 235, 438, 415, 415, 836, 221, 499, 235]))
[6 4 6 ..., 6 6 6] (array([3, 4, 5, 6]), array([   1,   79,    6, 3914]))


We don't see much difference between the validation and test accuracy, and the predicted labels are anything but evenly distributed: almost every prediction falls on class 6, the most frequent label. This tells us that the problem is that the network does not have enough information to learn.

Seq2Seq prediction
==

To overcome the problem above, we are now going to train a model that predicts the next element in the sequence. We need to re-process the dataset.

In [6]:
from sklearn.preprocessing import OneHotEncoder


def get_instances(elements, sequences):
    """Encode the elements as one-hot rows and group the rows by sequence.

    Args:
        elements: the rows handed to OneHotEncoder.fit_transform.
        sequences: iterable of (start, end) pairs delimiting each sequence.

    Returns:
        A list holding, for every pair, the encoded rows from start up to
        (not including) end.
    """
    encoded = OneHotEncoder().fit_transform(elements)
    return [encoded[begin:finish] for begin, finish in sequences]


def read_dataset():
    """Read the downloaded dataset file into one-hot encoded sequences.

    The first line of the file is a header and is ignored. Every other line
    is split on whitespace; its first value (the sequence length) is dropped
    and all the remaining values are the elements of the sequence.

    Blank lines are skipped: they hold no sequence, and they would otherwise
    shift the offsets of every following sequence by one.

    Returns:
        A numpy array built from the output of get_instances, with one entry
        per sequence. No labels are returned.
    """
    with open(os.path.join(DATASET_DIR, DATASET_FILENAME), 'r') as input_file:
        lines = input_file.readlines()[1:]  # Ignore the header
    sequences = []  # (start, end) offsets of each sequence inside elements.
    elements = []
    current_start = 0
    for line in lines:
        values = line.split()
        if not values:
            continue
        # We discard the first element (sequence length)
        sequence_values = values[1:]
        current_end = current_start + len(sequence_values)
        sequences.append((current_start, current_end))
        current_start = current_end
        # One single-value row per element, the layout passed on to get_instances.
        elements.extend([int(value)] for value in sequence_values)
    instances = get_instances(elements, sequences)
    return numpy.array(instances)

instances = read_dataset()

In [7]:
from quick_experiment import dataset
# Re-import the module so that changes made to its source are picked up in this session.
dataset = reload(dataset)

# Arguments for create_samples below. partition_sizes presumably gives the
# fraction of instances assigned to each partition (the values add up to 1)
# -- TODO confirm against UnlabeledSequenceDataset.
samples = 1
partition_sizes = {'train': 0.7, 'test': 0.2, 'validation': 0.1}

unlabeled_splice_dataset = dataset.UnlabeledSequenceDataset()
# No labels are passed for this dataset (second argument is None).
unlabeled_splice_dataset.create_samples(instances, None, samples, partition_sizes)

In [8]:
from quick_experiment import experiment
experiment = reload(experiment)

In [9]:
# Directory for the logs of the sequence experiment.
logs_dirname = '../../results/examples/splice/sequence'

# Remove previous directory
import shutil
try:
    shutil.rmtree(logs_dirname)
except OSError:
    # Best effort: the removal failed (e.g. the directory does not exist yet),
    # so there is nothing to clean up.
    pass

# Create the directory again for this run.
utils.safe_mkdir(logs_dirname)

In [10]:
from sklearn import metrics
import logging

class SequenceExperiment(experiment.SampledExperiment):
    """SampledExperiment that reports micro-averaged metrics over sequence elements."""

    def _get_metrics(self, predictions):
        """Log the mean and standard deviation of precision, recall and F1 score.

        Args:
            predictions: iterable of pairs. Each pair is unpacked as
                (true, prediction) and both members are flattened with
                numpy.concatenate before being compared, so they are
                presumably arrays holding one array per sequence -- TODO
                confirm against the caller.
        """
        scores = numpy.array([
            metrics.precision_recall_fscore_support(
                numpy.concatenate(true), numpy.concatenate(prediction),
                average='micro')[:-1]  # Drop the trailing support value.
            for true, prediction in predictions
        ])
        header = '\n\tPrecision\tRecall\tF1 Score\n'
        mean_row = 'mean\t' + '\t'.join([str(value) for value in scores.mean(axis=0)])
        std_row = '\nstd\t' + '\t'.join([str(value) for value in scores.std(axis=0)])
        logging.info(header + mean_row + std_row)

In [11]:
from quick_experiment.models import seq_lstm
# Experiment configuration: the model class to use and a dictionary of
# arguments for it (presumably forwarded to the model constructor -- TODO
# confirm in experiment.SampledExperiment).
config = {
    'model': seq_lstm.SeqLSTMModel,
    'model_arguments': {'hidden_layer_size': 20, 'batch_size': 25,
                        'logs_dirname': logs_dirname, 'learning_rate': 0.01,
                        'log_values': 100, 'training_epochs': 1000, 'max_num_steps': 20}
}
sequence_splice_experiment = SequenceExperiment(unlabeled_splice_dataset, config=config)

In [12]:
# Reset TensorFlow's default graph before running the experiment.
tf.reset_default_graph()
sequence_splice_experiment.run()

INFO:root:Creating model for sample 0
INFO:root:Training model


Fitting
Classifier loss at step 100 (0.06s): 2.32234048843
Validation performance 0.201591193676
Classifier loss at step 200 (0.09s): 2.29971575737
Validation performance 0.206646263599
Classifier loss at step 300 (0.09s): 2.17955088615
Validation performance 0.18696449697
Classifier loss at step 400 (0.09s): 2.16690039635
Validation performance 0.182362303138
Classifier loss at step 500 (0.09s): 2.16062998772
Validation performance 0.180122405291
Classifier loss at step 600 (0.09s): 2.15684652328
Validation performance 0.177772343159
Classifier loss at step 700 (0.09s): 2.15424728394
Validation performance 0.176242351532
Classifier loss at step 800 (0.09s): 2.15257668495
Validation performance 0.175250917673
Classifier loss at step 900 (0.09s): 2.1512966156
Validation performance 0.174859240651


INFO:root:Model trained
INFO:root:
	Precision	Recall	F1 Score
mean	0.179975294349	0.179975294349	0.179975294349
std	0.0	0.0	0.0


In [13]:
predictions, true = sequence_splice_experiment.model.predict('test')

In [14]:
predictions[:10]

array([ array([  0.,   7.,   3.,   8.,   5.,   8.,   8.,   7.,   8.,   2.,   8.,
         5.,   5.,   7.,   4.,   0.,   6.,   8.,   8.,   1.,   5.,   1.,
         0.,   6.,   0.,   2.,   8.,   8.,   6.,   5.,   1.,   1.,   6.,
        10.]),
       array([  6.,   3.,   1.,   4.,   6.,   0.,   5.,   6.,   4.,   8.,   5.,
         9.,   6.,   4.,   6.,   5.,   3.,   0.,   5.,   0.,   6.,   6.,
         4.,   6.,   3.,   2.,   6.,   6.,   2.,   3.,   5.,   6.,   5.,
         0.,   4.,   3.,   6.,   4.,   9.,   9.,   4.,   2.,   4.,   3.,
         0.,   6.,   4.,   8.,   8.,   1.,   4.,   8.,   3.,   8.,   6.,
         4.,  10.]),
       array([  1.,   2.,   6.,   2.,   3.,   0.,   6.,   4.,   8.,   0.,   6.,
         4.,   6.,   5.,   3.,   6.,   3.,   3.,   3.,   0.,   7.,   4.,
         5.,   8.,   4.,   6.,   6.,   3.,   1.,   0.,   4.,   2.,   1.,
         5.,   7.,   9.,   1.,   5.,   4.,   6.,   9.,   8.,   3.,   6.,
        10.]),
       array([  8.,   8.,   4.,   0.,   1.,   5.,  

In [15]:
true[:10]

array([ array([ 3.,  6.,  6.,  6.,  3.,  8.,  6.,  6.,  8.,  3.,  6.,  6.,  6.,
        3.,  8.,  6.,  6.,  3.,  3.,  6.,  3.,  6.,  6.,  6.,  3.,  3.,
        6.,  6.,  8.,  3.,  6.,  6.,  6.,  3.]),
       array([ 6.,  6.,  6.,  3.,  8.,  6.,  6.,  8.,  3.,  6.,  6.,  6.,  3.,
        3.,  6.,  6.,  8.,  3.,  6.,  6.,  6.,  6.,  6.,  3.,  3.,  6.,
        6.,  8.,  3.,  8.,  6.,  6.,  3.,  3.,  6.,  6.,  6.,  3.,  3.,
        6.,  6.,  6.,  6.,  3.,  3.,  6.,  6.,  8.,  3.,  8.,  6.,  6.,
        3.,  3.,  6.,  6.,  6.]),
       array([ 8.,  3.,  8.,  6.,  6.,  3.,  3.,  6.,  6.,  8.,  3.,  6.,  6.,
        6.,  3.,  3.,  6.,  6.,  8.,  3.,  6.,  3.,  8.,  6.,  6.,  8.,
        3.,  8.,  6.,  6.,  3.,  3.,  8.,  6.,  6.,  3.,  3.,  6.,  6.,
        6.,  6.,  3.,  8.,  6.,  6.]),
       array([ 3.,  6.,  6.,  6.,  3.,  8.,  6.,  6.,  3.,  3.,  6.,  6.,  6.,
        3.,  3.,  6.,  6.,  6.,  3.,  3.,  3.,  6.,  6.,  8.,  3.,  8.,
        6.,  6.,  3.,  3.,  6.]),
       array([ 6.,  8.,

In [16]:
# Flatten the per-sequence arrays into one array of true elements and one of
# predicted elements.
all_true = numpy.concatenate(true)
all_predicted = numpy.concatenate(predictions)

---
