# Trial 6: structured sequence modeling

* Create simple parametric time series and try to model them.
* Add structure by constructing a graph between the series and see how it improves the modeling.
* Usage of `tflearn` inspired by [How to do time series prediction using RNNs, TensorFlow and Cloud ML Engine](https://medium.com/google-cloud/how-to-do-time-series-prediction-using-rnns-and-tensorflow-and-cloud-ml-engine-2ad2eeb189e8).

In [None]:
%matplotlib inline

import os
import shutil

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
import tensorflow.contrib.learn as tflearn

# Default size (width, height) of the figures created in this notebook.
plt.rcParams['figure.figsize'] = (17, 5)

In [None]:
# Directory (relative to the notebook) where the TFRecords files are stored.
DATA_DIR = os.path.join(os.pardir, 'data', 'structured_sequence_trial')

## 1 Data generation

In [None]:
SEQ_LEN = 100  # Number of samples in each generated time series.
N_SEQ = 4  # Number of generated time series.

def create_time_series(seq_len, random_state):
    """Generate a sinusoid of `seq_len` samples with random parameters.

    The frequency, amplitude and offset are drawn, in that order, from
    uniform distributions using `random_state` (e.g. a
    `np.random.RandomState`), which makes the series reproducible.
    """
    # The order of the draws determines the values: do not reorder them.
    frequency = random_state.uniform(0.1, 0.6)
    amplitude = random_state.uniform(0.5, 1.5)
    shift = random_state.uniform(-1, 1)
    time = np.arange(seq_len)
    return amplitude * np.sin(frequency * time) + shift

# Draw all the series from a single seeded generator for reproducibility.
rs = np.random.RandomState(42)
data = np.empty((N_SEQ, SEQ_LEN))
for row in data:
    # Each row is a view: fill it in place with a freshly generated series.
    row[...] = create_time_series(SEQ_LEN, rs)
# One row per time series, one column per time step.
data = pd.DataFrame(data)

In [None]:
# Transpose so that each time series is a column, i.e. is drawn as one line.
# The trailing semicolon suppresses the textual output of the cell.
data.T.plot();
plt.savefig('time_series.pdf')
# TODO: hist (presumably a histogram of the values was planned here -- confirm).

## 2 Graph construction

k-NN graph between the time series.

## 3 Data preparation

* Store data in TFRecords files which will be read by the input pipeline.
* Preprocessing can be done here.
* Data augmentation should be done in input pipeline (to save disk space).
* We are doing full batch, i.e. we feed data on the whole graph at once.

In [None]:
N_INPUTS = 10  # Number of consecutive samples used for prediction, i.e. unrolling length.
N_OUTPUTS = 1  # Number of samples, right after the inputs, that the model tries to predict.

def feature(array):
    """Wrap the flattened values of `array` in a float `tf.train.Feature`."""
    flat_values = list(array.reshape(-1))
    float_list = tf.train.FloatList(value=flat_values)
    return tf.train.Feature(float_list=float_list)

def save_dataset(data, filename):
    """Save dataset as TFRecords.

    A window of N_INPUTS + N_OUTPUTS samples is slid, one step at a time,
    along the second axis of `data`. Each window position gives one example
    made of the first N_INPUTS samples of all the series (the inputs) and the
    following N_OUTPUTS samples of all the series (the targets).

    Args:
        data: 2D array indexed as [series, time step].
        filename: name of the TFRecords file, relative to DATA_DIR. The
            containing directory is created if it does not exist.

    Raises:
        AssertionError: if `data` has fewer than N_INPUTS + N_OUTPUTS time
            steps, i.e. if no example can be extracted.
    """
    filename = os.path.join(DATA_DIR, filename)

    # The TFRecords writer does not create missing directories.
    directory = os.path.dirname(filename)
    if directory and not os.path.isdir(directory):
        os.makedirs(directory)

    num_examples = data.shape[1] - N_INPUTS - N_OUTPUTS + 1
    assert num_examples > 0, 'Not enough time steps to extract an example.'
    tf.logging.info('Writing {} examples to {}'.format(num_examples, filename))
    with tf.python_io.TFRecordWriter(filename) as writer:
        for idx in range(num_examples):
            # Targets start right after the last input sample.
            inputs = data[:, idx:idx+N_INPUTS]
            targets = data[:, idx+N_INPUTS:idx+N_INPUTS+N_OUTPUTS]
            example = tf.train.Example(features=tf.train.Features(feature={
                #'graph': feature(graph),  # Adjacency matrix or Laplacian can be stored here.
                'inputs': feature(inputs),
                'targets': feature(targets)}))
            writer.write(example.SerializeToString())

# Chronological split: the first 80% of the time steps are used for training,
# the remaining ones for validation.
TRAINING_LEN = int(0.8 * SEQ_LEN)
save_dataset(data.values[:, :TRAINING_LEN], 'train.tfrecords')
save_dataset(data.values[:, TRAINING_LEN:], 'validation.tfrecords')

## 4 Data loading

Two training schemes:
* Load the whole data up to a certain point in time for training. That is what is done for text (the whole vocabulary graph is used).
* Use some time series (some part of the graph) as training and the others as evaluation.

TF alternative:
* [tf.contrib.slim.dataset](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/slim)

In [None]:
class DataLoader:
    """Queue-based input pipeline which reads examples from TFRecords files.

    Calling an instance adds the input ops to the default graph and returns
    `({'inputs': input_batch}, target_batch)`, where the batches stack
    examples of shape [N_SEQ, N_INPUTS] and [N_SEQ, N_OUTPUTS] respectively.

    Args:
        filenames: list of TFRecords file names, relative to DATA_DIR.
        num_epochs: number of passes over the files before the queue raises
            `tf.errors.OutOfRangeError`.
        batch_size: number of examples per batch. The last batch may be
            smaller.
        read_threads: number of readers dequeuing from the filename queue.
        seed: seed used when shuffling the file names and the examples.
        shuffle: whether to shuffle the file names and the examples. If
            False, examples are batched in the order they are read.
    """

    def __init__(self, filenames, num_epochs=1, batch_size=1, read_threads=1, seed=None,
                 shuffle=True):
        # TODO: the settings could depend on the mode (e.g. tflearn.ModeKeys.TRAIN).
        self.filenames = filenames
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.read_threads = read_threads
        self.seed = seed
        self.shuffle = shuffle

    def _read_and_decode(self, filename_queue):
        """Read one serialized example from the queue and decode it.

        Returns:
            The inputs, of shape [N_SEQ, N_INPUTS], and the targets, of shape
            [N_SEQ, N_OUTPUTS], of a single example.
        """
        reader = tf.TFRecordReader()
        _, example = reader.read(filename_queue)
        # Arrays are stored flattened: parse them as such, then reshape.
        features = {
            'inputs': tf.FixedLenFeature([N_SEQ * N_INPUTS], tf.float32),
            'targets': tf.FixedLenFeature([N_SEQ * N_OUTPUTS], tf.float32),
        }
        example = tf.parse_single_example(example, features)
        inputs = tf.reshape(example['inputs'], [N_SEQ, N_INPUTS])
        targets = tf.reshape(example['targets'], [N_SEQ, N_OUTPUTS])
        return inputs, targets

    def __call__(self):
        """Build the input ops and return `({'inputs': inputs}, targets)`."""
        with tf.name_scope('input_queues'):
            # NOTE: the input queues could be pinned to the CPU with tf.device("/cpu:0").
            filenames = [os.path.join(DATA_DIR, filename) for filename in self.filenames]
            filename_queue = tf.train.string_input_producer(
                    filenames, self.num_epochs, shuffle=self.shuffle, seed=self.seed)

            examples = [self._read_and_decode(filename_queue)
                        for _ in range(self.read_threads)]

            if self.shuffle:
                # Small shuffling buffer. NOTE(review): the original code hints
                # at 10000 as an alternative value -- confirm for larger datasets.
                min_after_dequeue = 10
                capacity = min_after_dequeue + (self.read_threads + 2) * self.batch_size
                input_batch, target_batch = tf.train.shuffle_batch_join(
                        examples, batch_size=self.batch_size, seed=self.seed,
                        capacity=capacity, min_after_dequeue=min_after_dequeue,
                        allow_smaller_final_batch=True)
            else:
                capacity = (self.read_threads + 2) * self.batch_size
                input_batch, target_batch = tf.train.batch_join(
                        examples, batch_size=self.batch_size, capacity=capacity,
                        allow_smaller_final_batch=True)
            return {'inputs': input_batch}, target_batch

Make one pass over the dataset to make sure the input pipeline works.

In [None]:
# Build the input ops with the default settings of DataLoader (a single epoch,
# batches of one example) and keep the tensor of inputs only.
inputs = DataLoader(['train.tfrecords'])()[0]['inputs']

sess = tf.Session()
#sess.run(tf.global_variables_initializer())
# The epoch counter of the filename queue is a local variable: it must be
# initialized before the queue runners are started.
sess.run(tf.local_variables_initializer())

# Start the threads which fill the input queues.
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess, coord)

idx = 0
# Buffer to reassemble the training series from the windows of inputs: the
# example number idx is written in the columns idx to idx+N_INPUTS.
training_data = np.empty((N_SEQ, TRAINING_LEN-N_OUTPUTS))
try:
    while not coord.should_stop():
        # NOTE(review): DataLoader shuffles the examples, so they do not arrive
        # in chronological order -- presumably why the check below is disabled.
        training_data[:, idx:idx+N_INPUTS] = sess.run(inputs)
        idx += 1

# Raised by the queues once the requested number of epochs has been read.
except tf.errors.OutOfRangeError:
    print('Done: {} steps'.format(idx))
finally:
    coord.request_stop()

# Wait for the queue runner threads to stop before closing the session.
coord.join(threads)
sess.close()

#np.testing.assert_allclose(training_data, data.iloc[:, :TRAINING_LEN-N_OUTPUTS])