In [8]:
import tensorflow as tf
import numpy as np
import sys
import pandas as pd
import datetime

# Extend the module search path with the parent directory; presumably that is
# where the project-local `util` package imported below lives.
sys.path.append("../") # parent directory, relative to the working directory
from util.read_data import DataReader
from util.evaluator import ModelEvaluator

# Eager execution is needed by the later cells, which call `.numpy()` on tensors.
tf.enable_eager_execution()


1000 iter, 9 features, train >= 1.1.2012

lstm 20: MSE = 7.512969e+17, R2 = -0.909, confidence interval 95% = (359,183,212 - 514,268,732)
lstm 20: MSE = 1.221964e+17, R2 = 0.737, confidence interval 95% = (-21,071,013 - 17,874,275)

1000 iter, 26 features, train >= 1.1.2000
lstm 20 on test: MSE = 1.007057e+18, R2 = -1.558, confidence interval 95% = (639,482,637 - 819,035,560)

1000 iter, 26 features, train >= 1.1.2012
lstm 20 on test: MSE = 1.092978e+18, R2 = -1.777, confidence interval 95% = (555,998,192 - 743,053,957)

2000 iter, 26 features, train >= 1.1.2000, deeper LSTM
lstm 20 on test: MSE = 8.831816e+17, R2 = -1.244, confidence interval 95% = (526,764,737 - 694,912,253)
lstm 20: MSE = 5.446784e+15, R2 = 0.988, confidence interval 95% = (-8,872,915 - -650,567)

In [19]:
# Read the dataset and split it into train and test (features, volume) pairs.
# NOTE(review): DataReader is project code not visible here; what normalisation
# it applies and how it chooses the train/test boundary must be checked there.
reader = DataReader()
df = reader.read_normalized_data_for_rnn('../data/S&P500.csv')
train_features, train_volume = reader.get_train_data(df)
test_features, test_volume = reader.get_test_data(df)
# The evaluator is handed the reader's label scaler, presumably so metrics can be
# reported in the original (un-normalised) volume units — confirm in ModelEvaluator.
evaluator = ModelEvaluator(reader.label_scaler)

In [20]:
class DataLoader():
    """Samples random fixed-length windows of features with their next-step target.

    Attributes:
        features: Row-aligned feature table (pandas DataFrame or 2-D array).
        volume: Target values aligned row-for-row with ``features``
            (pandas Series or 1-D array).
    """

    def __init__(self, features, volume):
        self.features = features
        self.volume = volume

    def get_batch(self, seq_length, batch_size):
        """Draw ``batch_size`` random windows and the target that follows each one.

        Args:
            seq_length: Number of consecutive rows in every window.
            batch_size: Number of windows to draw (with replacement).

        Returns:
            A tuple ``(seq, next_volume)`` where ``seq`` has shape
            ``(batch_size, seq_length, n_features)`` and ``next_volume`` has
            shape ``(batch_size, 1)``; row ``k`` of ``next_volume`` is the
            target at the position immediately after window ``k``.

        Raises:
            ValueError: If there are not more than ``seq_length`` rows, so no
                window with a following target exists.
        """
        # Convert to plain arrays so that every lookup is positional. Indexing a
        # pandas Series with an integer is label-based, which would disagree
        # with the positional feature slice whenever the index is not 0..n-1.
        feature_rows = np.asarray(self.features)
        targets = np.asarray(self.volume)

        # Never sample past the shorter of the two inputs.
        usable = min(len(feature_rows), len(targets))
        if usable <= seq_length:
            raise ValueError(
                "need more than seq_length=%d rows to draw a window, got %d"
                % (seq_length, usable)
            )

        seq = []
        next_volume = []
        for _ in range(batch_size):
            # The upper bound is exclusive, so start + seq_length <= usable - 1.
            start = np.random.randint(0, usable - seq_length)
            stop = start + seq_length
            seq.append(feature_rows[start:stop])
            next_volume.append(targets[stop])
        return np.array(seq), np.array(next_volume).reshape(-1, 1)


def predict_volume(rnn_model, features, seq_length=None):
    """Run the model over every sliding window of ``features``.

    Args:
        rnn_model: Callable taking an array of shape
            ``(1, seq_length, n_features)`` and returning an object whose
            ``.numpy()`` result can be indexed with ``[0, 0]``.
        features: Row-ordered feature table (pandas DataFrame or 2-D array).
        seq_length: Number of rows per window. When omitted, the
            notebook-level global ``seq_length`` is used, which is what this
            function always read before the parameter existed.

    Returns:
        A ``pd.Series`` with ``len(features) - seq_length`` predictions and a
        fresh default index; entry ``i`` is the prediction made from rows
        ``i .. i + seq_length - 1``. It is empty when there are not more than
        ``seq_length`` rows.

    Raises:
        NameError: If ``seq_length`` is omitted and no global of that name exists.
    """
    if seq_length is None:
        # Backward-compatible fallback to the notebook global.
        try:
            seq_length = globals()["seq_length"]
        except KeyError:
            raise NameError(
                "name 'seq_length' is not defined; pass seq_length explicitly"
            ) from None

    # Convert once instead of calling `.values` on every window; this also
    # lets plain numpy arrays be passed in.
    rows = np.asarray(features)

    volume_pred = []
    for start in range(len(rows) - seq_length):
        # Add a leading batch axis of size 1 for the model.
        window = rows[start:start + seq_length][np.newaxis, ...]
        volume_pred.append(rnn_model(window).numpy()[0, 0])
    return pd.Series(volume_pred)

In [21]:
class RNN(tf.keras.Model):
    """Two stacked LSTM cells followed by a two-layer dense head.

    The model consumes a batch of sequences and emits one value per sequence,
    computed from the second LSTM cell's output at the final timestep.
    """

    def __init__(self):
        super().__init__()
        self.cell1 = tf.nn.rnn_cell.BasicLSTMCell(num_units=512)
        self.cell2 = tf.nn.rnn_cell.BasicLSTMCell(num_units=512)
        # NOTE(review): dense1 has no activation, so dense1 -> dense2 is a
        # composition of two affine maps; confirm a non-linearity was not intended.
        self.dense1 = tf.keras.layers.Dense(units=1024)
        self.dense2 = tf.keras.layers.Dense(units=1)

    def call(self, inputs):
        """Unroll both LSTM cells over time and apply the dense head once.

        Args:
            inputs: Rank-3 batch indexed as ``(batch, time, feature)``. The
                initial states are created as float32, so inputs are
                presumably expected to be float32 as well — confirm.

        Returns:
            The output of ``dense2`` (one unit per sequence) for the last timestep.

        Raises:
            ValueError: If the time dimension is empty.
        """
        # Iterating over the shape tensor and calling `.numpy()` needs eager mode.
        batch_size, seq_length, _ = tf.shape(inputs)
        num_steps = int(seq_length.numpy())
        if num_steps == 0:
            raise ValueError("inputs must contain at least one timestep")

        state1 = self.cell1.zero_state(batch_size=batch_size, dtype=tf.float32)
        state2 = self.cell2.zero_state(batch_size=batch_size, dtype=tf.float32)
        for t in range(num_steps):
            output, state1 = self.cell1(inputs[:, t, :], state1)
            output, state2 = self.cell2(output, state2)

        # The head's result is never fed back into the recurrence and only the
        # final step is returned, so applying it once after the loop gives the
        # same value as applying it at every step.
        return self.dense2(self.dense1(output))

    def predict(self, inputs, temperature=1.):
        """Return the model output for ``inputs`` as a numpy array.

        Args:
            inputs: Same rank-3 batch accepted by ``call``.
            temperature: Unused; kept so existing call sites keep working.

        Returns:
            ``self(inputs)`` converted with ``.numpy()``.
        """
        # The earlier two-name unpack of tf.shape(inputs) raised for the rank-3
        # input that `call` requires, and its result was unused, so it is gone.
        return self(inputs).numpy()


In [22]:
# Training hyperparameters; these globals are read by the cells below.
# Step size handed to the Adam optimizer.
learning_rate = 1e-3
# Number of windows drawn per training batch.
batch_size = 64
# Rows per input window; predict_volume and the evaluation cell read this global too.
seq_length = 20
# Number of optimisation steps performed by the training loop.
num_batches = 500

In [24]:
# Train the model on randomly sampled windows of the training split.
# NOTE(review): no random seed is set, so batches, initial weights and the
# logged losses differ from run to run.
data_loader = DataLoader(train_features,train_volume)
model = RNN()
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
for batch_index in range(num_batches):
    X, y = data_loader.get_batch(seq_length, batch_size)
    # Only the forward pass and the loss need to be recorded by the tape.
    with tf.GradientTape() as tape:
        y_pred = model(X)
        loss = tf.losses.mean_squared_error(labels=y, predictions=y_pred)
    # Progress logging happens outside the taped region.
    if batch_index % 100 == 0:
        print("batch %d: loss %f" % (batch_index, loss.numpy()))
    # Differentiate with respect to the trainable variables only; a
    # non-trainable variable would yield a None gradient.
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(grads_and_vars=zip(grads, model.trainable_variables))

batch 0: loss 0.898700
batch 100: loss 0.957565
batch 200: loss 0.609967
batch 300: loss 0.427986
batch 400: loss 0.406696


In [25]:
# Score the trained model on both splits, train first. The first `seq_length`
# targets of each split are dropped because predict_volume emits its first
# prediction for the row that follows the first full window.
evaluation_runs = (
    ("lstm {} on train", train_features, train_volume),
    ("lstm {} on test", test_features, test_volume),
)
for title_template, split_features, split_volume in evaluation_runs:
    report = evaluator.evaluate(
        title_template.format(seq_length),
        split_volume[seq_length:],
        predict_volume(model, split_features),
    )
    print(report)

lstm 20 on train: MSE = 6.466866e+17, R2 = 0.736, confidence interval 95% = (-86,446,110 - -38,131,200)
lstm 20 on test: MSE = 3.216345e+17, R2 = 0.183, confidence interval 95% = (115,760,305 - 217,232,428)
