In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

In [29]:
def strided_window(a, window_size, step_size):
    """
    Builds a read-only sliding-window view over a 1-D numpy array.
    Row i of the result is a[i*step_size : i*step_size + window_size].
    :a: a 1-D numpy array
    :window_size: number of elements in each window (>= 0)
    :step_size: number of elements between the starts of consecutive windows (>= 1)
    :returns: read-only array of shape (nrows, window_size); nrows is 0 when a is too short for one window
    :raises ValueError: if a is not 1-D, window_size is negative or step_size is smaller than 1
    """
    #https://stackoverflow.com/questions/40084931/taking-subarrays-from-numpy-array-with-given-stride-stepsize/40085052#40085052
    if a.ndim != 1:
        #strides[0] is the per-element stride only for 1-D input; as_strided does no bounds
        #checking, so any other rank would silently read memory outside the array
        raise ValueError('strided_window expects a 1-D array, got {} dimensions'.format(a.ndim))
    if window_size < 0 or step_size < 1:
        #a non-positive step would produce zero/negative strides and out-of-bounds reads
        raise ValueError('window_size must be >= 0 and step_size must be >= 1')
    #clamp at zero: an input shorter than one window yields no rows instead of a negative shape
    nrows = max(0, ((a.size-window_size)//step_size)+1)
    n     = a.strides[0]
    return np.lib.stride_tricks.as_strided(a, shape=(nrows,window_size), strides=(step_size*n,n), writeable=False)

def generate_view(ohlcv_data, lookback):
    """
    Turns a 2-D bar matrix into overlapping, flattened lookback windows.
    Row i of the result holds rows i .. i+lookback-1 of ohlcv_data laid end to end,
    so the result has shape (nrows-lookback, ncols*lookback).
    :ohlcv_data: a numpy array with shape (n,m); should be C-contiguous for the result to be a pure view
    :lookback: an integer specifying how many periods to include in each row
    """
    n_features = ohlcv_data.shape[1]
    #ravel() returns a view for C-contiguous input and falls back to a copy otherwise
    flat = ohlcv_data.ravel()
    #advance one full row (n_features values) per window, each window spanning lookback rows
    windows = strided_window(flat, window_size=n_features*lookback, step_size=n_features)
    #drop the final window: it ends on the last row, leaving no later row to pair it with
    return windows[:-1]


def get_raw_data(year):
    """
    Loads one year of SPY trades from ./Polygon/Raw/SPY_<year>.csv.
    Only the columns 't' (epoch nanoseconds, used as index), 'p' and 's' are read.
    :year: value substituted into the file name
    :returns: DataFrame with a US/Eastern DatetimeIndex, limited to 09:30:00-16:00:00 (both ends included)
    """
    csv_path     = './Polygon/Raw/SPY_{}.csv'.format(year)
    column_types = {'t':np.int64, 'p':np.float64, 's':np.float64}
    trades = pd.read_csv(csv_path, engine='c', index_col=['t'], usecols = ['t','p','s'], dtype=column_types)
    #epoch nanoseconds -> naive timestamps, which are then declared UTC and shifted to
    #US/Eastern (the conversion accounts for daylight savings)
    naive_index  = pd.to_datetime(trades.index, unit='ns')
    trades.index = naive_index.tz_localize('UTC').tz_convert('US/Eastern')
    #keep regular trading hours only
    return trades.between_time('09:30:00', '16:00:00')

def calc_ohlcv(df, period):
    """
    Resamples trades into open/high/low/close/volume bars.
    :df: DataFrame with a DatetimeIndex, a price column 'p' and a size column 's'
    :period: pandas offset alias giving the bar length
    :returns: new DataFrame indexed by the resampled time bins with columns
              open, high, low, close, volume; df itself is not modified
    """
    #period = microseconds(U) | milliseconds(L) | seconds(S) | minutes(T) | hour(H)
    #(newer pandas releases spell these 'us' | 'ms' | 's' | 'min' | 'h')
    temp = df.resample(period).agg({'p': 'ohlc', 's': 'sum'})
    temp.columns = ['open','high','low','close','volume']
    #bins without trades have NaN prices (their volume sums to 0): carry the previous close forward.
    #The result is assigned back rather than calling fillna(inplace=True) on temp['close'],
    #because that chained in-place call does not reach temp under pandas Copy-on-Write.
    temp['close'] = temp['close'].ffill()
    #a bin without trades is flat at the carried-forward close
    for column in ('open', 'high', 'low'):
        temp[column] = temp[column].fillna(temp['close'])
    return temp

def ohlcv_generator(year, resample_freq = 'T', measure = 'high', lookback_memory = 1):
    """
    Yields (features, target) training pairs for one year of trades, one calendar day at a time.
    Windows never span two days: bars are computed and windowed per day.
    :year: year passed to get_raw_data
    :resample_freq: bar length passed to calc_ohlcv
    :measure: OHLCV column used as the target
    :lookback_memory: number of consecutive bars flattened into each feature vector
    :yields: (features, target) where features has 5*lookback_memory values (the OHLCV of
             lookback_memory consecutive bars) and target is `measure` of the bar right after them
    """
    spy = get_raw_data(year)
    #a single daily grouping already walks the year in chronological order,
    #so the former month-level grouping around it was redundant
    for day_name, day_group in spy.groupby(pd.Grouper(freq='D')):
        #the daily grouper can also emit empty groups for calendar days without trades
        #(weekends, holidays); there is nothing to resample for those
        if day_group.empty:
            continue
        #get ohlcv data
        ohlcv_temp = calc_ohlcv(day_group, resample_freq)
        #a sample needs lookback_memory bars plus one following bar for the target
        if len(ohlcv_temp) <= lookback_memory:
            continue
        #take measure as outcome
        Ytrain = ohlcv_temp[measure].values[lookback_memory:]
        #generate training data
        Xtrain = generate_view(ohlcv_temp.values, lookback_memory)
        for window, target in zip(Xtrain, Ytrain):
            yield window, target

In [30]:
# Wrap the 2003 generator as a tf.data pipeline: each element is a
# 5-value float32 feature vector paired with a scalar float32 target.
element_types  = (tf.float32, tf.float32)
element_shapes = (tf.TensorShape([5]), tf.TensorShape([]))
test_generator = tf.data.Dataset.from_generator(
    ohlcv_generator,
    args=[2003],
    output_types=element_types,
    output_shapes=element_shapes,
)

test_generator

<DatasetV1Adapter shapes: ((5,), ()), types: (tf.float32, tf.float32)>

In [31]:
# Peek at the first three batches of five samples each; repeat() makes the stream endless.
first_batches = test_generator.repeat().batch(5).take(3)
for trade_batch in first_batches:
    print(trade_batch)

(<tf.Tensor: id=509, shape=(5, 5), dtype=float32, numpy=
array([[1.06850e+02, 1.06930e+02, 1.06840e+02, 1.06930e+02, 5.36500e+05],
       [1.06910e+02, 1.06950e+02, 1.06750e+02, 1.06920e+02, 1.14700e+05],
       [1.06930e+02, 1.06950e+02, 1.06890e+02, 1.06940e+02, 1.71600e+05],
       [1.06920e+02, 1.07010e+02, 1.06901e+02, 1.06990e+02, 1.56100e+05],
       [1.07000e+02, 1.07020e+02, 1.06910e+02, 1.06990e+02, 3.43300e+05]],
      dtype=float32)>, <tf.Tensor: id=510, shape=(5,), dtype=float32, numpy=array([106.95, 106.95, 107.01, 107.02, 107.03], dtype=float32)>)
(<tf.Tensor: id=511, shape=(5, 5), dtype=float32, numpy=
array([[1.0702e+02, 1.0703e+02, 1.0691e+02, 1.0702e+02, 2.9690e+05],
       [1.0702e+02, 1.0709e+02, 1.0695e+02, 1.0696e+02, 8.9600e+04],
       [1.0696e+02, 1.0701e+02, 1.0695e+02, 1.0696e+02, 2.5050e+05],
       [1.0696e+02, 1.0705e+02, 1.0690e+02, 1.0692e+02, 9.4700e+04],
       [1.0692e+02, 1.0697e+02, 1.0688e+02, 1.0689e+02, 9.2700e+04]],
      dtype=float32)>, <tf.T

In [None]:
EPOCHS = 10
BATCH_SIZE = 16
# stream (features, target) batches straight from the 2003 generator; repeat() makes the stream endless
dataset = tf.data.Dataset.from_generator(
                    generator     = ohlcv_generator, args=[2003], 
                    output_types  = (tf.float32, tf.float32),
                    output_shapes =(tf.TensorShape([5]), tf.TensorShape([]))).repeat().batch(BATCH_SIZE)
# named `iterator` so the builtin iter() is not shadowed for the rest of the notebook
iterator = dataset.make_one_shot_iterator()
x, y = iterator.get_next()
# make a simple model
net = tf.layers.dense(x, 8, activation=tf.tanh) # pass the first value from iterator.get_next() as input
net = tf.layers.dense(net, 8, activation=tf.tanh)
# linear output layer: a tanh here would confine predictions to [-1, 1], far from the price-level targets
prediction = tf.layers.dense(net, 1)
# prediction has shape (batch, 1) while y has shape (batch,): drop the trailing axis so the
# two line up, and pass labels/predictions by keyword so their order cannot be mixed up
loss = tf.losses.mean_squared_error(labels=y, predictions=tf.squeeze(prediction, axis=1))
train_op = tf.train.AdamOptimizer().minimize(loss)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # every run pulls one batch and applies one optimizer step, so EPOCHS counts steps here
    for i in range(EPOCHS):
        _, loss_value = sess.run([train_op, loss])
        print("Iter: {}, Loss: {:.4f}".format(i, loss_value))