In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

In [105]:
def strided_window(a, window_size, step_size):
    """
    Build a read-only 2-D array of sliding windows over `a` without copying.

    :a: numpy array; only its first-axis stride is used, so this is intended
        for 1-D input
    :window_size: number of elements in each window (columns of the result)
    :step_size: number of elements between the starts of consecutive windows
    :returns: array of shape (nrows, window_size) where
              nrows = (a.size - window_size)//step_size + 1, or an empty
              (0, window_size) array when `a` is too short for a single window
    """
    #https://stackoverflow.com/questions/40084931/taking-subarrays-from-numpy-array-with-given-stride-stepsize/40085052#40085052
    nrows = ((a.size-window_size)//step_size)+1
    if nrows <= 0:
        # A window longer than the input would give as_strided a negative
        # dimension (ValueError); report "no windows" as an empty result instead.
        empty = np.empty((0, window_size), dtype=a.dtype)
        empty.flags.writeable = False
        return empty
    n = a.strides[0]
    # writeable=False because the windows overlap the same memory
    return np.lib.stride_tricks.as_strided(a, shape=(nrows,window_size), strides=(step_size*n,n), writeable=False)

def generate_view(ohlcv_data, lookback):
    """
    Build the lookback feature matrix for a block of OHLCV rows.

    Each output row is `lookback` consecutive rows of `ohlcv_data` flattened
    side by side, advancing one input row at a time; the final window is
    dropped, giving shape (nrows-lookback, ncols*lookback).

    :ohlcv_data: a contiguous numpy array with shape (n,m)
    :lookback: an integer specifying how many periods to include in each row
    :returns: whatever strided_window returns, minus its last row (a view onto
              the flattened data when `ohlcv_data` is C-contiguous)
    """
    width = ohlcv_data.shape[1]
    # row-major flattening, so one input row occupies `width` consecutive slots
    flat_prices = ohlcv_data.ravel()
    windows = strided_window(flat_prices, window_size=width*lookback, step_size=width)
    # drop the last window (presumably because no later row exists to act as its target)
    return windows[:-1]


def get_raw_data(year):
    """
    Load one year of SPY records from './Polygon/Raw/SPY_<year>.csv'.

    Only columns 't', 'p' and 's' are read; 't' is parsed as int64 nanoseconds
    since the epoch and becomes the index ('p'/'s' are presumably price and
    size -- confirm against the data source).

    :year: value substituted into the file name
    :returns: DataFrame indexed by US/Eastern timestamps, restricted to rows
              between 09:30:00 and 16:00:00
    """
    csv_path = './Polygon/Raw/SPY_{}.csv'.format(year)
    trades = pd.read_csv(
        csv_path,
        engine='c',
        index_col=['t'],
        usecols=['t','p','s'],
        dtype={'t':np.int64, 'p':np.float64, 's':np.float64},
    )
    # epoch nanoseconds -> naive timestamps -> UTC-aware
    utc_stamps = pd.to_datetime(trades.index, unit='ns').tz_localize('UTC')
    # converting to US/Eastern takes care of daylight savings
    trades.index = utc_stamps.tz_convert('US/Eastern')
    # keep trading hours only
    return trades.between_time('09:30:00', '16:00:00')

def calc_ohlcv(df, period):
    """
    Resample tick data into open/high/low/close/volume bars.

    :df: DataFrame with a DatetimeIndex and columns 'p' (used for OHLC) and
         's' (summed into volume)
    :period: pandas resample frequency string
    :returns: new DataFrame with columns open, high, low, close, volume. Bars
              with no ticks take the previous bar's close for all four price
              columns; their volume is the sum over no rows.
    """
    #period = microseconds(U) | milliseconds(L) | seconds(S) | minutes(T) | hour(H)
    #NOTE(review): recent pandas releases prefer the lowercase aliases (us, ms, s, min, h)
    bars = df.resample(period)
    temp = bars['p'].ohlc()
    temp['volume'] = bars['s'].sum()
    # Assign the filled columns back explicitly: in-place fillna on a selected
    # column (temp['close'].fillna(..., inplace=True)) acts on a temporary
    # under pandas Copy-on-Write and would leave the gaps unfilled.
    close = temp['close'].ffill()
    temp['close'] = close
    for column in ('open', 'high', 'low'):
        temp[column] = temp[column].fillna(close)
    return temp

def ohlcv_generator(year, resample_freq = 'T', measure = 'high', lookback_memory = 1):
    """
    Yield (features, target) training pairs for one year, one trading day at a time.

    For every (month, day) group of the raw data the ticks are resampled into
    OHLCV bars; each sample's features are `lookback_memory` consecutive bars
    flattened into one vector and its target is `measure` of the bar that
    immediately follows them.

    :year: forwarded to get_raw_data
    :resample_freq: pandas resample frequency for the bars (str or UTF-8 bytes)
    :measure: name of the OHLCV column used as the target (str or UTF-8 bytes)
    :lookback_memory: number of past bars in each feature vector
    :yields: tuple (1-D feature array, scalar target)
    """
    # The notebook feeds this generator through tf.data.Dataset.from_generator,
    # which delivers string arguments as bytes. Decode each one independently
    # so a str in one position cannot skip decoding of the other, and so that
    # unrelated errors are no longer swallowed by a bare except.
    if isinstance(resample_freq, bytes):
        resample_freq = resample_freq.decode("utf-8")
    if isinstance(measure, bytes):
        measure = measure.decode("utf-8")
    spy = get_raw_data(year)
    for _, day_group in spy.groupby([spy.index.month, spy.index.day]):
        #get ohlcv data
        ohlcv_temp = calc_ohlcv(day_group, resample_freq)
        # a day needs more bars than the lookback to produce even one sample
        if len(ohlcv_temp) <= lookback_memory:
            continue
        #take measure as outcome
        Ytrain = ohlcv_temp[measure].values[lookback_memory:]
        #generate training data
        Xtrain = generate_view(ohlcv_temp.values, lookback_memory)
        # Xtrain and Ytrain both have len(ohlcv_temp) - lookback_memory entries
        yield from zip(Xtrain, Ytrain)

def market_price_generator(year, resample_freq = 'U'):
    """
    Yield forward-filled price arrays, one per (month, day, hour, minute, second) group.

    Within each group the 'p' column is resampled at `resample_freq` and
    forward-filled; the resulting values array is yielded.

    :year: forwarded to get_raw_data
    :resample_freq: pandas resample frequency string
    :yields: array of resampled 'p' values for one group
    """
    #FIX THIS
    trades = get_raw_data(year)
    stamps = trades.index
    # grouping includes the second component, so each group is one clock second
    group_keys = [stamps.month, stamps.day, stamps.hour, stamps.minute, stamps.second]
    for _, ticks in trades.groupby(group_keys):
        yield ticks['p'].resample(resample_freq).ffill().values

In [106]:
#number of values in the first array yielded for 2003
#should be 1billion
next(market_price_generator(2003)).shape[0]

500001

In [97]:
#count the (features, target) pairs yielded for 2003 with minute bars and lookback 5
i = sum(1 for _ in ohlcv_generator(2003,'T','close',5))
i

8347

In [99]:
#arguments forwarded to ohlcv_generator: year, resample frequency, target column, lookback
YEAR, FREQ, MSR, MEM = 2003, 'T', 'close', 2

#each feature vector holds MEM bars of 5 values (open, high, low, close, volume); the target is a scalar
price_generator = tf.data.Dataset.from_generator(
    ohlcv_generator,
    output_types=(tf.float32, tf.float32),
    output_shapes=(tf.TensorShape([5*MEM]), tf.TensorShape([])),
    args=[YEAR,FREQ,MSR,MEM],
)
#cycle through the year indefinitely, in batches of 8, prefetching 8 batches
price_generator = price_generator.repeat()
price_generator = price_generator.batch(8)
price_generator = price_generator.prefetch(8)

price_generator

<DatasetV1Adapter shapes: ((None, 10), (None,)), types: (tf.float32, tf.float32)>

In [100]:
#fully connected regressor over the flattened lookback window (5 values per bar, MEM bars)
#NOTE(review): softmax on a hidden layer and relu on the output are unusual for regression -- confirm intended
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(32, input_shape=(5*MEM,)))
model.add(tf.keras.layers.Dense(64, activation=tf.nn.tanh))
model.add(tf.keras.layers.Dense(12, activation=tf.nn.softmax))
model.add(tf.keras.layers.Dense(32, activation=tf.nn.tanh))
model.add(tf.keras.layers.Dense(1, activation=tf.nn.relu))

model.compile(loss='mean_absolute_error', optimizer='adam')

#NOTE(review): 8347 equals the sample count printed by the counting cell (lookback 5, unbatched),
#while price_generator is built with MEM = 2 and batch(8) -- confirm this step count is intended
model.fit(price_generator, steps_per_epoch=8347, epochs=5)

Train for 8347 steps
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fe2ac3ee990>

In [None]:
#only works on 1D data (predict high based off prev highs only)
#NOTE(review): mydataset, EPOCHS and EVALUATION_INTERVAL are not defined in the cells shown here -- confirm where they come from
simple_lstm_model = tf.keras.models.Sequential()
simple_lstm_model.add(tf.keras.layers.LSTM(8, input_shape=(1,5)))
simple_lstm_model.add(tf.keras.layers.Dense(1))

simple_lstm_model.compile(loss='mae', optimizer='adam')

simple_lstm_model.fit(mydataset, steps_per_epoch=EVALUATION_INTERVAL, epochs=EPOCHS)