In [None]:
import pandas as pd
import numpy as np

# Seed NumPy's global random number generator.
# NOTE(review): only NumPy is seeded in this cell — other sources of randomness used further
# down (e.g. Keras weight initialisation / dropout) are presumably not fixed by this; confirm.
np.random.seed(0)

# Console display limits used when DataFrames are printed
pd.set_option('display.width', 180)
pd.set_option('display.max_columns', 20)

In [None]:
# Load the raw training data. The column pandas labelled 'Unnamed: 1' (presumably a header-less
# row counter in the CSV — confirm) is renamed to 'index' and, together with 'symbol', becomes
# the row MultiIndex.
train = pd.read_csv('training_data.csv').rename(columns={'Unnamed: 1': 'index'})
train = train.set_index(['symbol', 'index'])

# Columns that are differenced into 'v_<col>' signals and standardised per symbol further down.
# 'close_ratio' and 'spread' are not set here; they are written by the feature-calculation cell.
dat_cols = ['open', 'high', 'low', 'close', 'close_ratio', 'spread']

# train = train.xs(symbol, level=0).copy()
# Emulate data read direct from file
# train.index = pd.MultiIndex.from_arrays(
#     [train['symbol'].values, train.index.values], names=('symbol', 'index'))  # to be removed
# train = train.drop('symbol', axis=1)  # to be removed

# seq_len: number of preceding rows gen_sample() packs into one input sequence
# mult:    mult*seq_len rows per symbol are selected in the train/val split cell
seq_len = 5
mult = 3

In [None]:
# calculate useful features
#
# Vectorised over whole columns instead of looping with iterrows() and writing one cell at a
# time through .loc. The zero-denominator cases are handled explicitly: with NumPy floats a
# division by zero yields inf/NaN rather than raising ZeroDivisionError, so a try/except
# fallback cannot be relied on to produce the intended 0.0.
price_range = train['high'] - train['low']
has_range = price_range != 0    # False only where high == low (NaN inputs stay NaN below)
has_close = train['close'] != 0  # False only where close == 0

# Position of the close inside the low-high range; defined as 0.0 when the range is zero
train['close_ratio'] = (
    (train['close'] - train['low']) / price_range.where(has_range)
).where(has_range, 0.0)

# log(ceil(market / close)); defined as 0.0 when close is zero
train['log_shares'] = np.log(
    np.ceil(train['market'] / train['close'].where(has_close))
).where(has_close, 0.)

# High-low range of the row
train['spread'] = price_range

In [None]:
# build diff signals
#
# For every data column, the first difference within each symbol ("velocity"), stored under
# 'v_<col>'. The first row of every symbol has no predecessor and is set to 0.
# groupby().diff() keeps the original (symbol, index) labels, so the series can be joined
# straight back onto `train` by index.
dfs = {}
for col in dat_cols:
    new_col = 'v_' + col
    dfs[new_col] = train.groupby('symbol')[col].diff().fillna(0.).rename(new_col)  # track each diff'd series

In [None]:
# stack diff signals together and join back in to training data
# v_cols: names of the differenced columns ('v_<col>'), in the order they were built
v_cols = list(dfs.keys())
# The diff series are placed side by side and joined onto `train` by row index. The rename only
# has an effect if a column called 'level_1' exists in the joined frame.
df = train.join(pd.concat(dfs.values(), axis=1)).rename(columns={'level_1': 'index'})

from sklearn.preprocessing import StandardScaler

In [None]:
# Scale to control for magnitude
#
# Every data / diff column is standardised separately for each symbol. The fitted scaler is
# kept in scale_track[symbol][col] so values can be un-scaled later.
# NOTE(review): the scalers are fitted on all rows of `df`, i.e. before any train/val split is
# made further down — confirm that this is intended.
scale_track = {}
scaled_groups = []  # collected and concatenated once (no repeated DataFrame.append)
for symbol, grp in df.groupby('symbol'):
    grp_ = grp.copy()  # don't alter original dataframe (grp is a slice/view)
    scale_track[symbol] = {}
    for col in dat_cols + v_cols:
        s = StandardScaler()
        # the scaler expects 2-D input (n_samples, 1); flatten the result back into a column
        grp_[col] = s.fit_transform(grp[col].values.reshape(-1, 1)).ravel()

        scale_track[symbol][col] = s  # track scaling object

    scaled_groups.append(grp_)

df_ = pd.concat(scaled_groups) if scaled_groups else pd.DataFrame()

df_.head()

In [None]:
# Replace +inf / -inf with 0.0 (presumably left over from divisions by zero upstream — confirm).
# NaN values are not touched by this call.
train_ = df_.replace([np.inf, -np.inf], 0.0)

In [None]:
# Column the model is trained to predict: the differenced 'open' series
target_col = 'v_open'
# Model input columns: the data columns, their diff signals, and 'log_shares'
feat_cols = dat_cols + v_cols + ['log_shares']

# Retrieve a single sample from the data
def gen_sample(df, idx, feat_cols, target_col, window=None):
    """Build one (input sequence, target) pair for the row labelled ``idx``.

    The input sequence holds the ``window`` rows that precede ``idx`` within the same symbol,
    oldest first. When fewer than ``window`` rows precede it, zero rows are prepended so the
    result always has ``window`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Either indexed by a MultiIndex whose first level is the symbol, or carrying a
        'symbol' column alongside a plain index.
    idx : label
        Row label of the sample to predict; a ``(symbol, index)`` tuple for a MultiIndex.
    feat_cols : list
        Columns that make up the input features.
    target_col : label
        Column holding the value to predict.
    window : int, optional
        Number of preceding rows in the sequence. Defaults to the notebook-level ``seq_len``.

    Returns
    -------
    tuple
        ``(features, target)``: ``features`` is an array of shape ``(window, len(feat_cols))``
        and ``target`` is ``df.loc[idx][target_col]``.
    """
    # Only fall back to the notebook-level setting when the caller did not choose a length
    window = seq_len if window is None else window

    # select out sample to be predicted
    target_sample = df.loc[idx]

    # During dev, used data from a single symbol (BTC). In production, samples will be collected
    # across symbols, so dataset will be multi-indexed on symbol and 1-N index. Need separate logic
    # to handle those two cases
    if isinstance(df.index, pd.MultiIndex):  # cross section on multiindex
        symbol_df = df.xs(target_sample.name[0], level=0)
        idx = idx[1]  # don't need to preserve symbol value beyond cross section
    else:  # standard slice on index
        symbol_df = df.loc[df['symbol'].eq(target_sample['symbol']), :]

    # Work with row positions rather than label arithmetic (idx - 1, idx - window): the window
    # is then exactly the preceding rows even if the index labels are not consecutive integers.
    pos = symbol_df.index.get_loc(idx)
    history = symbol_df.iloc[max(pos - window, 0):pos][feat_cols].values

    # construct input feature sequence
    n_fill = window - len(history)
    if n_fill > 0:  # requires filler samples
        # Not enough history before this row: pad the front with zero rows. Missing values in
        # the real rows of a padded sample are zero-filled too, as the padded path always did.
        history = np.where(pd.isnull(history), 0., history)
        history = np.vstack([np.zeros((n_fill, len(feat_cols))), history])

    return history, target_sample[target_col]


# Generator to create input data samples
def gen_input(df, batch_size, shuffle=True):
    """Endlessly yield ``(x, y)`` batches built with ``gen_sample``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to draw samples from (see ``gen_sample`` for the expected layout).
    batch_size : int
        Number of samples per batch.
    shuffle : bool, default True
        If True, every batch is a fresh random draw of ``batch_size`` rows. If False, rows are
        walked in order starting at position 1; the last batch of a pass may be shorter than
        ``batch_size``, after which the walk restarts at position 1.

    Yields
    ------
    tuple of numpy.ndarray
        ``(x, y)``: the stacked input sequences and their targets.
    """
    n_rows = df.shape[0]
    i = 1  # no point in trying to estimate initial sample (i=0)
    while True:
        if shuffle:
            # select random index value, will attempt to predict target at this point
            pred_idx = df.sample(n=batch_size).index
        else:
            # Wrap around only once every row has been emitted. Resetting as soon as a full
            # batch no longer fits would skip the tail of the frame and repeat the first batch
            # in its place.
            if i >= n_rows:
                i = 1
            pred_idx = df.iloc[i:i + batch_size].index
            i += batch_size  # update starting point for next non-random generator sampling

        x_out, y_out = [], []
        for idx in pred_idx:
            x_sample, y_sample = gen_sample(df, idx, feat_cols, target_col)

            x_out.append(x_sample)
            y_out.append(y_sample)

        yield np.array(x_out), np.array(y_out)

In [None]:
# Model architecture
from keras.layers import GRU, Bidirectional, Dense, Input, BatchNormalization
from keras.models import Model
from keras.optimizers import Adam

# train_scale: multiplier applied to the number of training steps per epoch (see training cell)
# batch_size:  number of samples per generated batch
train_scale = 10
batch_size = 32

# Construct model architecture
# Input: one sequence of seq_len time steps with len(feat_cols) features per step
X_in = Input((seq_len, len(feat_cols)))
# Bidirectional GRU; return_sequences=False, so only the final output is passed on
X = Bidirectional(
    GRU(
        units=50,
        activation='tanh',
        dropout=0.1,
        recurrent_dropout=0.1,
        return_sequences=False
    )
)(X_in)
X = BatchNormalization()(X)
# Two dense layers followed by a single linear output unit (the regression target)
X = Dense(20, activation='tanh')(X)
X = Dense(10, activation='relu')(X)
X = Dense(1)(X)

model = Model(inputs=X_in, outputs=X)

In [None]:
# Add optimizer
# Adam with an explicit learning rate; decay=0.00 leaves the learning rate un-decayed
opt = Adam(lr=0.0008, beta_1=0.9, beta_2=0.999, decay=0.00)
# Loss is mean squared error; mean absolute percentage error is reported as an extra metric.
# NOTE(review): the target column was standardised per symbol, so its values sit around zero;
# a percentage error on such a target may be unstable — confirm the metric is meaningful.
model.compile(optimizer=opt, loss='mean_squared_error', metrics=['mean_absolute_percentage_error'])
model.summary()

In [None]:
# train/val split
# Validation set: the last mult*seq_len rows of every symbol
val = train_.reset_index().drop('index', axis=1).groupby('symbol').apply(
    pd.DataFrame.tail, mult*seq_len).drop('symbol', axis=1)
val = val.reset_index().reset_index().drop('level_1', axis=1).set_index(['symbol', 'index'])  # reset val index 0-N

# Training set: every row of a symbol EXCEPT the held-out tail (head(-n) drops the last n rows).
# Taking head(mult*seq_len) instead would keep only the first few rows per symbol, discard
# everything in between, and overlap with `val` for symbols shorter than 2*mult*seq_len rows.
train = train_.reset_index().drop('index', axis=1).groupby('symbol').apply(
    pd.DataFrame.head, -mult*seq_len).drop('symbol', axis=1)

In [None]:
# ## TRAINING ##

# x_train, y_train = [], []
# # for idx in train.index[10:]:
# for _ in range(train_scale * len(train.index)//batch_size):
#     for idx in train.index[1:]:
#         x, y = gen_sample(train, idx, feat_cols, target_col)
#         x_train.append(x)
#         y_train.append(y)
# x_train, y_train = np.array(x_train), np.array(y_train)
#
# x_val, y_val = [], []
# for _ in range(val_steps * batch_size):
#     for idx in val.index[1:]:
#         x, y = gen_sample(val, idx, feat_cols, target_col)
#         x_val.append(x)
#         y_val.append(y)
# x_val, y_val = np.array(x_val), np.array(y_val)
#
# history = model.fit(x=x_train, y=y_train, batch_size=batch_size, epochs=10, validation_data=[x_val, y_val])

In [None]:
# Batches per epoch. Training draws train_scale times as many samples as `train` has rows;
# validation uses at least one batch.
# NOTE(review): val_steps uses floor division, so when val.shape[0] is not a multiple of
# batch_size the validation batches of one epoch may not cover every validation row — confirm.
train_steps = train_scale*train.shape[0]//batch_size
val_steps = val.shape[0]//batch_size or 1

# NOTE: model.fit_generator() failed and caused SIGABRT. To fix, ran
# conda install nomkl
# Training batches are random draws; validation batches walk the frame in order.
train_gen = gen_input(df=train, batch_size=batch_size, shuffle=True)
val_gen = gen_input(df=val, batch_size=batch_size, shuffle=False)

history = model.fit_generator(
    generator=train_gen,
    steps_per_epoch=train_steps,
    epochs=10,
    validation_data=val_gen,
    validation_steps=val_steps,
    verbose=1
)

In [None]:
# import os
# import pickle
#
# Export model
# out_file = os.path.join('model_out', 'model.h5')
# print('\nWriting trained model to file {}'.format(out_file))
# model.save(out_file)
#
# # Export scaler object
# scale_track.update({'dat_cols': dat_cols})
# with open(os.path.join('model_out', 'scaler.pkl'), 'wb') as out_file:
#     pickle.dump(scale_track, out_file)


In [None]:
# ## VALIDATION SET ##
# def transform_column(df, col, scale_track, groupby='symbol', reverse=False):
#     track = []
#     for label, grp in df.reset_index().groupby(groupby):
#         scale_obj = scale_track[label][col]  # retrieve scaler object (sklearn StandardScaler)
#         vals = grp[col].values.reshape(-1, 1)  # retrieve values to be transformed + reshape
#
#         # apply transform
#         if reverse:
#             vals_ = scale_obj.inverse_transform(vals)  # vals are (assumed to be) scaled
#         else:
#             vals_ = scale_obj.transform(vals)  # vals are unscaled
#
#         # track symbol AND index
#         mi = pd.MultiIndex.from_arrays([[label]*grp.shape[0], grp.index], names=[groupby, 'index'])
#         track.append(pd.Series(np.squeeze(vals_), index=mi, name=col))  # track within-symbol (groupby col)
#
#     transformed = pd.concat(track, axis=0)  # concatenate across symbols (groupby col)
#
#     return transformed


# Recover position from velocity and an initial starting position
def recover_position(v, p0):
    """Integrate a velocity series into positions.

    Uses a running sum with unit time step: p(t) = p0 + sum of v up to and including t.

    Parameters
    ----------
    v : array-like
        Velocity (per-step change) values.
    p0 : scalar
        Starting position added to every running total.

    Returns
    -------
    numpy.ndarray
        Position at every step, same length as ``v``.
    """
    displacement = np.cumsum(v)  # running total of the per-step changes (dt = 1)
    return p0 + displacement


def transform_columns(df, cols, scale_track, groupby='symbol'):
    """Un-scale the given columns group by group.

    For every group of ``df`` and every column in ``cols``, the scaler stored in
    ``scale_track[group][col]`` is looked up and its ``inverse_transform`` is applied to the
    column's values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the scaled values; it is not modified.
    cols : list
        Columns to un-scale.
    scale_track : dict
        ``{group label: {column: fitted scaler}}``; each scaler must offer
        ``inverse_transform`` taking an ``(n, 1)`` array.
    groupby : label, default 'symbol'
        Column or index level that identifies the group.

    Returns
    -------
    pandas.DataFrame
        Copies of the groups with ``cols`` un-scaled, concatenated in group order; an empty
        DataFrame when ``df`` has no groups.
    """
    restored = []  # gathered and concatenated once (no repeated DataFrame.append)
    for symbol, grp in df.groupby(groupby):
        grp_ = grp.copy()  # don't alter original dataframe (grp is a slice/view)
        for col in cols:
            s = scale_track[symbol][col]
            # the scaler works on 2-D input (n_samples, 1); flatten the result back to a column
            grp_[col] = np.ravel(s.inverse_transform(grp_[col].values.reshape(-1, 1)))

        restored.append(grp_)

    return pd.concat(restored) if restored else pd.DataFrame()

In [None]:
# Test on validation data to establish prediction logic
# Work on a copy so the predictions written into test_df below leave `val` untouched
test_df = val.copy()

print(test_df.head())

# Recover validation set values for comparison
# un-scale actual open price position values
# (the index is reset first; it is restored with set_index in the next cell)
val_df = transform_columns(val.reset_index(), cols=['open'], scale_track=scale_track, groupby='symbol')

val_df.head()

In [None]:
# Restore the (symbol, index) MultiIndex so val_df lines up with test_df
val_df = val_df.set_index(['symbol', 'index'])

# Number of batches requested from the generator (rounded up so every row can be reached)
test_steps = test_df.shape[0] // batch_size + 1
test_gen = gen_input(df=test_df, batch_size=batch_size, shuffle=False)
test_preds = model.predict_generator(test_gen, steps=test_steps)
# np.insert without an axis works on the flattened array, so test_preds becomes 1-D with a
# leading 0.0 standing in for the row at position 0
test_preds = np.insert(test_preds, [0], [0.0])  # generator doesn't produce prediction for index 0

# Sanity check: compare the number of rows with the number of predictions
print(len(test_df))
print(len(test_preds))

In [None]:
# NOTE: test_preds can hold more entries than test_df has rows, because test_steps is rounded
#       up and the generator starts over once it runs out of rows. Only the first
#       val_df.shape[0] predictions are kept; anything after that is dropped.
test_df.loc[:, target_col] = test_preds[:val_df.shape[0]]

# un-scale predicted open price velocity values
test_df = transform_columns(test_df, cols=[target_col], scale_track=scale_track, groupby='symbol')

# recover open price position values by integrating the un-scaled velocities; NOTE: requires
# the initial starting point (first un-scaled open price of the symbol in val_df), and must
# consider each symbol separately
for symbol in set(test_df.index.get_level_values('symbol')):
    test_df.loc[symbol, 'pred_open'] = recover_position(
        test_df.loc[symbol, target_col].values, val_df.loc[symbol, 'open'].values[0])

# Actual (un-scaled) open price next to the prediction, for comparison
test_df.loc[:, 'val_open'] = val_df.loc[:, 'open']

test_df.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot the predicted vs actual of a given symbol
def gen_plot(pred, actual, symbol):
    """Plot predicted against actual values for one symbol and return their mean absolute distance.

    Parameters
    ----------
    pred : pandas.Series
        Predicted values; ``pred.xs(symbol)`` must select the symbol's values.
    actual : pandas.Series
        Actual values, indexed like ``pred``.
    symbol : label
        Symbol to plot.

    Returns
    -------
    float
        Mean absolute distance between prediction and actual, rounded to 2 decimals.
    """
    pred_ = pred.xs(symbol)
    actual_ = actual.xs(symbol)

    # calculate mean absolute distance (MAD)
    mad = np.round(np.mean(np.abs(pred_ - actual_)), 2)

    # create x-axis
    t = range(len(pred_))

    fig, ax = plt.subplots(1, 1, figsize=(8, 6))
    # x / y are passed by keyword and both lines are drawn on the same explicit axes, so the
    # plot does not depend on positional arguments or on matplotlib's "current axes"
    sns.lineplot(x=t, y=actual_, color='black', label='actual', ax=ax)
    sns.lineplot(x=t, y=pred_, color='blue', label='pred', ax=ax)
    _ = ax.set_title('Predicted vs Actual {} Market Open (MAD={})'.format(symbol, mad))

    return mad

# Compare the recovered open price prediction with the actual open price for BTC
mad = gen_plot(pred=test_df.loc[:, 'pred_open'], actual=test_df.loc[:, 'val_open'], symbol='BTC')