In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import plotly_express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from datetime import datetime, timedelta

import tensorflow as tf
# for device in tensorflow.config.experimental.list_physical_devices('GPU'):
#     tensorflow.config.experimental.set_memory_growth(device, True)
from keras.utils import np_utils
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Reshape, Conv2D, LSTM, Dense, MaxPooling2D, LeakyReLU, concatenate, Dropout
from tensorflow.keras.optimizers import Adam

import os

import LOBData
from func_tools import normalize, get_labels, cnn_data_reshaping, reshape_lob_levels, plot_labels, label_insights, get_pnl

In [None]:
import time
import multiprocessing

In [None]:
# Cap TensorFlow's memory usage on the first physical GPU (if any) by exposing it
# as a single virtual device with a fixed memory limit.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  # Restrict tf to a memory limit of 6024 on the first GPU
  # (the TensorFlow docs express memory_limit in MB, i.e. roughly 6 GB)
  try:
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=6024)])
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Virtual devices must be set before GPUs have been initialized;
    # the error is only printed so the notebook keeps running with default settings
    print(e)
# Last expression of the cell: shows the GPU device name reported by TensorFlow
tf.test.gpu_device_name()

## Inputs and data import

In [None]:
# preprocessing inputs
security = 'USDT_BTC'  # currency pair; also the sub-folder name inside root_caching_folder
raw_data_path = f'S3_data' # where json data is stored
root_caching_folder = "Processed_Data"  # root folder of the cached csv files read/written below
frequency = timedelta(seconds=10)  # presumably the snapshot frequency of the 10s cache file - TODO confirm
norm_type = 'dyn_z_score'  # normalization label; embedded in the names of the csv files saved later

# labelling inputs (k_plus, k_minus and alpha are forwarded to get_labels)
k_plus = 30  # previously 60
k_minus = 30  # previously 60
alpha = 0.001  # previously 0.0005
roll = 7200 * 6 # step from minute to 10 second data; forwarded to normalize
# pull data from S3 (currently disabled)
#download_s3_data('limit-order-books-data-po-limitorderbooksnapshots-v25ungbmmak9', pair)

# Data import - needs to be adjusted importing from several files using Dask
data = pd.read_csv(f'{root_caching_folder}/{security}/data-cache-10s.csv', index_col=0)
# number of levels of order book: highest 'Level' value + 1 (assumes levels are numbered from 0 - TODO confirm)
lob_depth = data['Level'].max() + 1

## Data preparation

In [None]:
# Train / test split (70% / 30%), done on whole timestamps so that every
# order-book level of a given snapshot ends up in the same set.
# NOTE(review): the split point assumes each timestamp carries exactly
# `lob_depth` rows and that rows are stored in time order - confirm.
unique_timestamps = data['Datetime'].unique()
train_test_split = int((data.shape[0] / lob_depth) * 0.7)  # slice reference for train and test
train_timestamps = unique_timestamps[:train_test_split]
test_timestamps = unique_timestamps[train_test_split:]

# Row masks selecting the snapshots that belong to each set
is_train_row = data['Datetime'].isin(train_timestamps)
is_test_row = data['Datetime'].isin(test_timestamps)

# Index both sets by (Datetime, Level)
train_cached_data = data[is_train_row].set_index(['Datetime', 'Level'])
test_cached_data = data[is_test_row].set_index(['Datetime', 'Level'])

print(f'Train dataset shape: {train_cached_data.shape} - Test dataset shape: {test_cached_data.shape}')

In [None]:
# Parallelized data size & price standardization for train and test set
number_of_workers = 4

# One task per (data set, column group). starmap unpacks each tuple into
# normalize(frame, lob_depth, norm_type, roll). `norm_type` is used instead of a
# hard-coded literal so the normalization applied always matches the
# normalization name written into the output file names below.
inputs = (
    (train_cached_data[['Ask_Price', 'Bid_Price']], lob_depth, norm_type, roll),
    (train_cached_data[['Ask_Size', 'Bid_Size']], lob_depth, norm_type, roll),
    (test_cached_data[['Ask_Price', 'Bid_Price']], lob_depth, norm_type, roll),
    (test_cached_data[['Ask_Size', 'Bid_Size']], lob_depth, norm_type, roll),
)

start_time = time.time()

with multiprocessing.Pool(number_of_workers) as p:
    # starmap blocks until all tasks are done and returns a list in input order
    res = p.starmap(normalize, inputs)
    # shut the workers down cleanly before the context manager terminates the pool
    p.close()
    p.join()

train_dyn_prices, train_dyn_volumes, test_dyn_prices, test_dyn_volumes = res
print("--- %s seconds ---" % (time.time() - start_time))

# concat prices and volumes back together (side by side, aligned on the row index)
# and save the top level of the book to csv (useful for Dash)
train_dyn_df = pd.concat([train_dyn_prices, train_dyn_volumes], axis=1).reset_index()
train_dyn_df[train_dyn_df['Level']==0].to_csv(f'{root_caching_folder}/{security}/TRAIN-{lob_depth}-{norm_type}-{roll}.csv') # save top level to csv

test_dyn_df = pd.concat([test_dyn_prices, test_dyn_volumes], axis=1).reset_index()
test_dyn_df[test_dyn_df['Level']==0].to_csv(f'{root_caching_folder}/{security}/TEST-{lob_depth}-{norm_type}-{roll}.csv') # save top level to csv

display(train_dyn_df.describe()) # check train data overview
display(test_dyn_df.describe()) # check test data overview

In [None]:
# The same four steps are applied to the train and to the test set:
# 1 reshape to a format suitable for training
# 2 get mid px from normalized data
# 3 get labels from norm mid prices
# 4 labels one hot encoding

def _reshape_and_label(dyn_df):
    """Run steps 1-4 on one normalized data set.

    Returns a tuple ``(depth, dt_index, mid_px, labels, encoded_labels)``.
    """
    depth, dt_index = reshape_lob_levels(dyn_df, output_type='array')  # 1
    # 2 - average of columns 0 and 2 (presumably the top-of-book ask and bid
    # prices - TODO confirm against reshape_lob_levels)
    mid_px = pd.Series((depth[:, 2] + depth[:, 0]) / 2)
    labels = get_labels(mid_px, k_plus, k_minus, alpha, long_only=False)  # 3
    encoded_labels = np_utils.to_categorical(labels.values, 3)  # 4 (three classes)
    return depth, dt_index, mid_px, labels, encoded_labels

# train
(train_depth_dyn,
 train_dt_index_dyn,
 mid_px_train_dyn,
 labels_dyn_train,
 encoded_train_labels) = _reshape_and_label(train_dyn_df)

# test
(test_depth_dyn,
 test_dt_index_dyn,
 mid_px_test_dyn,
 labels_dyn_test,
 encoded_test_labels) = _reshape_and_label(test_dyn_df)

In [None]:
# Information about the newly generated labels
print('Train Labels')
train_transact_dyn = label_insights(labels_dyn_train)
print('\nTest Labels')
test_transact_dyn = label_insights(labels_dyn_test)
# The ratio below is test / (test + train), i.e. the share of the *test* set in
# the total, so the caption says "Test" (it previously said "Train").
# NOTE(review): assumes label_insights returns a numeric count - confirm.
print(f'\nLabels Test as pctg of total: {test_transact_dyn/(test_transact_dyn+train_transact_dyn)}')

In [None]:
# Labels sanity check
def plot_data(norm_mid_px, labels, start, end, train_test_switch, y0=0):
    """Plot a slice of the normalized mid price with its labels as background shapes.

    Args:
        norm_mid_px: pandas Series of normalized mid prices; its ``.values`` and
            ``.index`` are sliced with ``[start:end]``.
        labels: label sequence aligned with ``norm_mid_px``; ``labels[start:end]``
            is handed to ``plot_labels``.
        start: first position of the slice to plot.
        end: position one past the last element to plot.
        train_test_switch: free-text tag (e.g. ``'train'``, ``'test'``) shown in
            the figure title and in the legend entry of the price trace.
        y0: forwarded unchanged to ``plot_labels``.

    Returns:
        The plotly figure (displayed by the notebook when it is the last
        expression of a cell).
    """
    fig = make_subplots(rows=1, cols=1, specs=[[{"secondary_y": True}]])
    fig.update_layout(title=f'<b>Visual check: {train_test_switch}</b>', title_x=0.5)

    # The legend entry follows the data set being plotted instead of always
    # reading "train" regardless of what is shown.
    fig.add_trace(go.Scatter(
        y=norm_mid_px.values[start:end],
        x=norm_mid_px.index[start:end],
        name=f'mid_px_dyn_{train_test_switch}'))

    # func_tools helper; its result is used as the layout `shapes` that colour
    # the background according to the labels
    background_color = plot_labels(labels[start:end], y0)

    fig.update_layout(width=1200,
        height=600,
        shapes=background_color,
        xaxis2={'anchor': 'x', 'overlaying': 'x', 'side': 'top'},
        yaxis_domain=[0, 1])

    return fig

In [None]:
# plot train data: first 30000 normalized mid prices with their labels as background (visual sanity check)
plot_data(mid_px_train_dyn, labels_dyn_train, 0, 30000, train_test_switch='train',y0=0)

In [None]:
# plot test data: first 10000 normalized mid prices with their labels as background (visual sanity check)
plot_data(mid_px_test_dyn, labels_dyn_test, 0, 10000, train_test_switch='test',y0=0)

## Training

In [None]:
# Modified deeplob
def create_light_deeplob(T, NF, number_of_lstm):
    """Build and compile a lightweight, convolution-only DeepLOB-style classifier.

    The network stacks two convolutional blocks and an inception module, flattens
    the result, applies dropout and ends with a 3-way softmax. It is compiled with
    Adam and categorical cross-entropy, tracking accuracy.

    Args:
        T: number of time steps in each input window.
        NF: number of features per time step. The flattening step assumes the
            convolutions collapse the feature axis to size 1, which is the case
            for ``NF=40`` (40 -> 20 -> 10 -> 1).
        number_of_lstm: not used by this variant (there is no LSTM layer); kept
            so existing calls keep working.

    Returns:
        A compiled Keras ``Model`` taking inputs of shape ``(T, NF, 1)`` and
        returning 3 class probabilities.
    """

    def conv_lrelu(tensor, filters, kernel_size, **conv_kwargs):
        """Apply a Conv2D layer followed by LeakyReLU(alpha=0.01)."""
        tensor = Conv2D(filters, kernel_size, **conv_kwargs)(tensor)
        return LeakyReLU(alpha=0.01)(tensor)

    input_lmd = Input(shape=(T, NF, 1))

    # first convolutional block: stride-2 convolution along the feature axis,
    # then a convolution along the time axis
    conv_first1 = conv_lrelu(input_lmd, 16, (1, 2), strides=(1, 2))
    conv_first1 = conv_lrelu(conv_first1, 16, (4, 1), padding='same')

    # second convolutional block: another stride-2 feature convolution followed
    # by a width-10 convolution over what is left of the feature axis
    conv_first1 = conv_lrelu(conv_first1, 32, (1, 2), strides=(1, 2))
    conv_first1 = conv_lrelu(conv_first1, 64, (1, 10))

    # inception module: three parallel branches over the time axis
    convsecond_1 = conv_lrelu(conv_first1, 64, (1, 1), padding='same')
    convsecond_1 = conv_lrelu(convsecond_1, 64, (3, 1), padding='same')

    convsecond_2 = conv_lrelu(conv_first1, 64, (1, 1), padding='same')
    convsecond_2 = conv_lrelu(convsecond_2, 64, (5, 1), padding='same')

    convsecond_3 = MaxPooling2D((3, 1), strides=(1, 1), padding='same')(conv_first1)
    convsecond_3 = conv_lrelu(convsecond_3, 64, (1, 1), padding='same')

    # stack the branches along the channel axis
    convsecond_output = concatenate([convsecond_1, convsecond_2, convsecond_3], axis=3)
    print(convsecond_output.shape)

    # flatten (time, channels) into one vector; only valid when axis 2 has size 1
    flat_size = int(convsecond_output.shape[1]) * int(convsecond_output.shape[3])
    conv_reshape = Reshape((flat_size,))(convsecond_output)

    # NOTE(review): an earlier comment mentioned MC dropout here, but this layer
    # is called without `training=True`, so it is plain dropout.
    conv_reshape = Dropout(rate=0.2)(conv_reshape)
    out = Dense(3, activation='softmax')(conv_reshape)
    model = Model(inputs=input_lmd, outputs=out)

    # `learning_rate` is the supported argument name; `lr` is a deprecated alias
    # that newer Keras releases no longer accept.
    adam = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07)
    model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])

    return model

# Build the network for windows of 100 time steps x 40 features and print its layer summary.
# The third argument (64) is accepted by create_light_deeplob but not used in its body.
ligh_deeplob = create_light_deeplob(100, 40, 64)
ligh_deeplob.summary()

In [None]:
# Prepare path to store tensorboard logs
root_logdir = os.path.join(os.curdir, "my_logs")


def get_run_logdir():
    """Return a timestamped run directory (``run_YYYY_MM_DD-HH_MM_SS``) under ``root_logdir``."""
    timestamp = time.strftime("%Y_%m_%d-%H_%M_%S")
    return os.path.join(root_logdir, "run_" + timestamp)


# Log directory of the current run (fixed when this cell is executed)
run_logdir = get_run_logdir()

# File the trained model is checkpointed to and later reloaded from
model_name = "dynz_score_lob_v3_10s.h5"

In [None]:
# Learning rate callback. Reduce on Plateau multiply the lr by the factor if val loss does not improve for n epochs (patience)
lr_callback = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss',
                                                   factor=0.5,
                                                   patience=5)

# Checkpoint callback. Saves the full model (not only the weights) whenever the
# monitored validation loss improves. Checkpointing happens at the end of every
# epoch, which is the default `save_freq='epoch'`; the deprecated `period`
# argument is no longer passed because newer Keras releases reject it.
cp_callback = tf.keras.callbacks.ModelCheckpoint(model_name,
                                                 monitor='val_loss',
                                                 save_best_only=True,
                                                 save_weights_only=False,
                                                 verbose=1)

# Early stopping callback. Stops when the validation loss shows no progress for
# 20 epochs and restores the best weights seen so far.
es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                               patience=20,
                                               restore_best_weights=True)

# Tensorboard callback
tb_callback = tf.keras.callbacks.TensorBoard(run_logdir)

# Train and Test time series generators: sliding windows of 100 consecutive
# rows, served in batches of 64 and kept in their original order
generator_train = TimeseriesGenerator(
    train_depth_dyn,
    encoded_train_labels,
    length=100,
    batch_size=64,
    shuffle=False
)

# to be replaced with validation?
# NOTE(review): the test set doubles as validation data, so the learning-rate
# schedule, the checkpoint selection and early stopping are all driven by it.
generator_test = TimeseriesGenerator(
    test_depth_dyn,
    encoded_test_labels,
    length=100,
    batch_size=64,
    shuffle=False
)

# Train the model
ligh_deeplob.fit(generator_train,
            epochs=200,
            verbose=1,
            validation_data=generator_test,
            callbacks=[lr_callback, cp_callback, es_callback, tb_callback])
# This may generate warnings related to saving the state of the optimizer.
# These warnings (and similar warnings throughout this notebook)
# are in place to discourage outdated usage, and can be ignored.

## Predictions

In [None]:
# Model predictions

# Load the previously saved model from disk (the checkpoint stores the whole
# model, not only its weights)
deep_lob_loaded = tf.keras.models.load_model(model_name)

# Rebuild the test generator so this cell does not depend on the training cell
generator_test = TimeseriesGenerator(
    data=test_depth_dyn,
    targets=encoded_test_labels,
    length=100,
    batch_size=64,
    shuffle=False,
)

# Re-evaluate the model: evaluate returns the loss followed by the accuracy metric
loss, acc = deep_lob_loaded.evaluate(generator_test, verbose=2)
print("Restored model, accuracy: {:5.2f}%".format(100*acc))

In [None]:
# reverse engineer how to_categorical have assigned labels and map them back

def back_to_labels(x):
    """Translate a one-hot class index back into the original label value.

    Class indices 0 and 1 map to labels 0 and 1, class index 2 maps to label -1.
    Any other value falls through and yields ``None``.
    """
    # (class index, original label) pairs, checked in order
    index_to_label = ((0, 0), (1, 1), (2, -1))
    for class_index, label in index_to_label:
        if x == class_index:
            return label
    return None
        
# Class probabilities predicted by the reloaded model for every test window
predictions = deep_lob_loaded.predict(generator_test, verbose=1)

# Keep the most likely class of each window and translate it back to a label value
most_likely_class = predictions.argmax(axis=1)
map_labels = np.vectorize(back_to_labels)
mapped_labels = map_labels(most_likely_class)

In [None]:
# Predicted class probabilities as a DataFrame, one row per prediction.
# Columns follow the model's output order (class index 0, 1, 2) and are named with
# the label values those indices stand for (0, 1, -1), consistent with back_to_labels.
predictions_pctg_df = pd.DataFrame(predictions, columns=[0, 1, -1]) # output pctg at each timestep

In [None]:
# Plot the predicted labels over the test mid price.
# The first 100 mid prices are dropped and the series re-indexed from 0 so that it lines up with
# mapped_labels - presumably because the generator's window length of 100 means the first
# prediction refers to row 100 (TODO confirm).
plot_data(mid_px_test_dyn[100:].reset_index()[0], pd.Series(mapped_labels), 0, 30100, train_test_switch='predictions',y0=0)

In [None]:
# plot predicted labels, real labels, difference between the two and probability of a "buy" label
first_points = slice(0, 3000)  # only the first 3000 predictions are drawn
predicted_window = mapped_labels[first_points]
# real labels, skipping the first 100 rows so they line up with the predictions
actual_window = labels_dyn_test[100:][first_points].values

fig = make_subplots(rows=1, cols=1, specs=[[{"secondary_y": True}]])

# predicted and real labels are shifted by +5 / -5 so the three step lines do not overlap
fig.add_trace(go.Scatter(y=predicted_window + 5, name='predicted'))
fig.add_trace(go.Scatter(y=actual_window - 5, name='labels'))
fig.add_trace(go.Scatter(y=predicted_window - actual_window, name='predictions - labels'))

# predicted probability of label 1, on the secondary y axis
fig.add_trace(
    go.Scatter(y=predictions_pctg_df[1][first_points], name='Probability of 1'),
    secondary_y=True,
)

# title and figure size; as the last expression this also displays the figure
fig.update_layout(title='<b>Label Comparison</b>', title_x=0.5, width=1200, height=600)