In [None]:
%load_ext autoreload
%autoreload 2
import LOBData
from datetime import timedelta, datetime
import pandas as pd
import plotly.express as px
import gzip
import os
import gzip
import json
import numpy as np
from func_tools import normalize, get_labels, cnn_data_reshaping, reshape_lob_levels, plot_labels, label_insights, get_pnl

import time

In [None]:
# preprocessing inputs
security = 'USDT_BTC'
raw_data_path = 'S3_data' # where json data is stored
root_caching_folder = "Processed_Data"
frequency = timedelta(seconds=10)
norm_type = 'dyn_z_score'

# labelling inputs (k_plus, k_minus and alpha are forwarded to get_labels further down)
k_plus = 30#60
k_minus = 30#60
alpha = 0.001#0.0005
roll = 7200 * 6 # step from minute to 10 second data
# pull data from S3
#download_s3_data('limit-order-books-data-po-limitorderbooksnapshots-v25ungbmmak9', pair)

# Data import - needs to be adjusted importing from several files using Dask
data = pd.read_csv(f'{root_caching_folder}/{security}/data-cache-10s.csv', index_col=0)
lob_depth = data['Level'].max() + 1 # number of levels of order book (Level is zero-based)

In [None]:
# Train test split: the first 70% of the distinct timestamps go to train, the remainder to test
train_test_split = int(0.7 * (data.shape[0] / lob_depth)) # slice reference for train and test
unique_timestamps = data['Datetime'].unique()
train_timestamps = unique_timestamps[:train_test_split]
test_timestamps = unique_timestamps[train_test_split:]

# select whole snapshots (all levels of a timestamp) and index them by (Datetime, Level)
in_train = data['Datetime'].isin(train_timestamps)
in_test = data['Datetime'].isin(test_timestamps)
train_cached_data = data[in_train].set_index(['Datetime', 'Level'])
test_cached_data = data[in_test].set_index(['Datetime', 'Level'])

print(f'Train dataset shape: {train_cached_data.shape} - Test dataset shape: {test_cached_data.shape}')

In [None]:
# # z-score formula
# mean_rw = np.mean(stacked_series.iloc[0:roll * 10 * 2])
# std_rw = np.std(stacked_series.iloc[0:roll * 10 * 2])
# new_data = stacked_series[(roll * 10 * 2):(roll * 20 * 2) + (10 * 2)]
# z_rw = (new_data - mean_rw) / std_rw


In [None]:
class DataNormalization:
    ''' Rolling ("dynamic") z-score over a flattened time series.

        The input is flattened into one long series and walked in steps of
        roll_step = ob_levels * n columns values. Each step is normalized with
        the np.mean / np.std of a window of roll_window = roll * roll_step values.
        # assumes rows are ordered by timestep, then order book level - TODO confirm against caller
    '''

    def __init__(self, ts, roll, ob_levels, start=0):
        ''' 
            ts: pd.Series or pd.Dataframe. If dataframe, need to have cols that can be normalized
                together, like all prices or sizes

            roll: int, rolling window (depends on frequency of data passed)

            ob_levels: int, orderbook depth. Assumed to be constant throughout all timeseries
            
            start: int, at which point of the timeseries the rolling start. Has to be a multiple of 
                    ob_levels * n df columns
        '''
        self.ts = ts
        self.roll = roll
        self.ob_levels = ob_levels
        # number of columns normalized together; a Series has a 1-D shape, so it counts as one column
        self.ts_shape = self.ts.shape[1] if len(self.ts.shape) > 1 else 1
        self.roll_window = self.roll * self.ob_levels * self.ts_shape
        self.roll_step = self.ob_levels * self.ts_shape
        self.start = start
        self.ts_stacked = self.get_ts_stack() # stack dataframe as default
        # explicit dtype: pd.Series() without one is deprecated and warns
        self.new_data = pd.Series(dtype='float64')
        self.dyn_ts = pd.Series(dtype='float64')
    
    def get_ts_stack(self):
        ''' Flatten dataframe into a series if more than 1 column is passed '''
        if self.ts_shape > 1:
            self.ts_stacked = self.ts.stack()
        else:
            self.ts_stacked = self.ts
        return self.ts_stacked

    def get_new_data(self):
        ''' Add 1 roll step to the self.start variable, then return the roll_step values
            located at self.start + self.roll_window (empty once past the end of the series) '''
        self.start += self.roll_step
        slice_from = self.start + self.roll_window
        self.new_data = self.ts_stacked.iloc[slice_from:slice_from + self.roll_step]
        return self.new_data

    def get_one_dyn_z(self):
        ''' Calculate the dynamic z score of one roll step and advance self.start by one step '''
        window = self.ts_stacked.iloc[self.start:self.roll_window + self.start]
        mean_rw = np.mean(window)
        std_rw = np.std(window)
        # self.start is updated in get_new_data, so get_new_data() has to be executed after mean_rw and std_rw
        # NOTE(review): because start is advanced before slicing, the normalized values sit one
        # roll_step beyond the end of the window used for mean/std - confirm this gap is intended
        self.new_data = self.get_new_data() 
        z_rw = (self.new_data - mean_rw) / std_rw
        return z_rw
        
    def get_ts_dyn_z(self):
        ''' Loop through all time series - much slower than pandas rolling implementation.

            Returns self.dyn_ts, extended with every non-empty step produced by this call.
        '''
        chunks = []
        while self.roll_window+self.start <= self.ts_stacked.shape[0]:
            z_rw = self.get_one_dyn_z()
            # the last iterations slice past the end of the series and yield nothing
            if len(z_rw) > 0:
                chunks.append(z_rw)
        if chunks:
            # concatenate once (instead of once per step) and keep results of earlier calls in front
            previous = [self.dyn_ts] if len(self.dyn_ts) > 0 else []
            self.dyn_ts = pd.concat(previous + chunks)
        return self.dyn_ts


In [None]:
# use the depth measured from the data (lob_depth) rather than a hard-coded 10, as the normalize() call further down does
norm_class = DataNormalization(test_cached_data[['Ask_Price', 'Bid_Price']], roll, lob_depth, 0)

In [None]:
start_time = time.time()
df = norm_class.get_ts_dyn_z()
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
norm_class.start

In [None]:
# time ten consecutive single-step normalizations
start_time = time.time()
for step_number in range(10):
    norm_class.get_one_dyn_z()
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
norm_class.ts_stacked.shape

In [None]:
norm_class.roll_window + norm_class.start

In [None]:
test_cached_data.iloc[[1, 3]]

In [None]:
pd.concat([dyn_df,dyn_df2,pd.Series()])

In [None]:
dyn_df2

In [None]:
norm_class.get_one_dyn_z()
norm_class.get_one_dyn_z()
norm_class.get_one_dyn_z()
dyn_df2 = norm_class.get_one_dyn_z()

In [None]:
pd.Series([])

In [None]:
#norm_class = DataNormalization(test_cached_data[['Ask_Size', 'Bid_Size']], roll, 10, 0)
# DataNormalization has no dyn_z(); get_ts_dyn_z() is the method that returns the full dynamic z-score series
dyn_df = norm_class.get_ts_dyn_z()

In [None]:
dyn_df.reset_index().pivot_table(index=['Datetime', 'Level'], columns='level_2', values=0, dropna=True)

In [None]:
test_dyn_df = pd.read_csv(f'{root_caching_folder}/{security}/TEST-{lob_depth}-{norm_type}-{roll}.csv')

In [None]:
test_dyn_df[test_dyn_df['Datetime']=='2020-08-14 19:27:00']

In [None]:
stacked_series.iloc[0:roll * 10 * 2]

In [None]:
normalize(test_cached_data[['Ask_Price', 'Bid_Price']], lob_depth, 'dyn_z_score', roll)

In [None]:
preprocessing = LOBData.LOBData(raw_data_path, security, root_caching_folder, frequency=timedelta(seconds=10), levels=100, resampled_cache='1min')
raw_data = preprocessing.get_LOB_data()

In [None]:
df1 = pd.read_csv(f'{root_caching_folder}/{security}/2020-11-12-original_frequency.csv.gz')

In [None]:
df1['Datetime'] = pd.to_datetime(df1['Datetime'])

In [None]:
df1 = df1.sort_values(by=['Sequence', 'Datetime'])

In [None]:
df_0 = df1[df1['Level']==0]

In [None]:
(df_0['Datetime'] - df_0['Datetime'].shift()).dt.total_seconds()[:50]#.plot()

In [None]:
df1[df1['Level']==0]

In [None]:
(df1['Datetime'] - df1['Datetime'].shift()).dt.total_seconds().plot()

In [None]:
(df1['Datetime'] - df1['Datetime'].shift()).dt.total_seconds().plot()

In [None]:
(df1['Datetime'] - df1['Datetime'].shift()).dt.total_seconds().sort_values()

In [None]:
pd.read_csv(f'{root_caching_folder}/{security}/test-data-cache_1min_2020-11-11.csv')

In [None]:
(8571500) / 144000

In [None]:
# read one gzipped hourly snapshot file and decode it to text
snapshot_path = f'{raw_data_path}/{security}/2020/11/11/20201111_00-0-0.json.gz'
with gzip.open(snapshot_path, 'rb') as gz_file:
    json_string = gz_file.read().decode('utf-8')

# report how many snapshots in the file carry the "isFrozen": "1" flag
frozen = json_string.count('"isFrozen": "1"')
if frozen > 0:
    print(f'Frozen {frozen} snapshots')

raw_data = json.loads(json_string)

In [None]:
raw_data.update(raw_data2)

In [None]:
type(raw_data)

In [None]:
#json_file['BTC_ETH-20201111_000956']

In [None]:
processed_data = []
# NOTE(review): `levels` is only assigned in a later configuration cell of this notebook - make sure it is set before running
# TODO - datetime as keys to sort later
for key, snapshot in raw_data.items():
    # unravel the nested json structure into a more manageable list of lists:
    # one list per snapshot, holding one tuple per order book level
    asks = snapshot['asks'][0:levels]
    bids = snapshot['bids'][0:levels]
    processed_data.append(list(zip(
        [ask[0] for ask in asks], # ask px (was i[0[0:3000]], which subscripts an int and raises TypeError)
        [ask[1] for ask in asks], # ask size
        [bid[0] for bid in bids], # bid px
        [bid[1] for bid in bids], # bid size
        list(range(levels)), # ob level - assuming same for both
        [snapshot['seq']] * levels,
        [key[-15:]] * levels  # datetime part of the key
    )))
# TODO sort datetime keys and cache one day as csv?


In [None]:
#sorted(raw_data.keys(), reverse=True)

In [None]:
pd.DataFrame(processed_data)

In [None]:
#[processed_data, raw_data.get(key)['seq']]

In [None]:
# unravel nested structure and force data types
lob_columns = ['Ask_Price', 'Ask_Size', 'Bid_Price', 'Bid_Size','Level', 'Sequence','Datetime']
flat_rows = [row for snapshot in processed_data for row in snapshot] #flatten the list of lists structure

df = pd.DataFrame(flat_rows, columns=lob_columns).astype({
    'Ask_Price': 'float64',
    'Ask_Size': 'float64',
    'Bid_Price': 'float64',
    'Bid_Size': 'float64',
    'Level': 'int64',
    'Sequence': 'int64',
})
# the datetime strings follow the pattern YYYYMMDD_HHMMSS
df['Datetime'] = pd.to_datetime(df['Datetime'], format='%Y%m%d_%H%M%S')

In [None]:
np.array(processed_data).shape

In [None]:
preprocessing = LOBData.LOBData(raw_data_path, security, root_caching_folder, frequency=timedelta(seconds=10), levels=100, resampled_cache='1min')
raw_data = preprocessing.get_LOB_data()

In [None]:
df1 = pd.read_csv(f'{root_caching_folder}/{security}/2020-11-11 00:00:00-original_frequency.csv.gz')

In [None]:
df2= pd.read_csv(f'{root_caching_folder}/{security}/2020-11-12 00:00:00-original_frequency.csv.gz')


In [None]:
df1[df1['Level']==0].head(5000)['Bid_Price'].plot()

In [None]:
df2[df2['Level']==0].head(5000)['Bid_Price'].plot()


In [None]:
df2

In [None]:
day = 'BTC_ETH/20201112_14-0-3.json.gz'

In [None]:
day.split(".")[0].split("/")[1].split("_")[0]

In [None]:
temp_df = pd.read_csv(f'{root_caching_folder}/{security}/original_frequency.csv')

In [None]:
temp_df.sort_values(by='Sequence')

In [None]:
deltas = pd.to_datetime(raw_data['Datetime'].unique()) - pd.to_datetime(pd.Series(raw_data['Datetime'].unique()).shift(periods=1))
print(deltas.describe()), print('######'), deltas.sort_values(ascending=False).tail(50)

In [None]:
df_time_anomalyes = pd.concat([pd.Series(raw_data['Datetime'].unique()), deltas], axis=1)
df_time_anomalyes[df_time_anomalyes.index.isin(deltas[deltas != pd.Timedelta(seconds=10)].index)]

In [None]:
fig = px.histogram(deltas, log_y=True)
fig.show()

In [None]:
minute_cache.head()

In [None]:
fig = px.line(minute_cache[minute_cache.Level == 0], y='Ask_Size', x='Datetime')
fig.show()

In [None]:
minute_cache = pd.read_csv(f'{root_caching_folder}/{security}/test-data-cache-1m.csv')

In [None]:
deltas_minute = pd.to_datetime(minute_cache['Datetime'].unique()) - pd.to_datetime(pd.Series(minute_cache['Datetime'].unique()).shift(periods=1))
print(deltas_minute.describe()), print('######'), deltas_minute.sort_values(ascending=False).head(50)

## Heavy lifting

In [None]:
import pandas as pd
import numpy as np
import func_tools as ft

In [None]:
    # NOTE(review): this cell reads start_date, end_date, ob_levels, norm_window, make_subplots and go,
    # none of which are assigned in an earlier visible cell - it looks like the body of a dash callback

    # initiate values to print out under dash components
    k_plus_window_text = ''
    k_minus_window_text = ''
    alpha_thresh_text = ''
    tr_fee_text = ''
    
    # data reading: one-minute cache restricted to [start_date, end_date]
    data = pd.read_csv(f'{root_caching_folder}/{security}/data-cache-1m.csv', index_col=0)
    data = data[(data.Datetime >= start_date) & (data.Datetime <= end_date)]

    # top of book only. Explicit copy: columns are added below, and assigning into a filtered
    # slice of `data` triggers SettingWithCopyWarning
    data_top = data[data.Level  == 0].copy()#.reset_index() #fix double index issue. Do it the func tool way, cause that's the one that changes
    data_top['Mid_Price'] = (data_top['Ask_Price'] + data_top['Bid_Price']) / 2
    data_top['Spread'] = (data_top['Ask_Price'] - data_top['Bid_Price']) / data_top['Mid_Price'] # relative to mid price
    # NOTE(review): data_top only holds Level 0, so these sums are top-of-book sizes although the
    # traces below are named "10 levels" - confirm whether the sizes should be summed over `data` instead
    data_grouped = data_top.groupby('Datetime').agg(
        {'Ask_Size':'sum',
        'Bid_Size':'sum',
        'Spread': 'min'
        }
    )
    #bbo_df.index = bbo_df.index.set_names(['date'])
    #bbo_df = bbo_df.reset_index()
    #print(bbo_df.tail(10))
    
    #print(security)
    
    # two stacked panels sharing the x axis: price on top, depth and spread below
    px_chart = make_subplots(rows=2, cols=1, subplot_titles=("", "10 levels depth and spread"), shared_xaxes=True,
                            row_heights=[0.6, 0.4], vertical_spacing = 0.10, 
                            specs=[[{"secondary_y": True}], [{"secondary_y": True}]])

    pnl_chart = px.line(height=200)
    #for i in range(len(norm_type)):
    sec_axis_check = False

    # add depth and spread to main chart
    px_chart.add_trace(go.Scatter(x=data_grouped.index.values, y=data_grouped['Bid_Size'].values,  name='Bid depth - 10 levels',
                                marker=dict(color='#81C342')), row=2, col=1, secondary_y=False) # fill down to xaxis

    # ask size is negated so it is drawn below the x axis, mirroring the bid size
    px_chart.add_trace(go.Scatter(x=data_grouped.index.values, y=-data_grouped['Ask_Size'].values,  name='Ask depth - 10 levels',
                                marker=dict(color='#EB2030')), row=2, col=1, secondary_y=False) # fill down to xaxis

    px_chart.add_trace(go.Scatter(x=data_grouped.index.values, y=data_grouped['Spread'].values,  name='Best bid-offer spread',
                                marker=dict(color='#335eff')), row=2, col=1, secondary_y=True) # fill down to xaxis



    px_chart.add_trace(go.Scatter(x=data_top['Datetime'], y=data_top['Mid_Price'], name='price', marker=dict(color='#000000')), 
                        row=1, col=1, secondary_y=False)
    px_chart.update_yaxes(title_text="$ price", secondary_y=sec_axis_check, row=1, col=1)
    sec_axis_check = True



    # if 'z_score' in norm_type:
    #     norm_ts = ft.normalize(data[['Ask_Price', 'Bid_Price']], ob_levels=ob_levels, norm_type='z_score')
    #     px_chart.add_trace(go.Scatter(x=data.index, y=norm_ts, name='z-score', marker=dict(color='#19D3F3')), 
    #                         row=1, col=1, secondary_y=sec_axis_check)
    #     px_chart.update_yaxes(title_text="normalized price", secondary_y=sec_axis_check, row=1, col=1)

    #if 'dyn_z_score' in norm_type:
    data_ft = data.set_index(['Datetime', 'Level'])
    norm_ts_px = ft.normalize(data_ft[['Ask_Price', 'Bid_Price']], ob_levels=ob_levels, norm_type='dyn_z_score', roll=norm_window)
    norm_ts_vol = ft.normalize(data_ft[['Ask_Size', 'Bid_Size']], ob_levels=ob_levels, norm_type='dyn_z_score', roll=norm_window) # get norm volumes
    test_dyn_df = pd.concat([norm_ts_px, norm_ts_vol], axis=1).reset_index() # concat along row index
    depth_dyn, dt_index_dyn = ft.reshape_lob_levels(test_dyn_df, output_type='array') # 1 train dataset
    mid_px_train_dyn = pd.Series((depth_dyn[:,2] + depth_dyn[:,0]) / 2) # 2


In [None]:
# Notes
# specify number of levels as well as frequency in caches
# add frequency as class parameter

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import plotly_express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from datetime import datetime, timedelta

import tensorflow
# for device in tensorflow.config.experimental.list_physical_devices('GPU'):
#     tensorflow.config.experimental.set_memory_growth(device, True)
from keras.utils import np_utils
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Reshape, Conv2D, LSTM, Dense, MaxPooling2D, LeakyReLU, concatenate, Dropout
from tensorflow.keras.optimizers import Adam

import os


import LOBData
#from func_tools import normalize, get_labels, cnn_data_reshaping, reshape_lob_levels, plot_labels, label_insights

In [None]:
# Put a fixed memory cap on the first visible GPU, then report the GPU device name.
gpus = tensorflow.config.experimental.list_physical_devices('GPU')
if gpus:
  # Restrict TensorFlow to a single virtual device with memory_limit=6024 on the first GPU
  # (TensorFlow documents memory_limit in MB, i.e. roughly 6 GB - not the 1GB this comment used to state)
  try:
    tensorflow.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tensorflow.config.experimental.VirtualDeviceConfiguration(memory_limit=6024)])
    logical_gpus = tensorflow.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Virtual devices must be set before GPUs have been initialized
    print(e)
# last expression of the cell: displayed as the cell output
tensorflow.test.gpu_device_name()

In [None]:
tensorflow.test.gpu_device_name()

In [None]:
# def fun(x):
#     x[0]=5
#     return x
# g = [10,11,12]

# print(g)

# f = fun(g)
# print(f)

In [None]:
# names = ['Amir', 'Barry', 'Char', 'Delp']
# print(names[-1])

In [None]:
# preprocessing inputs
security = 'USDC_BTC'
raw_data_path = 'Pawel_test' # where json data is stored
root_caching_folder = "Processed_Data"
frequency = timedelta(seconds=10)
levels = 10 # order book levels, passed to LOBData and to normalize(ob_levels=...)


# labelling inputs (k_plus, k_minus and alpha are forwarded to get_labels; roll to normalize)
k_plus = 60
k_minus = 60
alpha = 0.0005
roll = 7200
# pull data from S3
#download_s3_data('limit-order-books-data-po-limitorderbooksnapshots-v25ungbmmak9', pair)

In [None]:
# Workflow
preprocess = True

# single definition of the cache location, used for both writing and reading
cache_10s_path = f'{root_caching_folder}/{security}/data-cache-10s-test.csv'

if preprocess:
    # rebuild the cache from the raw json files, using the values from the configuration cell
    preprocessing = LOBData.LOBData(raw_data_path, security, root_caching_folder, frequency=frequency, levels=levels)
    raw_data = preprocessing.get_LOB_data()
    raw_data.to_csv(cache_10s_path) # save raw data full depth 10 secs as csv, can be a big file

# always load from the csv cache so `data` is defined whichever branch ran
data = pd.read_csv(cache_10s_path, index_col=0)

In [None]:
data = pd.read_csv(f'{root_caching_folder}/{security}/test-data-cache-1m.csv', index_col=0)

In [None]:
#data['Datetime'] = pd.to_datetime(data['Datetime'], format='%Y%m%d_%H%M%S')
data['Datetime'] = pd.to_datetime(data['Datetime'], format='%Y-%m-%d %H:%M:%S')

In [None]:
resampled_data = data.groupby([pd.Grouper(key='Datetime', freq='1h'), pd.Grouper(key='Level')]).last().reset_index()

In [None]:
df_partitions = [group for group in resampled_data.groupby([resampled_data.Datetime.dt.year, resampled_data.Datetime.dt.month, resampled_data.Datetime.dt.day])]

In [None]:
caching_folder = f'{root_caching_folder}/{security}'
cache_file = f'{caching_folder}/test-data-cache-1h'

In [None]:
group[0]

In [None]:
for group in df_partitions:
    partition_date = '-'.join([str(x) for x in group[0]])
    group[1].to_csv(f'{cache_file}_{partition_date}.csv')

In [None]:
'-'.join([str(x) for x in group[0]])

In [None]:
print(data.shape)
data.head()

In [None]:
# data['Mid_Price'] = (data['Ask_Price'] + data['Bid_Price']) / 2
# data['Spread'] = (data['Ask_Price'] - data['Bid_Price']) / data['Mid_Price']
# data_grouped = data.groupby('Datetime').agg({'Ask_Size':'sum',
#                               'Bid_Size':'sum',
#                               'Spread': ['min', 'max']
#                              })
# data_grouped.head()

### Train - Test split

In [None]:
# the first 70% of the distinct timestamps go to train, the remainder to test
lob_depth = data['Level'].max() + 1 # number of levels of order book
train_test_split = int(0.7 * (data.shape[0] / lob_depth)) # slice reference for train and test
unique_timestamps = data['Datetime'].unique()
train_timestamps = unique_timestamps[:train_test_split]
test_timestamps = unique_timestamps[train_test_split:]

# select whole snapshots (all levels of a timestamp) and index them by (Datetime, Level)
in_train = data['Datetime'].isin(train_timestamps)
in_test = data['Datetime'].isin(test_timestamps)
train_cached_data = data[in_train].set_index(['Datetime', 'Level'])
test_cached_data = data[in_test].set_index(['Datetime', 'Level'])

print(f'Train dataset shape: {train_cached_data.shape} - Test dataset shape: {test_cached_data.shape}')

### Normalize & check how train and test distributions differ

In [None]:
# # use normalize to calculate standardized z score version of the train data
# train_z_prices = normalize(train_cached_data[['Ask_Price', 'Bid_Price']], 10, norm_type='z_score', roll=0) # get norm prices
# train_z_volumes = normalize(train_cached_data[['Ask_Size', 'Bid_Size']], 10, norm_type='z_score', roll=0) # get norm volumes
# train_z_df = pd.concat([train_z_prices, train_z_volumes], axis=1).reset_index() # concat along row index


# # use normalize to calculate standardized z score version of the test data
# test_z_prices = normalize(test_cached_data[['Ask_Price', 'Bid_Price']], 10, norm_type='z_score', roll=0) # get norm prices
# test_z_volumes = normalize(test_cached_data[['Ask_Size', 'Bid_Size']], 10, norm_type='z_score', roll=0) # get norm volumes
# test_z_df = pd.concat([test_z_prices, test_z_volumes], axis=1).reset_index() # concat along row index


# display(train_z_df.describe())
# display(test_z_df.describe())

In [None]:

# use normalize to calculate standardized z score version of the train data
train_dyn_prices = normalize(train_cached_data[['Ask_Price', 'Bid_Price']], ob_levels=levels, norm_type='dyn_z_score', roll=roll) # get norm prices
train_dyn_volumes = normalize(train_cached_data[['Ask_Size', 'Bid_Size']], ob_levels=levels, norm_type='dyn_z_score', roll=roll) # get norm volumes
train_dyn_df = pd.concat([train_dyn_prices, train_dyn_volumes], axis=1).reset_index() # concat along row index


# use normalize to calculate standardized z score version of the test data
test_dyn_prices = normalize(test_cached_data[['Ask_Price', 'Bid_Price']], ob_levels=levels, norm_type='dyn_z_score', roll=roll) # get norm prices
test_dyn_volumes = normalize(test_cached_data[['Ask_Size', 'Bid_Size']], ob_levels=levels, norm_type='dyn_z_score', roll=roll) # get norm volumes

test_dyn_df = pd.concat([test_dyn_prices, test_dyn_volumes], axis=1).reset_index() # concat along row index


display(train_dyn_df.describe())
display(test_dyn_df.describe())

### Reshape and Label

#### z-score

In [None]:
# # Reshape to a format suitable for deep lob like training
# train_depth_z, train_dt_index_z = reshape_lob_levels(train_z_df, output_type='array')
# test_dept_z, test_dt_index_z = reshape_lob_levels(test_z_df, output_type='array')

# # generate labels from z score mid px. Get mid stacking train and test bbo
# mid_px_series_z = (pd.Series(np.hstack([train_depth_z[:,2], test_dept_z[:,2]])) + pd.Series(np.hstack([train_depth_z[:,0], test_dept_z[:,0]])))/2

# # Decide whether to get labels from mid px or from standardized data
# labels_z = get_labels(mid_px_series_z, k_plus, k_minus, alpha, long_only=False)

#### dynamic z-score

In [None]:
# Reshape to a format suitable for deep lob like training
train_depth_dyn, train_dt_index_dyn = reshape_lob_levels(train_dyn_df, output_type='array')
test_depth_dyn, test_dt_index_dyn = reshape_lob_levels(test_dyn_df, output_type='array')

# # generate labels from z score mid px. Get mid stacking train and test bbo
# mid_px_series_dyn = (pd.Series(np.hstack([train_depth_dyn[:,2], test_dept_dyn[:,2]])) + pd.Series(np.hstack([train_depth_dyn[:,0], test_dept_dyn[:,0]])))/2

# # Decide whether to get labels from mid px or from standardized data
# labels_dyn = get_labels(mid_px_series_dyn, k_plus, k_minus, alpha, long_only=False)

In [None]:
# generate labels from z score mid px. Get mid stacking train and test bbo
# train
mid_px_train_dyn = pd.Series((train_depth_dyn[:,2] + train_depth_dyn[:,0]) / 2)
labels_dyn_train = get_labels(mid_px_train_dyn, k_plus, k_minus, alpha, long_only=False)
# test
mid_px_test_dyn = pd.Series((test_depth_dyn[:,2] + test_depth_dyn[:,0]) / 2)
labels_dyn_test = get_labels(mid_px_test_dyn, k_plus, k_minus, alpha, long_only=False)

### Visualize data and labels

In [None]:
# # Labels
# print('Train Labels')
# train_transact_z = label_insights(labels_z[:train_test_split])
# print('\nTest Labels')
# test_transact_z = label_insights(labels_z[train_test_split:])
# print(f'\nLabels Train as pctg of total: {test_transact_z/(test_transact_z+train_transact_z)}')

In [None]:
# Labels
print('Train Labels')
train_transact_dyn = label_insights(labels_dyn_train)
print('\nTest Labels')
test_transact_dyn = label_insights(labels_dyn_test)
# the numerator is the test figure, so this ratio is the test share of the total (the message used to say "Train")
print(f'\nLabels Test as pctg of total: {test_transact_dyn/(test_transact_dyn+train_transact_dyn)}')

In [None]:
# timeseries useful for plotting
mid_px_df = data[data['Level']==0].reset_index()
mid_px_df['Mid_Price'] = (mid_px_df['Ask_Price'] + mid_px_df['Bid_Price']) / 2

# datetime_index = np.hstack([train_dyn_df[train_dyn_df['Level']==0]['Datetime'], test_dyn_df[test_dyn_df['Level']==0]['Datetime']])

# indexed_labels = pd.Series(data=labels_dyn.values, index=pd.Index(datetime_index))


In [None]:
def plot_data(start, end, y0=0):
    """Plot a slice of the dynamic z-score train mid price together with its labels.

    Reads the notebook-level `mid_px_train_dyn` and `labels_dyn_train`.

    start, end: int, positional bounds of the slice to plot
    y0: forwarded to plot_labels as its second argument

    Returns the plotly figure.
    """
    window = slice(start, end)

    figure = make_subplots(rows=1, cols=1, specs=[[{"secondary_y": True}]])
    figure.update_layout(title='<b>Visual check: values and labels</b>', title_x=0.5)

    # normalized mid price on the primary y axis
    mid_px_trace = go.Scatter(
        y=mid_px_train_dyn.values[window],
        x=mid_px_train_dyn.index[window],
        name='mix_px_dyn_train',
    )
    figure.add_trace(mid_px_trace)

    # encoded labels on the secondary y axis
    labels_trace = go.Scatter(y=labels_dyn_train[window], name='labels_encoded')
    figure.add_trace(labels_trace, secondary_y=True)

    # plot_labels returns the shapes drawn as label background
    label_shapes = plot_labels(labels_dyn_train[window], y0)
    figure.update_layout(shapes=label_shapes)
    figure.update_layout(width=1200, height=600)

    figure.update_layout(
        xaxis2={'anchor': 'x', 'overlaying': 'x', 'side': 'top'},
        yaxis_domain=[0, 1],
    )
    return figure

In [None]:
plot_data(0,10000,y0=0)
# labels concern: not clear how the labelling will capture sudden short lived drops. Data gaps? Also not clear how labels on mid px rather than norm mid would compare


In [None]:
# Histogram of mid price values, coloured by train/test membership.
# NOTE(review): `mid_px_series` is not assigned in any cell visible in this notebook - confirm which series is meant
dist_df = pd.DataFrame(mid_px_series, columns=['mid_px_z'])
# tag each row by comparing its index value with train_test_split
# NOTE(review): `<=` puts index == train_test_split into 'train', whereas the timestamp split slices [:train_test_split] - confirm
dist_df['split'] =  dist_df.reset_index()['index'].apply(lambda x: 'train' if x <= train_test_split else 'test')

fig = px.histogram(dist_df, x='mid_px_z', color='split', nbins=100)
fig.show()

In [None]:
# build sine waves with depth - 4,5 levels to see if the predictions are ok asynchronous

In [None]:
# Augment data to prepare for 1D convolution (add extra dimension back for 2D)

# train_X, train_Y = cnn_data_reshaping(train_depth, labels_reshaped[:train_test_split], T, conv_type='2D')
# test_X, test_Y = cnn_data_reshaping(test_depth, labels_reshaped[train_test_split:], T, conv_type='2D')

### Model training

In [None]:
# function to try to see if the model is capturing relations in the data

In [None]:
def create_deeplob(T, NF, number_of_lstm):
    ''' Build and compile the deeplob network: convolutional blocks -> inception module -> LSTM -> softmax.

        T: int, number of timesteps in one input sample

        NF: int, number of features per timestep. The (1, 10) kernel of the third convolutional
            block and the Reshape before the LSTM require the feature axis to be reduced to
            width 1, which is the case for NF = 40 (40 -> 20 -> 10 -> 1)

        number_of_lstm: int, number of units of the LSTM layer

        Returns a compiled keras Model with input shape (T, NF, 1) and a 3 class softmax output
    '''

    def conv_lrelu(tensor, filters, kernel, **conv_kwargs):
        # Conv2D followed by the LeakyReLU(alpha=0.01) used throughout the network
        tensor = Conv2D(filters, kernel, **conv_kwargs)(tensor)
        return LeakyReLU(alpha=0.01)(tensor)

    input_lmd = Input(shape=(T, NF, 1))
    
    # build the convolutional block
    # block 1: stride 2 along the feature axis halves its width
    conv_first1 = conv_lrelu(input_lmd, 32, (1, 2), strides=(1, 2))
    conv_first1 = conv_lrelu(conv_first1, 32, (4, 1), padding='same')
    conv_first1 = conv_lrelu(conv_first1, 32, (4, 1), padding='same')

    # block 2: halves the feature axis again
    conv_first1 = conv_lrelu(conv_first1, 32, (1, 2), strides=(1, 2))
    conv_first1 = conv_lrelu(conv_first1, 32, (4, 1), padding='same')
    conv_first1 = conv_lrelu(conv_first1, 32, (4, 1), padding='same')

    # block 3: the (1, 10) kernel without padding collapses a feature axis of width 10 to width 1
    conv_first1 = conv_lrelu(conv_first1, 32, (1, 10))
    conv_first1 = conv_lrelu(conv_first1, 32, (4, 1), padding='same')
    conv_first1 = conv_lrelu(conv_first1, 32, (4, 1), padding='same')
    
    # build the inception module: three parallel branches over the time axis
    convsecond_1 = conv_lrelu(conv_first1, 64, (1, 1), padding='same')
    convsecond_1 = conv_lrelu(convsecond_1, 64, (3, 1), padding='same')

    convsecond_2 = conv_lrelu(conv_first1, 64, (1, 1), padding='same')
    convsecond_2 = conv_lrelu(convsecond_2, 64, (5, 1), padding='same')

    convsecond_3 = MaxPooling2D((3, 1), strides=(1, 1), padding='same')(conv_first1)
    convsecond_3 = conv_lrelu(convsecond_3, 64, (1, 1), padding='same')
    
    convsecond_output = concatenate([convsecond_1, convsecond_2, convsecond_3], axis=3)
    print(convsecond_output.shape)
    # use the MC dropout here
    # drop the width 1 feature axis: (time, 1, channels) -> (time, channels) for the LSTM
    conv_reshape = Reshape((int(convsecond_output.shape[1]), int(convsecond_output.shape[3])))(convsecond_output)

    # build the last LSTM layer
    conv_lstm = LSTM(number_of_lstm)(conv_reshape)

    # build the output layer
    out = Dense(3, activation='softmax')(conv_lstm)
    model = Model(inputs=input_lmd, outputs=out)
    # `learning_rate` replaces the `lr` alias, which is deprecated in TF2 and rejected by newer Keras releases
    adam = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07)
    model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])

    return model

# deeplob = create_deeplob(100, 40, 64)
# deeplob.summary()

In [None]:
def create_light_deeplob(T, NF, number_of_lstm):
    ''' Build and compile a lighter deeplob variant: three convolutions -> inception module ->
        flatten -> dropout -> softmax (no LSTM layer).

        T: int, number of timesteps in one input sample

        NF: int, number of features per timestep. The (1, 10) kernel of the third convolution
            is sized for a feature axis of width 10 at that point, which is the case for NF = 40
            (40 -> 20 -> 10 -> 1)

        number_of_lstm: currently unused (the LSTM layer is commented out); kept so the signature
            matches create_deeplob

        Returns a compiled keras Model with input shape (T, NF, 1) and a 3 class softmax output
    '''

    def conv_lrelu(tensor, filters, kernel, **conv_kwargs):
        # Conv2D followed by the LeakyReLU(alpha=0.01) used throughout the network
        tensor = Conv2D(filters, kernel, **conv_kwargs)(tensor)
        return LeakyReLU(alpha=0.01)(tensor)
    
    input_lmd = Input(shape=(T, NF, 1))
    # build the convolutional block: two stride 2 convolutions each halve the feature axis
    conv_first1 = conv_lrelu(input_lmd, 16, (1, 2), strides=(1, 2))
    conv_first1 = conv_lrelu(conv_first1, 32, (1, 2), strides=(1, 2))

    # the (1, 10) kernel without padding collapses a feature axis of width 10 to width 1
    conv_first1 = conv_lrelu(conv_first1, 64, (1, 10))
    
    # build the inception module: three parallel branches over the time axis
    convsecond_1 = conv_lrelu(conv_first1, 64, (1, 1), padding='same')
    convsecond_1 = conv_lrelu(convsecond_1, 64, (3, 1), padding='same')

    convsecond_2 = conv_lrelu(conv_first1, 64, (1, 1), padding='same')
    convsecond_2 = conv_lrelu(convsecond_2, 64, (5, 1), padding='same')

    convsecond_3 = MaxPooling2D((3, 1), strides=(1, 1), padding='same')(conv_first1)
    convsecond_3 = conv_lrelu(convsecond_3, 64, (1, 1), padding='same')
    
    convsecond_output = concatenate([convsecond_1, convsecond_2, convsecond_3], axis=3)
    print(convsecond_output.shape)
    # use the MC dropout here
    # flatten (time, 1, channels) into a single vector of time * channels values
    conv_reshape = Reshape((int(convsecond_output.shape[1]) * int(convsecond_output.shape[3]),))(convsecond_output)

    # build the last LSTM layer
    #conv_lstm = LSTM(number_of_lstm)(conv_reshape)
    #dense_l = Dense(100, activation='softmax')(conv_reshape)
    # build the output layer
    conv_reshape = Dropout(rate=0.2)(conv_reshape)
    out = Dense(3, activation='softmax')(conv_reshape)
    model = Model(inputs=input_lmd, outputs=out)
    # `learning_rate` replaces the `lr` alias, which is deprecated in TF2 and rejected by newer Keras releases
    adam = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07)
    model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])

    return model

ligh_deeplob = create_light_deeplob(100, 40, 64)

In [None]:
ligh_deeplob.summary()

In [None]:
# for i in range(len(generator)):
# 	x, y = generator[i]
# 	print('%s => %s' % (x, y))

In [None]:
# x = train_depth.reshape(train_depth.shape + (1,))

# y = np_utils.to_categorical(np.array(labels_reshaped),3)[:train_test_split]

# generator = TimeseriesGenerator(
#     x,
#     y[:train_test_split],
#     100,
#     batch_size=64,
#     shuffle=False
# )

#history = deeplob.fit(generator, epochs=20, verbose=1)#, validation_data=(, test_cat_Y))

In [None]:
# generator[0][0].shape #reshape
# generator[0][0].shape # no reshape
# generator2[0][0][0].shape

In [None]:
# Build depth arrays, mid prices and labels from the dyn z-score data.
# The same three steps are applied to the train and the test split:
#   1. reshape the LOB levels into an array (plus the matching datetime index)
#   2. mid price = average of columns 0 and 2 of the reshaped depth
#      (presumably the best ask / best bid prices - TODO confirm against reshape_lob_levels)
#   3. generate labels from the mid-price series


def _depth_mid_labels(dyn_df):
    """Return ``(depth, dt_index, mid_px, labels)`` for one split of the LOB data.

    Reads the labelling inputs ``k_plus``, ``k_minus`` and ``alpha`` from the notebook
    globals and always labels with ``long_only=False``.
    """
    depth, dt_index = reshape_lob_levels(dyn_df, output_type='array')  # step 1
    mid_px = pd.Series((depth[:, 2] + depth[:, 0]) / 2)  # step 2
    labels = get_labels(mid_px, k_plus, k_minus, alpha, long_only=False)  # step 3
    return depth, dt_index, mid_px, labels


# train
(train_depth_dyn,
 train_dt_index_dyn,
 mid_px_train_dyn,
 labels_dyn_train) = _depth_mid_labels(train_dyn_df)

# test
(test_depth_dyn,
 test_dt_index_dyn,
 mid_px_test_dyn,
 labels_dyn_test) = _depth_mid_labels(test_dyn_df)

In [None]:
# Set the scene for TensorBoard
import os
import time

# Every training run gets its own sub-directory under this root.
root_logdir = os.path.join(os.curdir, "my_logs")


def get_run_logdir(root=None):
    """Return a timestamped log directory path for a single training run.

    Args:
        root: directory under which the run directory is placed. When omitted
            (or None) the notebook-level ``root_logdir`` is used, which keeps the
            original zero-argument call working unchanged.

    Returns:
        ``<root>/run_<YYYY>_<MM>_<DD>-<HH>_<MM>_<SS>`` as a string. The path is only
        built here; nothing is created on disk.
    """
    if root is None:
        root = root_logdir
    run_id = time.strftime("run_%Y_%m_%d-%H_%M_%S")
    return os.path.join(root, run_id)


run_logdir = get_run_logdir()

In [None]:
# checkpoint_path = "training_deep_lob/cp-{epoch:04d}.ckpt"
# checkpoint_dir = os.path.dirname(checkpoint_path)

In [None]:

# Learning-rate callback: ReduceLROnPlateau multiplies the lr by `factor` when val_loss
# has not improved for `patience` epochs.
lr_callback = tensorflow.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5)

# Checkpoint callback: writes the whole model (save_weights_only=False) to one .h5 file
# and only overwrites it when the monitored quantity improves (save_best_only=True).
# `save_freq='epoch'` checks at the end of every epoch; it replaces the deprecated
# `period=1` argument and keeps the same "every epoch" cadence.
cp_callback = tensorflow.keras.callbacks.ModelCheckpoint("light_lob_inc_dro_norm.h5",
                                                         save_best_only=True,
                                                         save_weights_only=False,
                                                         verbose=1,
                                                         save_freq='epoch')

# Early-stopping callback: stop when there is no progress on the validation set for
# `patience` epochs and roll back to the best weights seen.
es_callback = tensorflow.keras.callbacks.EarlyStopping(patience=20,
                                                       restore_best_weights=True)

# TensorBoard callback writing to this run's log directory.
tb_callback = tensorflow.keras.callbacks.TensorBoard(run_logdir)

# One-hot encode the labels once, then slice them at the train/test boundary so that
# each generator gets the targets matching its depth array.
categorical_labels = np_utils.to_categorical(np.array(labels_reshaped),3)

# Both generators use length=100 (third positional argument) and batches of 64,
# without shuffling so the time order is preserved.
generator_train = TimeseriesGenerator(
    train_depth,
    categorical_labels[:train_test_split],
    100,
    batch_size=64,
    shuffle=False
)

generator_test = TimeseriesGenerator(
    test_depth,
    categorical_labels[train_test_split:],
    100,
    batch_size=64,
    shuffle=False
)

ligh_deeplob.fit(generator_train,
                 epochs=200,
                 verbose=1,
                 validation_data=generator_test,
                 callbacks=[lr_callback, cp_callback, es_callback, tb_callback])
# This may generate warnings related to saving the state of the optimizer.
# These warnings (and similar warnings throughout this notebook)
# are in place to discourage outdated usage, and can be ignored.


In [None]:
# deep_lob_loaded_mock = tensorflow.keras.models.load_model("model_mock_1.h5")
# predictions = deep_lob_loaded_mock.predict(generator_test, verbose=1)

In [None]:
# Restore a full model (architecture + weights) from disk.
# NOTE(review): this loads "light_lob.h5", whereas the checkpoint callback in this
# notebook writes "light_lob_inc_dro_norm.h5" - confirm which artefact should be evaluated.
deep_lob_loaded = tensorflow.keras.models.load_model("light_lob.h5")

# Rebuild the test generator: one-hot targets for the rows after the train/test boundary.
test_targets_onehot = np_utils.to_categorical(np.array(labels_reshaped),3)[train_test_split:]
generator_test = TimeseriesGenerator(
    test_depth,
    test_targets_onehot,
    100,
    batch_size=64,
    shuffle=False
)

# Re-evaluate the restored model on the test generator and report its accuracy.
loss, acc = deep_lob_loaded.evaluate(generator_test, verbose=2)
accuracy_pct = 100*acc
print("Restored model, accuracy: {:5.2f}%".format(accuracy_pct))


In [None]:
# Run the restored model over the test generator and keep the raw outputs.
predictions = deep_lob_loaded.predict(generator_test, verbose=1)

In [None]:
# Display the raw model outputs (presumably one row of 3 class scores per sample - TODO confirm).
predictions

In [None]:
# Toy label vectors with values in {-1, 0, 1}, used in the following cells to inspect
# how to_categorical encodes them (note that each one starts with a different label order).
array1 = np.array([0,0,1,1,0,0,-1])
array2 = np.array([0,0,-1,-1,0,1])
array3 = np.array([1,1,0,-1])

In [None]:
# One-hot encode array1 into 3 classes.
# NOTE(review): presumably the -1 label ends up in the last column - confirm from the output.
np_utils.to_categorical(array1,3)

In [None]:
# Same check for array2, where -1 appears before 1.
np_utils.to_categorical(array2,3)

In [None]:
# Same check for array3, where 1 appears first.
np_utils.to_categorical(array3,3)

In [None]:
# Recover the column (class) index of each array2 element from its one-hot encoding.
np.argmax(np_utils.to_categorical(array2,3), axis = 1)

In [None]:
# Use the recovered class indices to index back into array2.
# NOTE(review): this indexes the sample array itself, not a class-index -> label table,
# so the result need not equal array2 - confirm this is the intended check.
array2[np.argmax(np_utils.to_categorical(array2,3), axis = 1)]

In [None]:
def plot_data(px_series, labels):
    """Build the "values and labels" visual-check figure.

    Args:
        px_series: object exposing ``.values`` (e.g. a pandas Series); drawn as a
            single line trace named 'mix_px_train'.
        labels: forwarded unchanged to ``plot_labels``; its return value is used as
            the layout ``shapes`` (the label background).

    Returns:
        The figure created by ``make_subplots``. It is not displayed here; the
        caller decides how to render it.
    """
    # Shapes that paint the label background behind the series.
    label_shapes = plot_labels(labels)

    figure = make_subplots(rows=1, cols=1, specs=[[{"secondary_y": True}]])

    # Price series on top of the label background.
    figure.add_trace(go.Scatter(y=px_series.values, name='mix_px_train'))

    # Title, background shapes and canvas size in a single layout update.
    figure.update_layout(
        title='<b>Visual check: values and labels</b>',
        title_x=0.5,
        shapes=label_shapes,
        width=1200,
        height=600,
    )

    return figure

In [None]:
# Reverse-engineer how to_categorical has assigned the labels so they can be mapped back.
# Prints the index of the first occurrence of each label value (0, 1, -1) in `labels`.
print(np.hstack([np.where(labels==0)[0][0], np.where(labels==1)[0][0], np.where(labels==-1)[0][0]]))# first element

def back_to_labels(x):
    """Map a one-hot class index back to its original label.

    Class index 0 -> label 0, 1 -> label 1, 2 -> label -1.
    Any other value returns None.
    """
    # (class index, original label) pairs, checked in order with ==.
    index_to_label = ((0, 0), (1, 1), (2, -1))

    for class_index, label in index_to_label:
        if x == class_index:
            return label

    return None

# Wrap the scalar mapper so it can be applied element-wise to an array.
map_labels = np.vectorize(back_to_labels)
# argmax over axis 1 picks the highest-scoring column of each prediction row;
# that class index is then mapped back to a label in {0, 1, -1}.
mapped_labels = map_labels(np.argmax(predictions,axis=1))