In [None]:
%load_ext autoreload
%autoreload 2
import sys, os, time, json, re
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from func_tools import standardize, cnn_data_reshaping, reshape_lob_levels, label_insights, back_to_labels, get_strategy_pnl,intraday_vol_ret 

import visualization_tools as viz_t

from labelling_class import Labels_Generator

#import multiprocessing
#import glob
import inspect

import plotly_express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import tensorflow as tf
from keras.utils import np_utils
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Reshape, Conv2D, LSTM, Dense, MaxPooling2D, BatchNormalization, LeakyReLU, concatenate, add, Dropout, Flatten
from tensorflow.keras.optimizers import Adam

In [None]:
# Cap TensorFlow's memory on the first GPU (limit is expressed in MB) instead of
# letting it claim the whole card.
gpu_memory_limit_mb = 6024
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    memory_cap = tf.config.experimental.VirtualDeviceConfiguration(memory_limit=gpu_memory_limit_mb)
    tf.config.experimental.set_virtual_device_configuration(gpus[0], [memory_cap])
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Virtual devices must be set before GPUs have been initialized
    print(e)
# Last expression of the cell: shows the name of the GPU device TensorFlow will use.
tf.test.gpu_device_name()

## Data

In [None]:
# ---- Experiment configuration ----
experiments_folder = 'Experiments'
input_data_folder = f'{experiments_folder}/input'
cache_folder = f'{experiments_folder}/cache'

pair = 'USDT_BTC'
frequency = timedelta(seconds=10)  # sampling frequency of the order book snapshots
lob_depth = 10                     # number of order book levels (asserted against the input below)
length = 100                       # window length given to the model and to TimeseriesGenerator
date_start = '2020_04_04'
date_end = '2021_01_03'
norm_type = 'dyn_z_score'
roll = 7200 * 6                    # rolling period handed to the standardisation helper
batch_size = 64                    # NOTE: re-assigned to 256 in the experiment-config cell further down

# labelling inputs
k_plus = 15
k_minus = 15
alpha = 0.0045
trading_fee=0.000712
min_profit=0.002

frequency_seconds = int(frequency.total_seconds())
os.makedirs(f'{cache_folder}/{pair}', exist_ok=True)

# Data import - needs to be adjusted importing from several files using Dask
input_file_name = f'{pair}--{lob_depth}lev--{frequency_seconds}sec--{date_start}--{date_end}.csv.gz'

normalized_train_file = f'{cache_folder}/{pair}/TRAIN--{norm_type}-{roll}--{input_file_name}'
normalized_test_file = f'{cache_folder}/{pair}/TEST--{norm_type}-{roll}--{input_file_name}'

top_ob_train_file = f'{cache_folder}/{pair}/TRAIN_TOP--{input_file_name}'
top_ob_test_file = f'{cache_folder}/{pair}/TEST_TOP--{input_file_name}'

cache_files = [normalized_train_file, normalized_test_file, top_ob_train_file, top_ob_test_file]


def _standardize_split(split_data):
  """Standardize prices and sizes of one split and return them as one flat frame.

  `standardize` is the helper imported from func_tools; the previous call to
  `normalize` referenced a name that is never imported or defined in this notebook.
  NOTE(review): assumes standardize takes (data, lob_depth, norm_type, roll) - confirm
  against func_tools.
  """
  dyn_prices = standardize(split_data[['Ask_Price', 'Bid_Price']], lob_depth, norm_type, roll)
  dyn_volumes = standardize(split_data[['Ask_Size', 'Bid_Size']], lob_depth, norm_type, roll)
  # axis=1: price and size columns side by side, aligned on the (Datetime, Level) index
  return pd.concat([dyn_prices, dyn_volumes], axis=1).reset_index()


def _top_of_book(split_data, rows_to_skip):
  """Return the raw (not standardized) level-0 rows with mid price, spread and a merge key.

  The first `rows_to_skip` rows are dropped so the frame starts where the
  standardized data starts.
  """
  is_top_level = split_data.index.get_level_values(1) == 0
  # .copy(): the frame is extended with new columns below, so it must not be a view of split_data
  top_ob = split_data[is_top_level][rows_to_skip:].copy()
  top_ob['Mid_Price'] = (top_ob['Ask_Price'] + top_ob['Bid_Price']) / 2
  top_ob['Spread'] = (top_ob['Ask_Price'] - top_ob['Bid_Price']) / top_ob['Mid_Price']
  top_ob['merge_index'] = np.arange(top_ob.shape[0]) # useful for merging later
  return top_ob


# Use the cache only when every cache file is present; a partially written cache
# (e.g. an interrupted run) triggers a full rebuild instead of a read error.
# NOTE(review): the caches are written with their index and read back without
# index_col, so cached frames carry the written index as ordinary columns and differ
# in layout from freshly computed ones - confirm downstream helpers accept both.
if all(os.path.isfile(cache_file) for cache_file in cache_files):
  print(f'Reading cached {normalized_train_file}')
  train_dyn_df = pd.read_csv(normalized_train_file)
  print(f'Reading cached {normalized_test_file}')
  test_dyn_df = pd.read_csv(normalized_test_file)

  print(f'Reading cached {top_ob_train_file}')
  top_ob_train = pd.read_csv(top_ob_train_file)
  print(f'Reading cached {top_ob_test_file}')
  top_ob_test = pd.read_csv(top_ob_test_file)

else:
  print(f'Reading {input_data_folder}/{input_file_name}')
  data = pd.read_csv(f'{input_data_folder}/{input_file_name}', index_col=0)
  assert lob_depth == data['Level'].max() + 1 # number of levels of order book

  # Train test split: first 70% of the timestamps for training, the rest for testing
  timestamps = data['Datetime'].unique()
  train_test_split = int((data.shape[0] / lob_depth) * 0.7) # slice reference for train and test
  train_timestamps = timestamps[:train_test_split]
  test_timestamps = timestamps[train_test_split:]

  train_cached_data = data[data['Datetime'].isin(train_timestamps)].set_index(['Datetime', 'Level'])
  test_cached_data = data[data['Datetime'].isin(test_timestamps)].set_index(['Datetime', 'Level'])

  print(f'Train dataset shape: {train_cached_data.shape} - Test dataset shape: {test_cached_data.shape}')

  roll_shift = roll+1 # rolling period for dyn z score - + 1 from the shift inside the standardisation helper

  train_dyn_df = _standardize_split(train_cached_data)
  print(f'Saving {normalized_train_file}')
  train_dyn_df.to_csv(normalized_train_file, compression='gzip') # save normalized data to csv

  top_ob_train = _top_of_book(train_cached_data, roll_shift)
  print(f'Saving {top_ob_train_file}')
  top_ob_train.to_csv(top_ob_train_file, compression='gzip') # save top level not normalized to csv

  test_dyn_df = _standardize_split(test_cached_data)
  print(f'Saving {normalized_test_file}')
  test_dyn_df.to_csv(normalized_test_file, compression='gzip') # save normalized data to csv

  top_ob_test = _top_of_book(test_cached_data, roll_shift)
  print(f'Saving {top_ob_test_file}')
  top_ob_test.to_csv(top_ob_test_file, compression='gzip') # save top level not normalized to csv

display(train_dyn_df.describe()) # check train data overview
display(test_dyn_df.describe()) # check test data overview

# train
train_depth_dyn, train_dt_index_dyn = reshape_lob_levels(train_dyn_df, output_type='array') # 1 train dataset
# average of columns 2 and 0 of the reshaped array - presumably best bid and best ask; verify against reshape_lob_levels
mid_px_train_dyn = pd.Series((train_depth_dyn[:,2] + train_depth_dyn[:,0]) / 2) # 2
px_ts_train = top_ob_train.reset_index()[['Mid_Price']]

# test
test_depth_dyn, test_dt_index_dyn = reshape_lob_levels(test_dyn_df, output_type='array') # 1 test dataset
mid_px_test_dyn = pd.Series((test_depth_dyn[:,2] + test_depth_dyn[:,0]) / 2) # 2
px_ts_test = top_ob_test.reset_index()[['Mid_Price']]

In [None]:
# Number of distinct calendar days covered by the test set.
# top_ob_test has no datetime index on either data path (RangeIndex when read from
# cache, (Datetime, Level) MultiIndex when freshly built), so `.index.date` fails.
# reset_index() exposes 'Datetime' as a column in both cases.
# NOTE(review): assumes the 'Datetime' values are parseable by pd.to_datetime.
test_days = pd.to_datetime(top_ob_test.reset_index()['Datetime']).dt.date
test_days.unique().shape

In [None]:
#train_cached_data.shape, top_ob_train.shape

In [None]:
# %%time
# labels = get_labels(top_ob_train.reset_index()['Mid_Price'], int(k_plus), int(k_minus), alpha, long_only=False)
# profit = get_strategy_pnl(top_ob_train.reset_index()['Mid_Price'], labels, trading_fee=0.000712, min_profit=0.0020, plotting=False, return_df=False)

In [None]:
# we need to know opt profit for these labels

## Labels

#### Train Labels

In [None]:
# Row window [start, end) shown in the label plots below.
start, end = 0, 30000

In [None]:
# Build the training labels from the raw (not standardized) top-of-book mid price.
# The statements below are order-dependent: the label generator is inspected after
# every step, which suggests each step updates `.labels` in place - TODO confirm
# against labelling_class.

# (disabled) constant to add to avoid negative value (problems with log rets)
# const = -min(mid_px_train_dyn.min(), mid_px_test_dyn.min())  + 0.1
# mid_px_train_dyn_shifted = mid_px_train_dyn.rename('mid_px_dyn')
# mid_px_train_dyn_shifted = mid_px_train_dyn_shifted + const
mid_px_train = px_ts_train['Mid_Price']
# train labels
train_labels_gen = Labels_Generator(mid_px_train)

# step 1 - raw labels
print('\n##### Step 1 #####')
train_labels_gen.get_raw_labels()
label_insights(train_labels_gen.labels)


# step 2 - first cleaning (forward-fill, thresholds: gross_returns=0.005, trade_len=20)
print('\n##### Step 2 #####')
df_trades2 = train_labels_gen.get_cleaned_labels(fillna_method='ffill', gross_returns=0.005, trade_len=20)
label_insights(train_labels_gen.labels)

# step 3 - second cleaning (fill with 0, thresholds: gross_returns=0.005, trade_len=30)
# NOTE: df_trades2/df_trades3 are re-assigned by the test-label cell; the two plot
# cells right after this one therefore show train trades only when run in order.
print('\n##### Step 3 #####')
df_trades3 = train_labels_gen.get_cleaned_labels(fillna_value=0, gross_returns=0.005, trade_len=30)#, gross_returns=0.002)
label_insights(train_labels_gen.labels)
# Visual check of the final labels on the first [start:end] rows
viz_t.plot_labels_line(mid_px_train[start:end], 
    train_labels_gen.labels[start:end], 
    title='Train Labels', 
    smoothed_signal=train_labels_gen.get_smooth_px()[start:end])

labels_train = train_labels_gen.labels

# get transaction df
strategy_df_train = get_strategy_pnl(mid_px_train, labels_train)

# one-hot encode into 3 classes (the model's softmax has 3 outputs)
# presumably labels take values 1, 0, -1 - verify how -1 maps to a class index
encoded_train_labels = np_utils.to_categorical(labels_train.values,3) 

In [None]:
# Distribution of gross returns over the rows whose cleaned label is non-zero.
# df_trades3 is whatever the most recently run label cell produced (train when run in order).
nonzero_label_trades = df_trades3[df_trades3['cleaned_labels'] != 0]
viz_t.plot_trades_distribution(
    nonzero_label_trades,
    bin_size=0.0001,
    metric='gross_returns',
    fig_width=900,
    fig_height=550,
)

In [None]:
# Trade length against gross return for the rows whose cleaned label is non-zero.
nonzero_label_trades = df_trades3[df_trades3['cleaned_labels'] != 0]
viz_t.plot_trades_length_overview(
    nonzero_label_trades,
    x='trade_len',
    y='gross_returns',
)

#### Test labels

In [None]:
# Build the test labels with exactly the same three steps used for the train labels.
# The statements are order-dependent: the generator is inspected after every step,
# which suggests each step updates `.labels` in place - TODO confirm against labelling_class.

# (disabled) constant to add to avoid negative value (problems with log rets)
# mid_px_test_dyn_shifted = mid_px_test_dyn.rename('mid_px_dyn')
# mid_px_test_dyn_shifted = mid_px_test_dyn_shifted + const
mid_px_test = px_ts_test['Mid_Price']
# test labels
test_labels_gen = Labels_Generator(mid_px_test)

# step 1 - raw labels
print('\n##### Step 1 #####')
test_labels_gen.get_raw_labels()
label_insights(test_labels_gen.labels)

# step 2 - first cleaning (forward-fill, thresholds: gross_returns=0.005, trade_len=20)
print('\n##### Step 2 #####')
df_trades2 = test_labels_gen.get_cleaned_labels(fillna_method='ffill', gross_returns=0.005, trade_len=20)
label_insights(test_labels_gen.labels)

# step 3 - second cleaning (fill with 0, thresholds: gross_returns=0.005, trade_len=30)
# NOTE: this overwrites the df_trades2/df_trades3 produced by the train-label cell.
print('\n##### Step 3 #####')
df_trades3 = test_labels_gen.get_cleaned_labels(fillna_value=0, gross_returns=0.005, trade_len=30)#, gross_returns=0.002)
label_insights(test_labels_gen.labels)
# Visual check of the final labels on the first [start:end] rows
viz_t.plot_labels_line(mid_px_test[start:end], 
    test_labels_gen.labels[start:end], 
    title='test Labels', 
    smoothed_signal=test_labels_gen.get_smooth_px()[start:end])

labels_test = test_labels_gen.labels

# get transaction df
strategy_df_test = get_strategy_pnl(mid_px_test, labels_test)

# one-hot encode into 3 classes (the model's softmax has 3 outputs)
# presumably labels take values 1, 0, -1 - verify how -1 maps to a class index
encoded_test_labels = np_utils.to_categorical(labels_test.values,3) 

In [None]:
# Preview of the rows whose cleaned label is non-zero.
has_nonzero_label = df_trades3['cleaned_labels'] != 0
df_trades3.loc[has_nonzero_label].head()

In [None]:
# Distribution of gross returns over the rows whose cleaned label is non-zero.
# df_trades3 is whatever the most recently run label cell produced (test when run in order).
nonzero_label_trades = df_trades3[df_trades3['cleaned_labels'] != 0]
viz_t.plot_trades_distribution(
    nonzero_label_trades,
    bin_size=0.0001,
    metric='gross_returns',
    fig_width=900,
    fig_height=550,
)

In [None]:
# Trade length against gross return for the rows whose cleaned label is non-zero.
nonzero_label_trades = df_trades3[df_trades3['cleaned_labels'] != 0]
viz_t.plot_trades_length_overview(
    nonzero_label_trades,
    x='trade_len',
    y='gross_returns',
)

In [None]:
# Keep the rows that carry a gross return (presumably one row per closed trade - verify
# against get_strategy_pnl) and show the mean trade length per label.
test_has_gross_return = strategy_df_test['gross_returns'].notna()
trades_test = strategy_df_test[test_has_gross_return]
trades_test.groupby('labels')['trade_len'].mean()


In [None]:
# Keep the rows that carry a gross return (presumably one row per closed trade - verify
# against get_strategy_pnl) and show the mean trade length per label.
train_has_gross_return = strategy_df_train['gross_returns'].notna()
trades_train = strategy_df_train[train_has_gross_return]
trades_train.groupby('labels')['trade_len'].mean()

In [None]:
# Overlaid histograms of trade length for train and test trades.
# The traces are assembled by hand, so start from an empty graph_objects figure;
# px.histogram() is meant to build a figure from data, not to serve as an empty canvas.
fig = go.Figure()
fig.add_trace(go.Histogram(x=trades_train['trade_len'].values, name='train', autobinx = False, xbins={'size':5}))
fig.add_trace(go.Histogram(x=trades_test['trade_len'].values, name='test', autobinx = False, xbins={'size':5}))

# The two histograms are drawn on top of another
fig.update_layout(
    barmode='overlay',
    title='<b>Trade length distribution - train vs test</b>',
    xaxis_title='trade_len',
    yaxis_title='count',
)
fig.update_traces(opacity=0.75)
fig.show()

## Visual check

In [None]:
# SAMPLED MID PX CHART - create a func tool function for this
# Raw mid price (primary axis) against the standardized mid price (secondary axis),
# keeping one row out of every `sample_size`.
sample_size = 6 * 5  # alternative: 6*60*24 for daily sampling
dynz_gap = int(roll / sample_size)  # gap left on the x axis between train and test
hourly_mid_line = make_subplots(specs=[[{"secondary_y": True}]])

y_train, y_test = (
    top_ob['Mid_Price'].iloc[::sample_size].values
    for top_ob in (top_ob_train, top_ob_test)
)
y_train_dynz, y_test_dynz = (
    mid_px_dyn.iloc[::sample_size].values
    for mid_px_dyn in (mid_px_train_dyn, mid_px_test_dyn)
)

# Train occupies [0, n_train); test starts dynz_gap positions after the end of train.
n_train = y_train.shape[0]
test_x_start = n_train + dynz_gap
test_x_stop = n_train + y_test.shape[0] + dynz_gap

x_train = np.arange(n_train)
x_test = np.arange(test_x_start, test_x_stop)
x_train_dynz = np.arange(n_train)
x_test_dynz = np.arange(test_x_start, test_x_stop)

# Raw prices on the primary axis
for y_values, x_values, trace_name in ((y_train, x_train, 'mid_train'), (y_test, x_test, 'mid_test')):
    hourly_mid_line.add_trace(go.Scatter(y=y_values, x=x_values, name=trace_name), secondary_y=False)

# Standardized prices, semi-transparent, on the secondary axis
hourly_mid_line.add_trace(
    go.Scatter(
        y=y_train_dynz,
        x=x_train_dynz,
        name='mid_train_dynz',
        marker=dict(color='rgba(44, 130, 201, 0.3)'),
    ),
    secondary_y=True,
)
hourly_mid_line.add_trace(
    go.Scatter(
        y=y_test_dynz,
        x=x_test_dynz,
        name='mid_test_dynz',
        marker=dict(color='rgba(240, 52, 52, 0.3)'),
    ),
    secondary_y=True,
)

hourly_mid_line.update_yaxes(fixedrange= True, secondary_y=True)

hourly_mid_line.update_layout(title='<b>Sampled mid</b>')
hourly_mid_line.show()

## Model Training & Settings

In [None]:
def create_light_deeplob(T, lob_depth, learning_rate=0.1):
    """Build and compile a light, convolution-only DeepLOB-style classifier.

    Three convolution blocks (Conv2D -> LeakyReLU -> BatchNormalization) reduce the
    feature axis from lob_depth * 4 to 1, the result is flattened and fed to a
    3-unit softmax layer. The inception module of the full architecture is not used.

    Args:
        T: number of time steps in each input window.
        lob_depth: number of order book levels; the input is lob_depth * 4 features wide.
        learning_rate: Adam learning rate (defaults to the value previously hard-coded).

    Returns:
        A compiled Keras Model with input shape (T, lob_depth * 4, 1), trained with
        categorical cross-entropy and reporting accuracy.
    """
    ## big lr, big batch size 16 filter size, shuffle
    # NOTE: the line above is parsed, not just read - the first source line of this
    # function containing a double hash becomes the experiment tag. Keep it first.

    input_lmd = Input(shape=(T, lob_depth * 4, 1))

    # Block 1: (1, 2) kernel with stride (1, 2) halves the feature axis to lob_depth * 2.
    conv_first1 = Conv2D(16, (1, 2), strides=(1, 2))(input_lmd)
    conv_first1 = LeakyReLU(alpha=0.01)(conv_first1)
    conv_first1 = BatchNormalization()(conv_first1)

    # Block 2: halves the feature axis again, to lob_depth.
    # Learnable parameters of a Conv2D layer:
    # (kernel_height * kernel_width * input_channels + 1) * filters,
    # i.e. (1 * 2 * 16 + 1) * 16 = 528 for this layer.
    conv_first1 = Conv2D(16, (1, 2), strides=(1, 2))(conv_first1)
    conv_first1 = LeakyReLU(alpha=0.01)(conv_first1)
    conv_first1 = BatchNormalization()(conv_first1)

    # Block 3: a (1, lob_depth) kernel without padding collapses the feature axis to width 1.
    conv_first1 = Conv2D(16, (1, lob_depth))(conv_first1)
    conv_first1 = LeakyReLU(alpha=0.01)(conv_first1)
    conv_first1 = BatchNormalization()(conv_first1)
    print(conv_first1.shape)

    # Flatten (time steps, 1, filters) into one vector per sample; this relies on the
    # feature axis having width 1 after block 3.
    convfirst_output = Reshape((int(conv_first1.shape[1])* int(conv_first1.shape[3]),))(conv_first1)
    print(convfirst_output.shape)

    # Learnable parameters of a Dense layer: (inputs + 1) * units.
    out = Dense(3, activation='softmax')(convfirst_output)
    print(out.shape)

    model = Model(inputs=input_lmd, outputs=out)
    # `learning_rate` replaces the deprecated `lr` keyword, which newer Keras versions reject.
    adam = Adam(learning_rate=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-07)
    model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])

    return model

# Source of the model factory; stored with the experiment so the exact architecture
# can be recovered later.
model_code = inspect.getsource(create_light_deeplob)

# The first source line containing "##" is the hand-written experiment tag; every run
# of non-word characters is replaced by "_" so it can be used in folder and file names.
lines_with_short_desription = list(filter(lambda line: "##" in line, model_code.split('\n')))
short_description = re.sub(r'\W+', '_', lines_with_short_desription[0])

# Build a throwaway model only to print its layer summary.
create_light_deeplob(length, lob_depth).summary()

In [None]:
# Timestamped identifier: run time + data description + model tag.
date_time_now = datetime.now().strftime("%y%m%d-%H%M%S")
experiment_id = f'{date_time_now}-{pair}-{frequency_seconds}s-{lob_depth}l-{length}-{date_start}-{date_end}{short_description}'
experiment_folder = f'{experiments_folder}/{pair}/{experiment_id}'
os.makedirs(experiment_folder, exist_ok=True)
# Overrides the batch size set in the data-configuration cell; this is the value
# stored in config.json and used by the generators created afterwards.
batch_size=256

# Everything needed to reproduce the run, persisted next to the model.
config = dict(
  pair=pair,
  frequency=frequency_seconds,
  lob_depth=lob_depth,
  length=length,
  date_start=date_start,
  date_end=date_end,
  norm_type=norm_type,
  roll=roll,
  k_plus=k_plus,
  k_minus=k_minus,
  alpha=alpha,
  trading_fee=trading_fee,
  min_profit=min_profit,
  batch_size=batch_size,
  input=input_file_name,
  normalized_train_file=normalized_train_file,
  normalized_test_file=normalized_test_file,
  top_ob_train_file=top_ob_train_file,
  top_ob_test_file=top_ob_test_file,
)

with open(f'{experiment_folder}/config.json', 'w') as config_file:
    json.dump(config, config_file, default=str)

with open(f'{experiment_folder}/model_code.py', 'w') as code_file:
    code_file.write(model_code)

# Keras hands the summary over line by line; append the newline it leaves out.
light_deeplob = create_light_deeplob(length, lob_depth)
with open(f'{experiment_folder}/model_summary.txt', 'w') as summary_file:
    def _write_summary_line(summary_line):
        summary_file.write(summary_line + '\n')

    light_deeplob.summary(print_fn=_write_summary_line)


In [None]:
# try to train the model on smoother version of the data

## Training

In [None]:
# Fresh, untrained model for this run.
light_deeplob = create_light_deeplob(length, lob_depth)

model_checkpoint_path = f'{experiment_folder}/{experiment_id}.h5'

keras_callbacks = tf.keras.callbacks

# Learning rate callback: multiply the lr by `factor` when val_loss has not improved
# for `patience` epochs.
lr_callback = keras_callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=20)

# Checkpoint callback: checked every epoch, keeps only the best model so far and
# stores the full model (save_weights_only=False), not just its weights.
cp_callback = keras_callbacks.ModelCheckpoint(
    model_checkpoint_path,
    save_best_only=True,
    save_weights_only=False,
    verbose=2,
    save_freq='epoch',
)

# Early stopping callback: stop after 50 epochs without progress on the validation
# set and roll back to the best weights seen.
es_callback = keras_callbacks.EarlyStopping(patience=50, restore_best_weights=True)

# Tensorboard callback, logging into the experiment folder
tb_callback = keras_callbacks.TensorBoard(experiment_folder)

# Train and Test time series generators: windows of `length` consecutive rows,
# served in shuffled batches.
shared_generator_options = dict(batch_size=batch_size, shuffle=True)

generator_train = TimeseriesGenerator(
    train_depth_dyn,
    encoded_train_labels,
    length,
    **shared_generator_options
)

# to be replaced with validation?
generator_test = TimeseriesGenerator(
    test_depth_dyn,
    encoded_test_labels,
    length,
    **shared_generator_options
)


# This may generate warnings related to saving the state of the optimizer.
# These warnings (and similar warnings throughout this notebook)
# are in place to discourage outdated usage, and can be ignored.

In [None]:
# model_name = '/home/federico/Python_vsc_dir/RL_Trader/Experiments/USDT_BTC/210119-184504-USDT_BTC-10s-10l-300-2020_04_04-2021_01_03_binary_classification_full_inception_lighter_deep_lob_model_with_longer_timesteps_300_/210119-184504-USDT_BTC-10s-10l-300-2020_04_04-2021_01_03_binary_classification_full_inception_lighter_deep_lob_model_with_longer_timesteps_300_.h5'
# loaded_light_deep_lob = tf.keras.models.load_model(model_name)


In [None]:
# Train the model
light_deeplob.fit(generator_train, 
            epochs=200, 
            verbose=0,
            validation_data=generator_test,
            callbacks=[lr_callback, cp_callback, es_callback, tb_callback])

### Model results

In [None]:
# Saved experiment to analyse. The path is rebuilt from the same
# <experiments_folder>/<pair>/<experiment_id>/<experiment_id>.h5 layout the training
# cells write to, instead of an absolute path that only exists on one machine.
saved_experiment_id = '210221-200759-USDT_BTC-10s-10l-100-2020_04_04-2021_01_03_big_lr_big_batch_size_16_filter_size_shuffle'
model_name = f'{experiments_folder}/{pair}/{saved_experiment_id}/{saved_experiment_id}.h5'

In [None]:
# Load the previously saved weights and evaluate model performance
deep_lob_loaded = tf.keras.models.load_model(model_name)
generator_test = TimeseriesGenerator(
    test_depth_dyn,
    encoded_test_labels,
    length,
    batch_size=batch_size,
    shuffle=False
)

def evaluate_model(model):
    """Evaluate `model` on the notebook-level `generator_test` and print its accuracy.

    Args:
        model: object whose `evaluate(generator, verbose=...)` returns a
            (loss, accuracy) pair.

    Returns:
        None; the accuracy is printed as a percentage with two decimals.
    """
    _, accuracy = model.evaluate(generator_test, verbose=2)
    print(f"Restored model, accuracy: {100 * accuracy:5.2f}%")

In [None]:
#evaluate_model(deep_lob_loaded)

In [None]:
# Get predicted labels
predictions_prob = deep_lob_loaded.predict(generator_test, verbose=1)

In [None]:
# Smooth each class probability with a 10-step rolling mean. The first 9 rows are NaN
# because their window is incomplete.
smoothing_window = 10
smoothed_probabilities = pd.DataFrame(predictions_prob).rolling(window=smoothing_window).mean()
predictions_prob_wa = smoothed_probabilities.values

# Element-wise version of back_to_labels (func_tools): class index -> original 1,0,-1 label
map_labels = np.vectorize(back_to_labels)

# Most likely class per row, from the smoothed and from the raw probabilities
smoothed_class_index = np.argmax(predictions_prob_wa, axis=1)
predicted_labels_wa = pd.Series(map_labels(smoothed_class_index), name='predicted_labels_wa')
raw_class_index = np.argmax(predictions_prob, axis=1)
predicted_labels = pd.Series(map_labels(raw_class_index), name='predicted_labels')

In [None]:
# Summarise the labels predicted from the raw (unsmoothed) probabilities
# with the project's label_insights helper.
print('##### Predicted labels #####')
label_insights(predicted_labels)


In [None]:
# Summarise the labels predicted from the smoothed probabilities.
# Despite the "weighted average" wording, the smoothing is the plain 10-step rolling mean
# computed where predictions_prob_wa is built.
print('##### Weighted average predicted labels #####')
label_insights(predicted_labels_wa)

In [None]:
# Sanity check: number of predictions vs. number of test rows.
# presumably the prediction count is shorter by `length`, since the first window
# needs `length` rows of history - TODO confirm on the displayed shapes
predicted_labels.shape, test_depth_dyn.shape

In [None]:
# dangerous assigning offset here, wrap it into a function
offset=100 # offset for plotting
start=0
end=10000
# align prediction offset
index_range = np.arange(offset, predicted_labels.shape[0] + offset)
predicted_labels.index = index_range
buy_prob = pd.Series(predictions_prob[:,1], index=index_range)
sell_prob = pd.Series(predictions_prob[:,2], index=index_range)
zero_prob = pd.Series(predictions_prob[:,0], index=index_range)

buy_prob_wa = pd.Series(predictions_prob_wa[:,1], index=index_range)

plot_labels_line(top_ob_test['Mid_Price'][start:end], 
    test_labels_gen.labels[start:end], 
    title='Train Labels', 
    #smoothed_signal=test_labels_gen.get_smooth_px()[start:end],
    predicted_labels=predicted_labels[start:end],
    buy_prob_labels=buy_prob[start:end],
    predictions_prob_wa=buy_prob_wa[start:end],
    #sell_prob_labels=sell_prob[start:end],
    #dun_px_label=(mid_px_test_dyn_shifted[start:end] - mid_px_test_dyn_shifted.mean())/mid_px_test_dyn_shifted.std()
    )

In [None]:
# fig = go.Figure(data=go.Scatter(x=buy_prob.index, y=buy_prob.values))
# fig.show()

In [None]:
# Inspect the top-of-book test frame (raw prices plus Mid_Price, Spread, merge_index).
top_ob_test

In [None]:
# `mid` is not defined in any active cell (only in commented-out scratch code), so this
# cell failed with a NameError. Build it here: the test mid price indexed by timestamp,
# mirroring the scratch code but without mutating top_ob_test.
top_ob_test_flat = top_ob_test.reset_index()  # 'Datetime' becomes a column on both data paths
mid = pd.Series(
    top_ob_test_flat['Mid_Price'].values,
    index=pd.to_datetime(top_ob_test_flat['Datetime'].values),
    name='Mid_Price',
)
# NOTE(review): assumes intraday_vol_ret expects a datetime-indexed price series - confirm
# against func_tools.
ret_ts, vol_ts = intraday_vol_ret(mid, span=100)

In [None]:
# Second output of intraday_vol_ret, rows 10000 to 54999 (positional slice).
vol_ts.iloc[10000:55000].plot()

In [None]:
# First output of intraday_vol_ret, rows 10000 to 54999 (positional slice).
ret_ts.iloc[10000:55000].plot()

In [None]:
# Inspect the top-of-book test frame again (same frame as displayed a few cells above).
top_ob_test

In [None]:
# top_ob_test.index = pd.to_datetime(top_ob_test['Datetime'])

# mid = top_ob_test['Mid_Price']
# mid = mid[:100000]
# smooth_mid = Labels_Generator(mid).get_smooth_px()

# smooth_mid.index = output.index
# smooth_mid.name = 'Smoothed_mid'

# import labelling_class
# labelling_class.three_barrier_labelling(smooth_mid, h=700, factor=[1.0020, 0.9980])


In [None]:
# Pair every prediction with the price and timestamp of the row it refers to.
# Predictions start `length` rows into the test set (the first window needs `length`
# rows of history), so the same number of rows is dropped here - adjust prediction offset.
top_ob_test_flat = top_ob_test.reset_index()  # 'Datetime' becomes a column on both data paths
px_ts = top_ob_test_flat['Mid_Price'].iloc[length:].reset_index(drop=True)
datetime_ts = top_ob_test_flat['Datetime'].iloc[length:].reset_index(drop=True)
# The plotting cell above may have shifted predicted_labels' index; re-base it to 0 so
# prices and predictions line up row by row whichever way get_strategy_pnl aligns them.
trades_timeseries = get_strategy_pnl(px_ts, predicted_labels.reset_index(drop=True))
df_trades = trades_timeseries.dropna(subset=['gross_returns'])

In [None]:
# Timestamps of the test rows that have a prediction (the initial rows without one are dropped).
datetime_ts

In [None]:
# Prices next to predictions, joined on row position. px_ts is 0-based, so the
# predictions are re-based to 0 as well: the plotting cell above may have shifted their
# index, which would pair each price with the wrong prediction.
pd.merge(px_ts, predicted_labels.reset_index(drop=True), left_index=True, right_index=True)

In [None]:
# Named, 0-based probability series, one per class column of predictions_prob
# (column 1 -> buy, column 2 -> sell, column 0 -> zero / no position).
buy_prob, sell_prob, zero_prob = (
    pd.Series(predictions_prob[:, class_column], name=series_name)
    for class_column, series_name in ((1, 'buy_prob'), (2, 'sell_prob'), (0, 'zero_prob'))
)

In [None]:
px_ts  # test mid prices aligned with the predictions (initial rows of top_ob_test dropped)

In [None]:
### to do:
# need a sliding window to calculate rolling volatility - not sure about using rolling
# seek for patterns in prediction probability
# day vs night - weekday vs weekend - model certainty before long trades vs short trades
# plot original labels and compare visually (could be part of db)
# determine if predictions are naive

In [None]:
# expand with other components of the order book
timeseries_results = pd.concat([datetime_ts, trades_timeseries, buy_prob, sell_prob, zero_prob], axis=1)
timeseries_results['10min_std'] = timeseries_results['log_ret'].rolling(6*10).std()
timeseries_results['1hr_std'] = timeseries_results['log_ret'].rolling(6*60).std()
timeseries_results['1d_std'] = timeseries_results['log_ret'].rolling(6*60*24).std()
# # np.std(top_ob['log_rets'])
# ten_s_std = np.sqrt(np.sum((timeseries_results['log_ret'] - timeseries_results['log_ret'].mean())**2)/(timeseries_results['log_ret'].shape[0]-1)) # -1 unbiased estimator
# one_h_std = ten_s_std * np.sqrt(6*60) # assuming statistic independence of returns

In [None]:
# Log returns of the first 300000 rows (positional slice).
log_returns = timeseries_results['log_ret']
log_returns.iloc[:300000].plot()

In [None]:
# First rows of the combined results frame (timestamps, P&L columns, probabilities, rolling stds).
timeseries_results.head()

In [None]:
%%time
# Hourly vs daily rolling standard deviation of log returns over time.
# Traces are named and the figure is titled so the chart can be read on its own
# (unnamed traces show up as "trace 0" / "trace 1" in the legend).
fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(
    go.Scatter(
            x = timeseries_results['Datetime'],
            y = timeseries_results['1hr_std'],
            name = '1hr_std'
    ),
    secondary_y=False
)

fig.add_trace(
    go.Scatter(
            x = timeseries_results['Datetime'],
            y = timeseries_results['1d_std'],
            name = '1d_std'
    ),
    secondary_y=False
)

fig.update_layout(
    title='<b>Rolling standard deviation of log returns</b>',
    xaxis_title='Datetime',
    yaxis_title='rolling std of log_ret',
)

fig.show()

In [None]:
# Sample standard deviation (ddof=1) of the per-bar log returns.
# Series.std() divides by the number of non-missing values minus one; the previous
# hand-written formula divided by the total row count minus one, so any NaN in
# 'log_ret' biased the estimate downwards.
timeseries_results['log_ret'].std()

In [None]:
# Inspect the per-bar strategy frame returned by get_strategy_pnl for the predicted labels.
trades_timeseries

In [None]:
# Inspect the predicted buy probability (class column 1 of predictions_prob).
buy_prob

In [None]:
# Gross return distribution of the trades generated by the predicted labels.
# The helper lives in visualization_tools (imported as viz_t); the bare name is not
# defined in this notebook and raised a NameError.
viz_t.plot_trades_distribution(df_trades, bin_size=0.0001, metric='gross_returns')

In [None]:
# Trade length against gross return for the trades generated by the predicted labels.
# The helper lives in visualization_tools (imported as viz_t); the bare name is not
# defined in this notebook and raised a NameError.
viz_t.plot_trades_length_overview(df_trades, x='trade_len',  y='gross_returns')