In [4]:
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import dates as mdate
import json, datetime, pandas
# Render floats with two decimals whenever a DataFrame is displayed.
pandas.options.display.float_format = "{:.2f}".format
from pandas.plotting import register_matplotlib_converters
# Register pandas' date/time converters with matplotlib.
register_matplotlib_converters()
import ipywidgets as widgets 
from IPython.display import display, clear_output, Markdown

In [35]:
# Both logs are ';'-separated CSV files.
aba = pandas.read_csv('20210420_aba.log.csv', sep=';')
bbt = pandas.read_csv('20210420_bbt.log.csv', sep=';')

# Inner-join the two logs on (symbol, timestamp == time), keeping only the
# price/qty columns of `bbt`; the redundant join key `time` is dropped afterwards.
merged = aba.merge(
    bbt[['symbol', 'time', 'price', 'qty']],
    how='inner',
    left_on=['symbol', 'timestamp'],
    right_on=['symbol', 'time'],
).drop(columns=['time'])
merged.head()

Unnamed: 0,timestamp,symbol,spread,book_imbalance,best_bid_p,best_ask_p,pressure_ask,pressure_bid,wt_bid_p,wt_ask_p,wt_mid_p,wt_p,mid_p,price,qty
0,1618929363,VALE3,0.03,0.4,106.04,106.07,0.3,0.7,105.93,106.14,106.06,105.99,106.06,106.04,400
1,1618929363,VALE3,0.03,0.4,106.04,106.07,0.3,0.7,105.93,106.14,106.06,105.99,106.06,106.04,200
2,1618929363,VALE3,0.03,0.4,106.04,106.07,0.3,0.7,105.93,106.14,106.06,105.99,106.06,106.04,400
3,1618929363,VALE3,0.03,0.4,106.04,106.07,0.3,0.7,105.93,106.14,106.06,105.99,106.06,106.04,200
4,1618929363,VALE3,0.01,0.43,106.04,106.05,0.29,0.71,105.93,106.14,106.05,105.99,106.05,106.04,400


In [45]:
class LOB:
    """Per-symbol statistics loaded from a ';'-separated CSV, with an interactive viewer.

    The statistics file must contain at least the columns ``timestamp`` (epoch
    seconds) and ``symbol``. When a trades file is given, its ``price`` and
    ``qty`` columns are inner-joined onto the statistics on
    ``(symbol, timestamp == time)``.

    Attributes:
        symbols: The symbols offered in the viewer's dropdown.
        DATA: The loaded (and optionally merged) frame, indexed by ``timestamp``.
        TRADES: The raw trades frame, or ``{}`` when no trades file was given.
        FILTERED_DATA: Result of the most recent "Filter" click in the viewer;
            ``{}`` until the first click.
    """

    # Class-level placeholders; instances replace them. These attributes are
    # meant to be accessed externally.
    DATA = {}
    TRADES = {}
    FILTERED_DATA = {}

    def __init__(self, symbols, csv_file, csv_trades=None):
        """Load the statistics and, optionally, merge the trades onto them.

        Args:
            symbols: Iterable of symbol names to offer in the viewer.
            csv_file: Path or file-like object of the ';'-separated statistics CSV.
            csv_trades: Optional path or file-like object of the ';'-separated
                trades CSV with columns ``symbol``, ``time``, ``price``, ``qty``.
        """
        self.symbols = symbols
        self.DATA = pandas.read_csv(csv_file, sep=';')
        if csv_trades is not None:
            self.TRADES = pandas.read_csv(csv_trades, sep=';')
            self.DATA = pandas.merge(
                left=self.DATA,
                right=self.TRADES[['symbol', 'time', 'price', 'qty']],
                how='inner',
                left_on=['symbol', 'timestamp'],
                right_on=['symbol', 'time'],
            ).drop(columns=['time'])  # `time` duplicates `timestamp` after the join

        self.DATA = self.DATA.set_index('timestamp')

    def available_stats(self):
        """Return the plottable column names of ``DATA`` (everything but ``symbol``).

        ``price`` and ``qty`` appear only when a trades file was merged, so every
        returned name is guaranteed to exist in ``DATA`` exactly once.
        """
        return [col for col in self.DATA.columns if col != 'symbol']

    def filter_stats(self, symbol, start, end, stats, window=50, normalized=False):
        """Select one symbol and time window and smooth it with a rolling mean.

        Args:
            symbol: Value of the ``symbol`` column to keep.
            start: First timestamp to keep (inclusive, epoch seconds).
            end: Last timestamp to keep (inclusive, epoch seconds).
            stats: Names of the ``DATA`` columns to return.
            window: Rolling-mean window length in rows (truncated to ``int``).
            normalized: When true, min-max scale every column to [0, 1]. A
                column that is constant over the window becomes NaN.

        Returns:
            A new DataFrame; rows that do not yet have a full rolling window
            are dropped.
        """
        in_window = (
            (self.DATA['symbol'] == symbol)
            & (self.DATA.index >= start)
            & (self.DATA.index <= end)
        )
        smoothed = (
            self.DATA.loc[in_window, list(stats)]
            .rolling(int(window))
            .mean()
            .dropna()
        )
        if normalized:
            smoothed = smoothed.apply(lambda x: (x - x.min()) / (x.max() - x.min()))
        return smoothed

    def lob_stats_analyser(self):
        """Build the interactive viewer and return it as an ipywidgets ``VBox``.

        The viewer has one checkbox per statistic, a rolling-window slider, a
        time-range slider, a symbol dropdown and two buttons ("Filter" and
        "Filter w/ Norm"). Clicking a button stores the filtered frame in
        ``self.FILTERED_DATA`` and plots it in the output area.
        """
        outt = widgets.Output()
        dropdown_symbol = widgets.Dropdown(options=self.symbols, description='Symbols:')
        stats_checkbox = [
            widgets.Checkbox(description=stats, value=True, indent=False, disabled=False)
            for stats in self.available_stats()
        ]
        stats_checkboxes = widgets.HBox(stats_checkbox)
        rolling = widgets.FloatSlider(
            value=50,
            min=1,
            max=200,
            step=1,
            disabled=False,
            description='Rolling:',
        )
        timerange = widgets.IntRangeSlider(
            value=[0, 0],
            min=0,
            max=0,
            step=5,
            description='Range:',
            disabled=False,
            continuous_update=False,
            orientation='horizontal',
            readout=True,
            readout_format='d',
            layout=dict(width='95%')
        )
        # Widen the upper bound first so that min never exceeds max while the
        # slider is being reconfigured.
        first_ts = int(self.DATA.index.min())
        last_ts = int(self.DATA.index.max())
        timerange.max = last_ts
        timerange.min = first_ts
        timerange.value = [first_ts, last_ts]

        def plot_time_range(start, end, normalized):
            """Filter ``DATA`` with the current widget state and plot the result."""
            with outt:
                clear_output()
                print("Start time:", datetime.datetime.fromtimestamp(start).strftime('%Y-%m-%d %H:%M:%S'),
                      "  End time:", datetime.datetime.fromtimestamp(end).strftime('%Y-%m-%d %H:%M:%S'))
                selected_stats = [checkbox.description for checkbox in stats_checkbox if checkbox.value]

                print(selected_stats)
                if not selected_stats:
                    print("Select at least one statistic to plot.")
                    return
                self.FILTERED_DATA = self.filter_stats(
                    dropdown_symbol.value, start, end, selected_stats,
                    window=rolling.value, normalized=normalized,
                )
                if self.FILTERED_DATA.empty:
                    print("No rows left for this symbol, range and rolling window.")
                    return
                plt.rcParams['figure.figsize'] = [15, 7]
                # DateFormatter picks its time zone up from this rcParam.
                plt.rcParams['timezone'] = 'America/Sao_Paulo'

                fig, ax = plt.subplots()
                # Epoch seconds -> timezone-aware datetimes. This replaces
                # mdate.epoch2num, which newer Matplotlib releases no longer ship.
                x = pandas.to_datetime(self.FILTERED_DATA.index, unit='s', utc=True).to_pydatetime()
                fmt = mdate.DateFormatter('%H:%M:%S')

                ax.xaxis.set_major_formatter(fmt)
                plt.xticks(rotation=25)
                ax.plot(x, self.FILTERED_DATA.to_numpy(), linestyle='solid', marker='None')

                ax.legend(list(self.FILTERED_DATA.columns))
                plt.show()

        def on_btn_filter_range_clicked(b):
            plot_time_range(timerange.value[0], timerange.value[1], False)

        def on_btn_filter_range_norm_clicked(b):
            plot_time_range(timerange.value[0], timerange.value[1], True)

        btn_filter_range = widgets.Button(description='Filter', button_style='danger', disabled=False)
        btn_filter_range.on_click(on_btn_filter_range_clicked)
        btn_filter_range_norm = widgets.Button(description='Filter w/ Norm', button_style='warning', disabled=False)
        btn_filter_range_norm.on_click(on_btn_filter_range_norm_clicked)
        btns = widgets.HBox([dropdown_symbol, btn_filter_range, btn_filter_range_norm])

        return widgets.VBox([stats_checkboxes, rolling, timerange, btns, outt])

In [46]:
# Tickers offered in the viewer's symbol dropdown.
symbols = [
    "VALE3",
    "PETR4",
    "BPAC11",
    "CEAB3",
    "ABEV3",
    "B3SA3",
    "EMBR3",
    "GGBR4",
    "TOTS3",
]

# Statistics log and trades log for the same session.
file = '20210420_aba.log.csv'
trades = '20210420_bbt.log.csv'

lob = LOB(symbols=symbols, csv_file=file, csv_trades=trades)
lob.lob_stats_analyser()

VBox(children=(HBox(children=(Checkbox(value=True, description='spread', indent=False), Checkbox(value=True, d…

In [44]:
# Preview a 50-row rolling mean of the numeric columns. The string `symbol`
# column is excluded explicitly: recent pandas versions raise on non-numeric
# columns in rolling aggregations instead of silently dropping them.
# NOTE(review): rows of different symbols are interleaved in DATA, so this
# average mixes symbols; filter by symbol first if that is not intended.
lob.DATA.select_dtypes(include='number').rolling(50).mean().dropna().head()

Unnamed: 0_level_0,spread,book_imbalance,best_bid_p,best_ask_p,pressure_ask,pressure_bid,wt_bid_p,wt_ask_p,wt_mid_p,wt_p,mid_p,price,qty
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1618929370,0.02,0.42,91.27,91.29,0.29,0.71,91.17,91.36,91.27,91.23,91.28,91.28,172.0
1618929370,0.02,0.42,91.27,91.29,0.29,0.71,91.17,91.36,91.27,91.23,91.28,91.28,166.0
1618929370,0.02,0.42,91.27,91.28,0.29,0.71,91.17,91.36,91.27,91.23,91.28,91.28,168.0
1618929370,0.02,0.42,91.27,91.28,0.29,0.71,91.17,91.35,91.27,91.23,91.28,91.28,176.0
1618929370,0.02,0.42,91.27,91.28,0.29,0.71,91.17,91.35,91.27,91.23,91.27,91.28,174.0


In [7]:
#https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.corr.html
# Pairwise correlation of the columns currently held in lob.FILTERED_DATA.
# NOTE: FILTERED_DATA is only assigned a DataFrame when one of the viewer's
# "Filter" buttons is clicked; before that it is the class-level `{}`
# placeholder, which has no `.corr()`, so this cell depends on widget state.
lob.FILTERED_DATA.corr()

Unnamed: 0,book_imbalance,price
book_imbalance,1.0,-0.72
price,-0.72,1.0


In [8]:
def df_derived_by_shift(df, lag=0, NON_DER=None):
    """Return a copy of ``df`` extended with lagged copies of its columns.

    For every column ``c`` not listed in ``NON_DER`` and every ``i`` in
    ``1..lag``, a column named ``"{c}_{i}"`` holding ``df[c].shift(i)`` is
    appended. The new columns are grouped by source column, after the
    original columns. The first ``i`` rows of each lagged column are NaN.

    Args:
        df: Source frame; it is not modified.
        lag: Number of lags to derive. A falsy or negative value returns a
            plain copy of ``df``.
        NON_DER: Column names to leave without lagged copies (default: none).

    Returns:
        A new DataFrame sharing the index of ``df``.
    """
    df = df.copy()
    if not lag:
        return df
    excluded = () if NON_DER is None else NON_DER

    # Collect every piece first and concatenate once. All pieces share df's
    # index, so no explicit re-alignment is needed (the `join_axes` argument
    # formerly used for this no longer exists in pandas).
    pieces = [df]
    for name in df.columns:
        if name in excluded:
            continue
        for i in range(1, lag + 1):
            pieces.append(df[name].shift(periods=i).rename('{}_{}'.format(name, i)))
    return pandas.concat(pieces, axis=1)
# Derive lags 1..10 of every filtered column, discard the leading rows that
# the shifting left incomplete, then correlate all columns pairwise.
df_new = df_derived_by_shift(lob.FILTERED_DATA, lag=10)
df_new = df_new.dropna()
CORR = df_new.corr()



In [9]:
# Raise the display limits so the whole lagged correlation matrix is rendered.
for option in ('display.max_rows', 'display.max_columns'):
    pandas.set_option(option, 1000)

# A single feature can be inspected with e.g. CORR.book_imbalance.
CORR

Unnamed: 0,book_imbalance,price,book_imbalance_1,book_imbalance_2,book_imbalance_3,book_imbalance_4,book_imbalance_5,book_imbalance_6,book_imbalance_7,book_imbalance_8,book_imbalance_9,book_imbalance_10,price_1,price_2,price_3,price_4,price_5,price_6,price_7,price_8,price_9,price_10
book_imbalance,1.0,-0.72,1.0,1.0,1.0,1.0,1.0,1.0,0.99,0.99,0.99,0.99,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72
price,-0.72,1.0,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
book_imbalance_1,1.0,-0.72,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.99,0.99,0.99,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72
book_imbalance_2,1.0,-0.72,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.99,0.99,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72
book_imbalance_3,1.0,-0.72,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.99,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72
book_imbalance_4,1.0,-0.72,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72
book_imbalance_5,1.0,-0.72,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72
book_imbalance_6,1.0,-0.72,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72
book_imbalance_7,0.99,-0.72,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72
book_imbalance_8,0.99,-0.72,0.99,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72,-0.72


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
colormap = plt.cm.RdBu
fig, ax = plt.subplots(figsize=(20, 15))

# Compute the correlation matrix once and reuse it for the mask and the plot.
corr = df_new.corr()

# Boolean mask hiding the upper triangle (diagonal included): the matrix is
# symmetric, so only the lower triangle carries information.
mask = np.triu(np.ones_like(corr, dtype=bool))

# vmin/vmax are pinned to the full correlation range so that the neutral
# colour of the diverging colormap sits at 0.
svm = sns.heatmap(corr, mask=mask, linewidths=0.1, vmin=-1.0, vmax=1.0,
                  square=True, cmap=colormap, linecolor='white', annot=True, ax=ax)