In [51]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ast

In [52]:
# Date stamp of the data set to clean; edit this to process another day.
name = '2024-06-24'

# No header option is passed, so the first line of the file becomes the column
# name of the single raw-text column.
lob_path = f"UoB_Set01_{name}LOBs.txt"
lob = pd.read_table(lob_path)
# tape = pd.read_csv("UoB_Set01_2024-06-24tapes.csv",
                 # names=['Timestamp', 'Price', 'Quantity'])

In [53]:
# Per-column count of missing values in the raw frame.
lob.isna().sum()

[0.000, Exch0, [['bid', []], ['ask', []]]]    0
dtype: int64

There is no missing data in this LOB (24/06/2024).

First, we extract the timestamp information using regex...

In [54]:
# Capture the first number that is immediately followed by a comma: the leading
# timestamp of each raw line, e.g. "26.102" from "[26.102, Exch0, ...".
# The decimal point is escaped and the fractional part is optional, so an
# integer timestamp cannot cause the capture to run on to the next comma.
pattern = r'(\d+(?:\.\d+)?),'
# expand=False returns a Series (one capture group) rather than a one-column
# DataFrame, which is the natural shape for a single new column.
timestamps = lob.iloc[:, 0].str.extract(pattern, expand=False)
lob['Timestamps'] = timestamps

We then extract the ask data in a similar way. However, we need two regex patterns, whose results are then merged into a single list, to account for the cases where there is only one price in the ask section.

In [55]:
# pattern1 captures an ask list that holds exactly one `[int, int]` entry,
# e.g. "[[104, 2]]". It has a single capture group, so findall yields strings.
pattern1 = r"'ask', (\[\[(?:\d+,\s*\d+)\]\])"
# pattern2 captures an ask list that holds two or more bracketed entries.
# It has two capture groups (the whole list, and the repeated inner entry),
# so findall yields one tuple per match with the whole list at position 0.
pattern2 = r"'ask', (\[(\[(?:\d+,\s*)+\d+\], )+\[(?:\d+,\s*)+\d+\]\])"
# Run both patterns over the raw text column (column 0). Each result is a
# Series of lists; a list is empty where its pattern did not match the row.
asks1 = lob.iloc[:,0].str.findall(pattern1)
asks2 = lob.iloc[:,0].str.findall(pattern2)

In [56]:
# One entry per row: the full multi-entry ask list where the two-or-more
# pattern matched, otherwise the '[]' placeholder. Each match in asks2 is a
# tuple, and its first element is the text of the whole list.
ask_list = [hits[0][0] if len(hits) != 0 else '[]' for hits in asks2]

In [57]:
# Rows still holding the '[]' placeholder take the single-entry match when
# one exists; rows with no match at all keep the placeholder.
for row, single_hits in enumerate(asks1):
    if ask_list[row] == '[]' and single_hits != []:
        ask_list[row] = single_hits[0]

In [58]:
# Attach the merged ask strings as the 'Asks' column.
lob = lob.assign(Asks=ask_list)

We do the same for the bid data...

In [59]:
# pattern1 captures a bid list that holds exactly one `[int, int]` entry,
# e.g. "[[96, 14]]". It has a single capture group, so findall yields strings.
pattern1 = r"'bid', (\[\[(?:\d+,\s*\d+)\]\])"
# pattern2 captures a bid list that holds two or more bracketed entries.
# It has two capture groups (the whole list, and the repeated inner entry),
# so findall yields one tuple per match with the whole list at position 0.
pattern2 = r"'bid', (\[(\[(?:\d+,\s*)+\d+\], )+\[(?:\d+,\s*)+\d+\]\])"
# Run both patterns over the raw text column (column 0). Each result is a
# Series of lists; a list is empty where its pattern did not match the row.
bids1 = lob.iloc[:,0].str.findall(pattern1)
bids2 = lob.iloc[:,0].str.findall(pattern2)

In [60]:
# One entry per row: the full multi-entry bid list where the two-or-more
# pattern matched, otherwise the '[]' placeholder. Each match in bids2 is a
# tuple, and its first element is the text of the whole list.
bid_list = [hits[0][0] if len(hits) != 0 else '[]' for hits in bids2]

In [61]:
# Rows still holding the '[]' placeholder take the single-entry match when
# one exists; rows with no match at all keep the placeholder.
for row, single_hits in enumerate(bids1):
    if bid_list[row] == '[]' and single_hits != []:
        bid_list[row] = single_hits[0]

In [62]:
# Attach the merged bid strings as the 'Bids' column.
lob = lob.assign(Bids=bid_list)

We get rid of the original raw column so that we are left with just the three extracted columns (Timestamps, Asks and Bids). We also add back the first row of the file, which was consumed as the column header on import.

In [63]:
# Spot-check five rows (positions 200-204) across all columns.
lob.iloc[200:205]

Unnamed: 0,"[0.000, Exch0, [['bid', []], ['ask', []]]]",Timestamps,Asks,Bids
200,"[26.102, Exch0, [['bid', [[96, 14], [95, 9], [...",26.102,"[[104, 2], [105, 1], [109, 3], [245, 3], [649,...","[[96, 14], [95, 9], [92, 1], [87, 5], [79, 2]]"
201,"[26.288, Exch0, [['bid', [[96, 14], [95, 9], [...",26.288,"[[104, 2], [105, 1], [109, 3], [547, 3], [649,...","[[96, 14], [95, 9], [92, 1], [87, 5], [79, 2]]"
202,"[26.319, Exch0, [['bid', [[96, 14], [95, 9], [...",26.319,"[[104, 2], [105, 1], [109, 3], [547, 3], [649,...","[[96, 14], [95, 9], [92, 1], [79, 2], [12, 5]]"
203,"[26.381, Exch0, [['bid', [[96, 14], [95, 9], [...",26.381,"[[104, 2], [105, 1], [109, 3], [173, 3], [649,...","[[96, 14], [95, 9], [92, 1], [79, 2], [12, 5]]"
204,"[26.412, Exch0, [['bid', [[96, 14], [95, 9], [...",26.412,"[[104, 2], [105, 1], [109, 3], [173, 3], [457,...","[[96, 14], [95, 9], [92, 1], [79, 2], [12, 5]]"


In [64]:
# Remove the raw text column; it is named after the file's first line, which
# was read as the header on import.
raw_column = "[0.000, Exch0, [['bid', []], ['ask', []]]]"
lob = lob.drop(columns=[raw_column])
# Re-create that first line as an ordinary row: time 0.000, no bids, no asks.
new_row = pd.DataFrame({'Timestamps':'0.000', 'Bids':'[]', 'Asks':'[]'}, index=[0])
# Put it in front of the parsed rows and renumber the index from 0.
lob = pd.concat([new_row, lob], ignore_index=True)

Now, we export the data to a cleaned csv file for re-import to an EDA procedure.

In [65]:
# Write the cleaned frame to a date-stamped CSV (the row index is written too,
# as it is by default).
output_path = f'cleaned_lob_data_{name}.csv'
lob.to_csv(output_path)