In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ast
pd.set_option('display.max_rows', 500)

In [13]:
dates = ['2024-05-28', '2024-05-29', '2024-05-30', '2024-05-31']
cleaned = []

In [14]:
class Cleaner:
    
    def __init__(self, date):
        self.lob = pd.read_table(f"./data/neural_network_test_data/UoB_Set01_{date}LOBs.txt")
        
    def extract_info(self):
        
        # extract timestamps
        pattern = r'(\d+.+?),'
        timestamps = self.lob.iloc[:,0].str.extract(pattern)
        self.lob['Timestamps'] = timestamps
        
        # extract asks
        pattern1 = r"'ask', (\[\[(?:\d+,\s*\d+)\]\])"
        pattern2 = r"'ask', (\[(\[(?:\d+,\s*)+\d+\], )+\[(?:\d+,\s*)+\d+\]\])"
        asks1 = self.lob.iloc[:,0].str.findall(pattern1)
        asks2 = self.lob.iloc[:,0].str.findall(pattern2)
        ask_list = []
        for ask in asks2:
            if len(ask) == 0:
                ask_list.append('[]')
            else:
                ask_list.append(ask[0][0])
        for i in range(len(ask_list)):
            if ask_list[i] == '[]':
                if asks1[i] != []:
                    ask_list[i] = asks1[i][0]
        self.lob['Asks'] = ask_list
        
        # extract bids
        pattern1 = r"'bid', (\[\[(?:\d+,\s*\d+)\]\])"
        pattern2 = r"'bid', (\[(\[(?:\d+,\s*)+\d+\], )+\[(?:\d+,\s*)+\d+\]\])"
        bids1 = self.lob.iloc[:,0].str.findall(pattern1)
        bids2 = self.lob.iloc[:,0].str.findall(pattern2)
        bid_list = []
        for bid in bids2:
            if len(bid) == 0:
                bid_list.append('[]')
            else:
                bid_list.append(bid[0][0])
        for i in range(len(bid_list)):
            if bid_list[i] == '[]':
                if bids1[i] != []:
                    bid_list[i] = bids1[i][0]
        self.lob['Bids'] = bid_list
        
        # cleaning
        self.lob = self.lob.drop(columns="[0.000, Exch0, [['bid', []], ['ask', []]]]")
        new_row = pd.DataFrame({'Timestamps':'0.000', 'Bids':'[]', 'Asks':'[]'}, index=[0])
        self.lob = pd.concat([new_row,self.lob.loc[:]]).reset_index(drop=True)
        
        return self.lob

In [15]:
class apply_functions:
    
    def ba(self, x):
        try:
            return x[0][0]
        except:
            return np.nan
        
    def vol_ba(self, x):
        try:
            return x[0][1]
        except:
            return 0
    
    def dW(self, x, df):
        if df.b[x.index.start] == df.b[x.index.stop-1]:
            return x.iloc[-1] - x.iloc[0]
        if df.b[x.index.stop-1] > df.b[x.index.start]:
            return x.iloc[-1]
        if df.b[x.index.stop-1] < df.b[x.index.start]:
            return -x.iloc[0]
        else:
            return np.NaN

    def dV(self, x, df):
        if df.a[x.index.start] == df.a[x.index.stop-1]:
            return x.iloc[-1] - x.iloc[0]
        if df.a[x.index.stop-1] > df.a[x.index.start]:
            return -x.iloc[0]
        if df.a[x.index.stop-1] < df.a[x.index.start]:
            return x.iloc[-1]
        else:
            return np.NaN
        
    def count_vol(self, x):
        count = 0
        for entry in x:
            count += entry[1]
        return count
    
    def get_signal(self, x, df):
        if df.b[x.index[0]] >= df.a[x.index[-1]] + 2:  # margin of safety = 2 here
            return 2
        if df.a[x.index[0]] <= df.b[x.index[-1]] - 2:  # margin of safety = 2 here also
            return 0
        else:
            return 1

In [16]:
class Processor(apply_functions):
    
    def __init__(self, lob):
        self.lob = lob
        self.lob_r = None
        
    def make_arrays(self):
        self.lob.Bids = self.lob.Bids.apply(lambda x: ast.literal_eval(str(x)))
        self.lob.Asks = self.lob.Asks.apply(lambda x: ast.literal_eval(str(x)))
        self.lob.Timestamps = self.lob.Timestamps.apply(lambda x: float(x))
        self.lob.Bids = self.lob.Bids.apply(lambda x: np.array(x))
        self.lob.Asks = self.lob.Asks.apply(lambda x: np.array(x))
        
    def add_b_a(self):
        self.lob['b'] = self.lob.Bids.apply(self.ba)
        self.lob['a'] = self.lob.Asks.apply(self.ba)
        self.lob['s'] = self.lob['a'].subtract(self.lob.b)
        
    def add_m(self):
        self.lob['m'] = self.lob['b'] + ((self.lob['a'] - self.lob['b']) / 2)
    
    def make_index_delta(self):
        td = pd.to_timedelta(self.lob['Timestamps'],'s')
        self.lob = self.lob.set_index(td)
    
    def get_vol_at_a_b(self):
        self.lob['vol_b'] = self.lob.Bids.apply(self.vol_ba)
        self.lob['vol_a'] = self.lob.Asks.apply(self.vol_ba)
    
    def add_OFI_req(self):
        self.lob = self.lob.reset_index(drop=True)
        self.lob['dW'] = self.lob.vol_b.rolling(window=2).apply(lambda x: self.dW(x, self.lob))
        self.lob['dV'] = self.lob.vol_a.rolling(window=2).apply(lambda x: self.dV(x, self.lob))
        self.lob['e'] = self.lob['dW'] - self.lob['dV']
    
    def add_OFI(self, interval):
        td = pd.to_timedelta(self.lob['Timestamps'],'s')
        self.lob = self.lob.set_index(td)
        self.lob.loc[:, f'OFI{interval}'] = self.lob['e'].rolling(interval).sum()
    
    def add_volume_metrics(self):
        self.lob['all_vol_b'] = self.lob.Bids.apply(lambda x: self.count_vol(x))
        self.lob['all_vol_a'] = self.lob.Asks.apply(lambda x: self.count_vol(x))
        self.lob['all_vol_diff'] = self.lob['all_vol_b'] - self.lob['all_vol_a']
        self.lob['OBI'] = (self.lob['vol_b'] - self.lob['vol_a']) / (self.lob['vol_b'] + self.lob['vol_a'])
    
    def add_signal(self, interval):
        time = interval
        self.lob_r = self.lob[::-1]
        self.lob_r[f'signal_{time}'] = self.lob_r.b.rolling(time).apply(lambda x: self.get_signal(x, self.lob_r))
        self.lob = self.lob.join(self.lob_r[f'signal_{interval}'])

In [17]:
for date in dates:
    x = Cleaner(date)
    cleaned.append(x.extract_info())

In [18]:
len(cleaned)

4

In [19]:
processed = []

In [20]:
for i, df in enumerate(cleaned):
    x = Processor(df)
    x.make_arrays()
    x.add_b_a()
    x.add_m()
    x.make_index_delta()
    x.get_vol_at_a_b()
    x.add_OFI_req()
    x.add_volume_metrics()
    for interval in ['10s']:  # ['5s', '10s', '20s', '30s', '45s', '60s']
        x.add_OFI(interval)
    for interval in ['10s']:  # ['5s', '10s', '20s', '30s', '45s', '60s']
        x.add_signal(interval)
    processed.append(x.lob)
    print('finished', i)
    display(x.lob)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.lob_r[f'signal_{time}'] = self.lob_r.b.rolling(time).apply(lambda x: self.get_signal(x, self.lob_r))


finished 0


Unnamed: 0_level_0,Timestamps,Bids,Asks,b,a,s,m,vol_b,vol_a,dW,dV,e,all_vol_b,all_vol_a,all_vol_diff,OBI,OFI10s,signal_10s
Timestamps,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0 days 00:00:00,0.000,[],[],,,,,0,0,,,,0,0,0,,,1.0
0 days 00:00:02.976000,2.976,"[[77, 2]]",[],77.0,,,,2,0,,,,2,0,2,1.000000,,1.0
0 days 00:00:03.286000,3.286,"[[30, 2]]",[],30.0,,,,2,0,-2.0,,,2,0,2,1.000000,,1.0
0 days 00:00:03.348000,3.348,"[[52, 2]]",[],52.0,,,,2,0,2.0,,,2,0,2,1.000000,,1.0
0 days 00:00:03.410000,3.410,"[[38, 2]]",[],38.0,,,,2,0,-2.0,,,2,0,2,1.000000,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0 days 08:29:59.387000,30599.387,"[[104, 2], [71, 4], [43, 4], [19, 7], [18, 3]]","[[113, 1], [114, 2], [369, 4], [404, 2], [538,...",104.0,113.0,9.0,108.5,2,1,0.0,0.0,0.0,20,15,5,0.333333,-63.0,1.0
0 days 08:29:59.449000,30599.449,"[[104, 2], [71, 4], [43, 4], [19, 7], [18, 3]]","[[112, 2], [113, 1], [369, 4], [404, 2], [538,...",104.0,112.0,8.0,108.0,2,2,0.0,2.0,-2.0,20,15,5,0.000000,-65.0,1.0
0 days 08:29:59.542000,30599.542,"[[104, 2], [71, 4], [43, 4], [19, 7], [18, 3]]","[[111, 4], [112, 2], [113, 1], [369, 4], [404,...",104.0,111.0,7.0,107.5,2,4,0.0,4.0,-4.0,20,19,1,-0.333333,-69.0,1.0
0 days 08:29:59.666000,30599.666,"[[71, 4], [43, 4], [19, 7], [18, 3]]","[[111, 4], [112, 2], [113, 1], [369, 4], [404,...",71.0,111.0,40.0,91.0,4,4,-2.0,0.0,-2.0,18,19,-1,0.000000,-71.0,1.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.lob_r[f'signal_{time}'] = self.lob_r.b.rolling(time).apply(lambda x: self.get_signal(x, self.lob_r))


finished 1


Unnamed: 0_level_0,Timestamps,Bids,Asks,b,a,s,m,vol_b,vol_a,dW,dV,e,all_vol_b,all_vol_a,all_vol_diff,OBI,OFI10s,signal_10s
Timestamps,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0 days 00:00:00,0.000,[],[],,,,,0,0,,,,0,0,0,,,1.0
0 days 00:00:00.248000,0.248,"[[1, 1]]",[],1.0,,,,1,0,,,,1,0,1,1.000000,,1.0
0 days 00:00:00.434000,0.434,"[[2, 1]]",[],2.0,,,,1,0,1.0,,,1,0,1,1.000000,,1.0
0 days 00:00:00.589000,0.589,"[[3, 1]]",[],3.0,,,,1,0,1.0,,,1,0,1,1.000000,,1.0
0 days 00:00:00.837000,0.837,"[[4, 1], [3, 1]]",[],4.0,,,,1,0,1.0,,,2,0,2,1.000000,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0 days 08:29:59.666000,30599.666,"[[112, 1], [111, 1], [109, 5], [108, 8], [106,...","[[127, 5], [128, 4], [129, 5], [131, 5], [331,...",112.0,127.0,15.0,119.5,1,5,0.0,0.0,0.0,38,30,8,-0.666667,-106.0,1.0
0 days 08:29:59.728000,30599.728,"[[112, 1], [111, 1], [108, 8], [106, 1], [105,...","[[127, 5], [128, 4], [129, 5], [131, 5], [331,...",112.0,127.0,15.0,119.5,1,5,0.0,0.0,0.0,33,30,3,-0.666667,-106.0,1.0
0 days 08:29:59.790000,30599.790,"[[114, 1], [112, 1], [111, 1], [108, 8], [106,...","[[127, 5], [128, 4], [129, 5], [131, 5], [331,...",114.0,127.0,13.0,120.5,1,5,1.0,0.0,1.0,34,30,4,-0.666667,-105.0,1.0
0 days 08:29:59.821000,30599.821,"[[114, 1], [112, 1], [111, 1], [108, 8], [106,...","[[126, 5], [128, 4], [129, 5], [131, 5], [331,...",114.0,126.0,12.0,120.0,1,5,0.0,5.0,-5.0,34,30,4,-0.666667,-110.0,1.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.lob_r[f'signal_{time}'] = self.lob_r.b.rolling(time).apply(lambda x: self.get_signal(x, self.lob_r))


finished 2


Unnamed: 0_level_0,Timestamps,Bids,Asks,b,a,s,m,vol_b,vol_a,dW,dV,e,all_vol_b,all_vol_a,all_vol_diff,OBI,OFI10s,signal_10s
Timestamps,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0 days 00:00:00,0.000,[],[],,,,,0,0,,,,0,0,0,,,1.0
0 days 00:00:01.116000,1.116,"[[28, 6]]",[],28.0,,,,6,0,,,,6,0,6,1.000000,,1.0
0 days 00:00:01.550000,1.550,"[[28, 6]]","[[800, 1]]",28.0,800.0,772.0,414.0,6,1,0.0,,,6,1,5,0.714286,,1.0
0 days 00:00:01.736000,1.736,"[[38, 6]]","[[800, 1]]",38.0,800.0,762.0,419.0,6,1,6.0,0.0,6.0,6,1,5,0.714286,6.0,1.0
0 days 00:00:02.046000,2.046,"[[38, 6]]","[[799, 1]]",38.0,799.0,761.0,418.5,6,1,0.0,1.0,-1.0,6,1,5,0.714286,5.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0 days 08:29:59.511000,30599.511,"[[112, 4], [111, 5], [108, 10], [105, 1], [104...","[[144, 5], [145, 5], [147, 3], [151, 2], [152,...",112.0,144.0,32.0,128.0,4,5,0.0,0.0,0.0,33,22,11,-0.111111,-94.0,1.0
0 days 08:29:59.542000,30599.542,"[[112, 4], [111, 5], [108, 10], [105, 1], [104...","[[143, 5], [145, 5], [147, 3], [151, 2], [152,...",112.0,143.0,31.0,127.5,4,5,0.0,5.0,-5.0,33,22,11,-0.111111,-99.0,1.0
0 days 08:29:59.697000,30599.697,"[[112, 4], [111, 5], [108, 10], [105, 1], [104...","[[143, 5], [145, 5], [147, 3], [151, 2], [152,...",112.0,143.0,31.0,127.5,4,5,0.0,0.0,0.0,33,22,11,-0.111111,-99.0,1.0
0 days 08:29:59.728000,30599.728,"[[112, 4], [111, 5], [108, 10], [105, 1], [104...","[[143, 5], [145, 5], [147, 3], [151, 2], [152,...",112.0,143.0,31.0,127.5,4,5,0.0,0.0,0.0,33,22,11,-0.111111,-104.0,1.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.lob_r[f'signal_{time}'] = self.lob_r.b.rolling(time).apply(lambda x: self.get_signal(x, self.lob_r))


finished 3


Unnamed: 0_level_0,Timestamps,Bids,Asks,b,a,s,m,vol_b,vol_a,dW,dV,e,all_vol_b,all_vol_a,all_vol_diff,OBI,OFI10s,signal_10s
Timestamps,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0 days 00:00:00,0.000,[],[],,,,,0,0,,,,0,0,0,,,1.0
0 days 00:00:00.527000,0.527,[],"[[105, 3]]",,105.0,,,0,3,,,,0,3,-3,-1.000000,,1.0
0 days 00:00:03.534000,3.534,"[[89, 10]]","[[105, 3]]",89.0,105.0,16.0,97.0,10,3,,0.0,,10,3,7,0.538462,,1.0
0 days 00:00:03.596000,3.596,"[[89, 10]]","[[100, 4], [105, 3]]",89.0,100.0,11.0,94.5,10,4,0.0,4.0,-4.0,10,7,3,0.428571,-4.0,1.0
0 days 00:00:03.689000,3.689,"[[89, 10]]","[[100, 4], [105, 3], [551, 3]]",89.0,100.0,11.0,94.5,10,4,0.0,0.0,0.0,10,10,0,0.428571,-4.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0 days 08:29:58.767000,30598.767,"[[109, 8], [107, 7], [105, 4], [102, 13], [87,...","[[204, 4], [206, 4], [330, 4], [362, 3]]",109.0,204.0,95.0,156.5,8,4,0.0,4.0,-4.0,39,15,24,0.333333,-108.0,1.0
0 days 08:29:58.829000,30598.829,"[[109, 8], [107, 7], [105, 4], [102, 13], [87,...","[[204, 4], [206, 4], [330, 4], [362, 3]]",109.0,204.0,95.0,156.5,8,4,0.0,0.0,0.0,39,15,24,0.333333,-108.0,1.0
0 days 08:29:59.015000,30599.015,"[[109, 8], [107, 7], [105, 4], [102, 13], [88,...","[[204, 4], [206, 4], [330, 4], [362, 3]]",109.0,204.0,95.0,156.5,8,4,0.0,0.0,0.0,39,15,24,0.333333,-108.0,1.0
0 days 08:29:59.511000,30599.511,"[[109, 8], [107, 7], [105, 4], [102, 13], [88,...","[[204, 4], [206, 4], [330, 4], [463, 3]]",109.0,204.0,95.0,156.5,8,4,0.0,0.0,0.0,39,15,24,0.333333,-107.0,1.0


In [21]:
len(processed)

4

In [22]:
for i, date in enumerate(dates):
    processed[i].to_csv(f'./data/neural_network_test_data/cleaned_with_signals{date}.csv')