In [54]:
import numpy as np
import pandas as pd
import csv
import re
import os 

In [55]:
PATH = "cleaned_data/"

def read_data(path):
    df_list = []
    files = os.listdir(path)
    files = sorted(files)
    for f in files:
        if re.search(r'\.csv', f) is not None:
            print("Reading {}...".format(f))
            df = pd.read_csv(path + f) 
            df = df.iloc[100:]
            df_list.append(df)
    print("{} files in total".format(len(df_list)))
    return df_list
df_list = read_data(PATH)
print(df_list)
df_list = df_list[:4]

Reading cleaned_with_signals2024-06-03.csv...
Reading cleaned_with_signals2024-06-04.csv...
Reading cleaned_with_signals2024-06-05.csv...
Reading cleaned_with_signals2024-06-06.csv...
Reading cleaned_with_signals2024-06-07.csv...
Reading cleaned_with_signals2024-06-10.csv...
Reading cleaned_with_signals2024-06-11.csv...
Reading cleaned_with_signals2024-06-12.csv...
Reading cleaned_with_signals2024-06-13.csv...
Reading cleaned_with_signals2024-06-14.csv...
Reading cleaned_with_signals2024-06-17.csv...
Reading cleaned_with_signals2024-06-18.csv...
Reading cleaned_with_signals2024-06-19.csv...
Reading cleaned_with_signals2024-06-21.csv...
14 files in total
[                    Timestamps  Timestamps.1  \
100     0 days 00:00:16.368000        16.368   
101     0 days 00:00:16.399000        16.399   
102     0 days 00:00:16.523000        16.523   
103     0 days 00:00:16.554000        16.554   
104     0 days 00:00:16.709000        16.709   
...                        ...           ...   
3

In [56]:
#reformart the data to get feature vectors of length 24
def reformat_data(df, n_level=6):
    X = []
    for i in range(df.shape[0]):
        feature = []
        if i % 30000 == 0:
            print("Processing {} block".format(i // 30000 + 1))
        bids = df.iloc[i].Bids
        asks = df.iloc[i].Asks
        re_parse_num = r'[0-9]*[.]?[0-9]+'
        res_bids = re.findall(re_parse_num, bids)
        res_asks = re.findall(re_parse_num, asks)
        tmp = []
        bids = []
        for r in res_bids:
            tmp.append(int(r))
            if len(tmp) == 2:
                bids.append(tmp)
                tmp = []
        asks = []
        for r in res_asks:
            tmp.append(int(r))
            if len(tmp) == 2:
                asks.append(tmp)
                tmp = []   
        level_bids = len(bids)
        level_asks = len(asks)
        if level_bids < n_level:
            for i in range(n_level - level_bids):
                bids.append([0, 0])
        else:
            bids = bids[:n_level]
        if level_asks < n_level:
            for i in range(n_level - level_asks):
                asks.append([0, 0])
        else:
            asks = asks[:n_level]
        Pa = [a[0] for a in asks] #ask prices 
        Va = [a[1] for a in asks] #ask volumes 
        Pb = [b[0] for b in bids] #bid prices 
        Vb = [b[1] for b in bids] #bid volumes 
        for i in range(n_level):
            feature.append(Pa[i])
            feature.append(Va[i])
            feature.append(Pb[i])
            feature.append(Vb[i])
        X.append(feature)
    return X

In [57]:
import time

n_level = 6 #use 6 levels of the limit order book data
start_time = time.time()
Xs = []
ys = []
for df in df_list:
    X = reformat_data(df, n_level=n_level)
    y = df.iloc[:].signal_10s
    assert len(X) == len(y)
    Xs.append(X)
    ys.append(y)
    print("Time spent: {}".format(time.time() - start_time))

Processing 1 block
Processing 2 block
Processing 3 block
Processing 4 block
Processing 5 block
Processing 6 block
Processing 7 block
Processing 8 block
Processing 9 block
Processing 10 block
Processing 11 block
Time spent: 74.56679391860962
Processing 1 block
Processing 2 block
Processing 3 block
Processing 4 block
Processing 5 block
Processing 6 block
Processing 7 block
Processing 8 block
Processing 9 block
Processing 10 block
Processing 11 block
Time spent: 151.14439988136292
Processing 1 block
Processing 2 block
Processing 3 block
Processing 4 block
Processing 5 block
Processing 6 block
Processing 7 block
Processing 8 block
Processing 9 block
Processing 10 block
Processing 11 block
Time spent: 229.8990342617035
Processing 1 block
Processing 2 block
Processing 3 block
Processing 4 block
Processing 5 block
Processing 6 block
Processing 7 block
Processing 8 block
Processing 9 block
Processing 10 block
Processing 11 block
Time spent: 307.157509803772


In [58]:
from sklearn import preprocessing

alpha = 3 #using data from the previous 3 days to normalise the data of the current day 

def data_normalization(Xs): 
    x_list = []
    normalised_x_list = []
    for i in range(len(Xs)): #convert to np array 
        tmp = Xs[i]
        tmp = np.array(tmp)
        x_list.append(tmp)
    for i in range(len(Xs)):
        if i < alpha:
            continue
        print("Normalising one day")
        scaler = preprocessing.StandardScaler()
        scaler.fit(np.concatenate(x_list[i-alpha:i]))
        tmp = scaler.transform(x_list[i])
        normalised_x_list.append(tmp)
    normalised_X = np.concatenate(normalised_x_list)
    return normalised_X
        
normalised_X = data_normalization(Xs)
y = ys[alpha:]
y = [item for sublist in y for item in sublist]
print(normalised_X.shape)
print(len(y))
assert normalised_X.shape[0] == len(y)

Normalising one day
(326758, 24)
326758


In [59]:
num_states = 10
stride = 1 

def truncating_data(x, y): #make it dividable by num_states
    valid_size = (len(x) // num_states) * num_states
    x = x[:valid_size]
    y = y[:valid_size]
    return x, y

x_data, y_data = truncating_data(normalised_X, y)

def create_data_by_rolling(features, signals):
    indices = np.arange(0, len(features)-stride, stride)
    data = []
    labels = []
    for idx in indices:
        if idx == 0:
            tmp = np.array(features[idx:idx+num_states])
            data.append(tmp)
            labels.append(signals[idx+num_states]) #label is created by getting the signal of the next state after the current 10 states
            continue
        else:
            if idx + num_states < len(features):
                tmp = np.array(features[idx:idx+num_states])
                data.append(tmp)
                labels.append(signals[idx+num_states])
    assert len(data) == len(labels)
    return data, labels

x_data, y_data = create_data_by_rolling(x_data, y_data)
x_data = np.stack(x_data, axis=0)
y_data = np.array(y_data)
print(x_data.shape)
print(y_data.shape)

(326740, 10, 24)
(326740,)


In [60]:
sells = [y for y in y_data if y==0]
neutrals = [y for y in y_data if y==1]
buys = [y for y in y_data if y==2]
print(len(sells))
print(len(neutrals))
print(len(buys))

29975
270391
26374


In [46]:
import pickle

save_filename = "normalised_data.pickle"
data = {'x': x_data, 'y': y_data}
with open(save_filename, 'wb') as f:
    pickle.dump(data, f)