In [8]:
# Import the required library
import pandas as pd
import numpy as np
import ast
import os

In [19]:
def sigmoid(x):
    return 1/(1 + np.exp(-(x.astype(float))))

def string_to_nplist(x):
    if pd.isnull(x):
        return []
    else:
        return np.array(ast.literal_eval(x))

def read_data(file_name, col_names, col_list):
    data = pd.read_csv(file_name, names=col_names, delimiter="|")
    for col in col_list:
        data[col] = data[col].apply(lambda x: string_to_nplist(x))
    data["DateTime"] = pd.to_datetime(data["DateTime"])
    data = data.set_index(["DateTime"])
    return data

def clean_lob(data, cols_need, cols_check, weight_mid_price=0.5, num_level=10):
    lst_valid_samples = []
    mid_prices = []
    for ind, row in data.iterrows():
        if len(row[cols_check[0]]) and len(row[cols_check[1]]):
            if (row[cols_check[0]].shape[0] == num_level) and (row[cols_check[1]].shape[0] == num_level):
                lst_valid_samples.append(ind)
                mid_p = weight_mid_price * row[cols_check[0]][0] + (1 - weight_mid_price) * row[cols_check[1]][0]
                mid_prices.append(mid_p)
    ret_data = pd.DataFrame(index=lst_valid_samples, data=data.loc[lst_valid_samples, cols_need])
    ret_data["Midprice"] = mid_prices
    return ret_data

def my_func(a):
    all_items = np.concatenate(a.values)
    return np.mean(all_items), np.std(all_items)

def zscore_normalization(data, cols_need, freq="5H", min_periods=4*12*60):
    z_score_cols, stat_data = [], []
    for col in cols_need:
        rolling_col = data[col].rolling(window=freq, min_periods=min_periods)
        col_lst_mean_std = [my_func(a) for a in rolling_col]
        mu_col = "Mu" + col
        std_col = "Std" + col
        z_score_col = "Zscore" + col
        tmp_data = pd.DataFrame(data=col_lst_mean_std, columns=[mu_col, std_col], index=data.index)
        tmp_data.index = tmp_data.index.shift(1, freq="H")
        idx_intersect = list(set(tmp_data.index).intersection(set(data.index)))
        tmp_data[col] = np.nan
        tmp_data.loc[idx_intersect, col] = data.loc[idx_intersect, col]
        tmp_data = tmp_data.dropna()
        tmp_data[z_score_col] = (tmp_data[col] - tmp_data[mu_col]) / tmp_data[std_col]
        tmp_data = tmp_data[[z_score_col]]
        stat_data.append(tmp_data)
        z_score_cols.append(z_score_col)
    ret_data = pd.concat(stat_data, axis=1)
    return z_score_cols, ret_data

This function cleans a given Pandas DataFrame containing limit order book (LOB) data. The input parameters are data, which is the Pandas DataFrame, cols_need, which are the columns of the DataFrame to keep, and cols_check, which are the columns to check for the specified number of levels (num_level). The function iterates through the rows of the DataFrame, checks if the specified columns have data and the correct number of levels, calculates the mid price for each valid row, and returns a new DataFrame with the valid samples and the specified columns, including the calculated mid prices.

In [9]:
def clean_lob(data, cols_need, cols_check, weight_mid_price=0.5, num_level=10):
    # Initialize lists to store valid sample indices and mid prices
    lst_valid_samples = []
    mid_prices = []

    # Iterate through the rows of the input DataFrame
    for ind, row in data.iterrows():
        # Check if both columns in cols_check have data
        if len(row[cols_check[0]]) and len(row[cols_check[1]]):
            # Check if both columns have the specified number of levels
            if (row[cols_check[0]].shape[0] == num_level) and (row[cols_check[1]].shape[0] == num_level):
                # Add the index to the list of valid samples
                lst_valid_samples.append(ind)

                # Calculate the mid price for the current row
                mid_p = weight_mid_price * row[cols_check[0]][0] + (1 - weight_mid_price) * row[cols_check[1]][0]
                mid_prices.append(mid_p)

    # Create a new DataFrame with the valid samples and the specified columns
    ret_data = pd.DataFrame(index=lst_valid_samples, data=data.loc[lst_valid_samples, cols_need])

    # Add the mid prices to the new DataFrame
    ret_data["Midprice"] = mid_prices

    # Return the cleaned DataFrame
    return ret_data

This function normalizes specified columns of a given Pandas DataFrame using the z-score method. The input parameters are data, which is the Pandas DataFrame, and cols_need, which are the columns of the DataFrame that need normalization. The function calculates rolling statistics for each column, computes the z-score for each value, and returns a new DataFrame with the z-score columns along with a list of the z-score column names.

In [10]:
def my_func(a):
    all_items = np.concatenate(a.values)
    return np.mean(all_items), np.std(all_items)

In [11]:
def zscore_normalization(data, cols_need, freq="5H", min_periods=4*12*60):
    # Initialize lists to store z-score column names and statistical data
    z_score_cols, stat_data = [], []

    # Iterate through the columns that need normalization
    for col in cols_need:
        # Calculate rolling statistics for the current column
        rolling_col = data[col].rolling(window=freq, min_periods=min_periods)
        col_lst_mean_std = [my_func(a) for a in rolling_col]

        # Create column names for mean, standard deviation, and z-score
        mu_col = "Mu" + col
        std_col = "Std" + col
        z_score_col = "Zscore" + col

        # Create a temporary DataFrame with mean and standard deviation columns
        tmp_data = pd.DataFrame(data=col_lst_mean_std, columns=[mu_col, std_col], index=data.index)
        tmp_data.index = tmp_data.index.shift(1, freq="H")

        # Intersect the indices of the original data and the temporary DataFrame
        idx_intersect = list(set(tmp_data.index).intersection(set(data.index)))

        # Add the original column to the temporary DataFrame and fill with NaN values
        tmp_data[col] = np.nan
        tmp_data.loc[idx_intersect, col] = data.loc[idx_intersect, col]

        # Drop rows with NaN values
        tmp_data = tmp_data.dropna()

        # Calculate the z-score for each value in the column
        tmp_data[z_score_col] = (tmp_data[col] - tmp_data[mu_col]) / tmp_data[std_col]

        # Keep only the z-score column in the temporary DataFrame
        tmp_data = tmp_data[[z_score_col]]

        # Append the temporary DataFrame to the list of statistical data
        stat_data.append(tmp_data)

        # Append the z-score column name to the list of z-score column names
        z_score_cols.append(z_score_col)

    # Concatenate the statistical data DataFrames along the columns axis
    ret_data = pd.concat(stat_data, axis=1)

    # Return the list of z-score column names and the concatenated DataFrame
    return z_score_cols, ret_data

# Read data

In [23]:
lob_path = "./"
file_name = os.path.join(lob_path, "LOB_NQU22-CME_2_1_10_10level.lob")

cols_LOB = ["DateTime","Open","High","Low","Last","Volume","NumTrades","BidVolume","AskVolume","SumBid","SumAsk","BidPrices","BidVolumes","AskPrices","AskVolumes"]

col_list_LOB = ["BidPrices","BidVolumes","AskPrices","AskVolumes"]

In [21]:
data = read_data(file_name, cols_LOB, col_list_LOB)
data.head().T

DateTime,2022-07-07 00:00:00,2022-07-07 00:00:10,2022-07-07 00:00:20,2022-07-07 00:00:30,2022-07-07 00:00:40
Open,11897.5,11892.25,11879.5,11876.25,11879.5
High,11898.25,11892.75,11880.5,11882.5,11882.25
Low,11892.75,11878.25,11876.25,11876.0,11878.75
Last,11893.0,11880.0,11876.75,11880.25,11882.25
Volume,53.0,231.0,85.0,89.0,22.0
NumTrades,51.0,211.0,85.0,89.0,22.0
BidVolume,34.0,166.0,38.0,36.0,8.0
AskVolume,19.0,65.0,47.0,53.0,14.0
SumBid,29.0,22.0,34.0,14.0,14.0
SumAsk,20.0,18.0,24.0,19.0,26.0


In [22]:
z_score_cols, ret_data = zscore_normalization(data, col_list_LOB)
ret_data.head().T

DateTime,2022-07-07 01:00:00,2022-07-07 01:00:10,2022-07-07 01:00:20,2022-07-07 01:00:30,2022-07-07 01:00:40
ZscoreBidPrices,"[-10.966892325208963, -11.31504763712036, -11....","[-0.8219228548918058, -0.8601518248867736, -0....","[-1.0449982450788726, -1.08042191440358, -1.11...","[-2.176954394281747, -2.2168985116080178, -2.2...","[-2.9339934066987317, -2.9785153096835373, -3...."
ZscoreBidVolumes,"[-0.8421601936380788, -0.8421601936380788, -0....","[-0.9185437189662375, -0.9185437189662375, -0....","[-0.9838699100999077, -0.4472135954999581, -0....","[-0.8446690915113089, 0.30064493087690647, -0....","[-0.15904595068400093, 0.4526692442544647, 0.4..."
ZscoreAskPrices,"[-14.100290132411525, -13.75213482050013, -13....","[-1.1659835848465152, -1.1277546148515476, -1....","[-1.3638112690012405, -1.3283875996765329, -1....","[-2.5364514502181823, -2.4965073328919116, -2....","[-3.356760412096845, -3.312181920568336, -3.26..."
ZscoreAskVolumes,"[1.2909944487358056, -1.2909944487358056, 0.0,...","[0.13018891098082397, 0.13018891098082397, 1.4...","[-0.06274558051381605, -1.0039292882210538, -1...","[-0.022593461946228327, 1.7848834937520444, -0...","[-0.11665046633795345, -0.11665046633795345, 0..."


In [23]:
cols_check = [z_score_cols[0], z_score_cols[2]]
data_cleaned = clean_lob(ret_data, z_score_cols, cols_check)
data_cleaned.head().T

Unnamed: 0,2022-07-07 01:00:00,2022-07-07 01:00:10,2022-07-07 01:00:20,2022-07-07 01:00:30,2022-07-07 01:00:40
ZscoreBidPrices,"[-10.966892325208963, -11.31504763712036, -11....","[-0.8219228548918058, -0.8601518248867736, -0....","[-1.0449982450788726, -1.08042191440358, -1.11...","[-2.176954394281747, -2.2168985116080178, -2.2...","[-2.9339934066987317, -2.9785153096835373, -3...."
ZscoreBidVolumes,"[-0.8421601936380788, -0.8421601936380788, -0....","[-0.9185437189662375, -0.9185437189662375, -0....","[-0.9838699100999077, -0.4472135954999581, -0....","[-0.8446690915113089, 0.30064493087690647, -0....","[-0.15904595068400093, 0.4526692442544647, 0.4..."
ZscoreAskPrices,"[-14.100290132411525, -13.75213482050013, -13....","[-1.1659835848465152, -1.1277546148515476, -1....","[-1.3638112690012405, -1.3283875996765329, -1....","[-2.5364514502181823, -2.4965073328919116, -2....","[-3.356760412096845, -3.312181920568336, -3.26..."
ZscoreAskVolumes,"[1.2909944487358056, -1.2909944487358056, 0.0,...","[0.13018891098082397, 0.13018891098082397, 1.4...","[-0.06274558051381605, -1.0039292882210538, -1...","[-0.022593461946228327, 1.7848834937520444, -0...","[-0.11665046633795345, -0.11665046633795345, 0..."
Midprice,-12.533591,-0.993953,-1.204405,-2.356703,-3.145377


In [24]:
def my_con(x):
    a = np.concatenate(x.values)
    lst2 = []
    for i in range(10):
        lst2.extend(a[[i, i+10, i+20, i+30]])
    return lst2

In [25]:
data_cleaned["ConcatLOB"] = data_cleaned[z_score_cols].apply(lambda x: my_con(x), axis=1)
data_cleaned.head().T

Unnamed: 0,2022-07-07 01:00:00,2022-07-07 01:00:10,2022-07-07 01:00:20,2022-07-07 01:00:30,2022-07-07 01:00:40
ZscoreBidPrices,"[-10.966892325208963, -11.31504763712036, -11....","[-0.8219228548918058, -0.8601518248867736, -0....","[-1.0449982450788726, -1.08042191440358, -1.11...","[-2.176954394281747, -2.2168985116080178, -2.2...","[-2.9339934066987317, -2.9785153096835373, -3...."
ZscoreBidVolumes,"[-0.8421601936380788, -0.8421601936380788, -0....","[-0.9185437189662375, -0.9185437189662375, -0....","[-0.9838699100999077, -0.4472135954999581, -0....","[-0.8446690915113089, 0.30064493087690647, -0....","[-0.15904595068400093, 0.4526692442544647, 0.4..."
ZscoreAskPrices,"[-14.100290132411525, -13.75213482050013, -13....","[-1.1659835848465152, -1.1277546148515476, -1....","[-1.3638112690012405, -1.3283875996765329, -1....","[-2.5364514502181823, -2.4965073328919116, -2....","[-3.356760412096845, -3.312181920568336, -3.26..."
ZscoreAskVolumes,"[1.2909944487358056, -1.2909944487358056, 0.0,...","[0.13018891098082397, 0.13018891098082397, 1.4...","[-0.06274558051381605, -1.0039292882210538, -1...","[-0.022593461946228327, 1.7848834937520444, -0...","[-0.11665046633795345, -0.11665046633795345, 0..."
Midprice,-12.533591,-0.993953,-1.204405,-2.356703,-3.145377
ConcatLOB,"[-10.966892325208963, -0.8421601936380788, -14...","[-0.8219228548918058, -0.9185437189662375, -1....","[-1.0449982450788726, -0.9838699100999077, -1....","[-2.176954394281747, -0.8446690915113089, -2.5...","[-2.9339934066987317, -0.15904595068400093, -3..."


In [28]:
file_name_cleaned = os.path.join(lob_path, "LOB_cleaned.pkl")

data_cleaned = data_cleaned.sort_index()
data_cleaned.to_pickle(file_name_cleaned)

In [10]:
file_name_cleaned = os.path.join(".", "LOB_cleaned.pkl")

data_cleaned = pd.read_pickle(file_name_cleaned)

In [11]:
data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 14739 entries, 2022-07-07 01:00:00 to 2022-07-08 20:59:40
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ZscoreBidPrices   14739 non-null  object 
 1   ZscoreBidVolumes  14739 non-null  object 
 2   ZscoreAskPrices   14739 non-null  object 
 3   ZscoreAskVolumes  14739 non-null  object 
 4   Midprice          14739 non-null  float64
 5   ConcatLOB         14739 non-null  object 
dtypes: float64(1), object(5)
memory usage: 806.0+ KB


In [12]:
prediction_horizon = 60
data_cleaned["m_minus"] = data_cleaned["Midprice"].rolling(window=prediction_horizon).mean()
data_cleaned["m_plus"] = data_cleaned["m_minus"].shift(-prediction_horizon)
data_cleaned.tail().T

Unnamed: 0,2022-07-08 20:59:00,2022-07-08 20:59:10,2022-07-08 20:59:20,2022-07-08 20:59:30,2022-07-08 20:59:40
ZscoreBidPrices,"[0.4246719961798925, 0.4183511137719357, 0.412...","[0.41182293079465027, 0.40550226575290893, 0.3...","[0.3989429045882686, 0.3926224607361459, 0.386...","[0.45566412342846047, 0.44934384307390607, 0.4...","[0.5062004432275129, 0.4998801796558449, 0.493..."
ZscoreBidVolumes,"[-1.5604649686783543, -1.1268351819363955, -1....","[-1.7770194197331557, -1.3435119377897147, -0....","[-1.7752755445132846, -1.7752755445132846, -1....","[-1.5591648442688149, -1.7755385447671626, -1....","[-1.7756053747620826, -1.7756053747620826, -1...."
ZscoreAskPrices,"[0.36769219177335283, 0.37401294380460126, 0.3...","[0.35484535684981733, 0.3611658916311704, 0.36...","[0.3482879103924529, 0.35460822411755294, 0.36...","[0.41132942747657086, 0.41764957779461176, 0.4...","[0.46818495977477304, 0.4745050933240536, 0.48..."
ZscoreAskVolumes,"[-1.452628184058504, -1.452628184058504, -1.64...","[-0.6830091235846882, -1.645355249244121, -1.6...","[-1.6459119131401883, 0.471277897559481, -1.45...","[-1.6459014455441336, -1.453472123323319, -1.0...","[-1.6462583377058224, -1.6462583377058224, -1...."
Midprice,0.396182,0.383334,0.373615,0.433497,0.487193
ConcatLOB,"[0.4246719961798925, -1.5604649686783543, 0.36...","[0.41182293079465027, -1.7770194197331557, 0.3...","[0.3989429045882686, -1.7752755445132846, 0.34...","[0.45566412342846047, -1.5591648442688149, 0.4...","[0.5062004432275129, -1.7756053747620826, 0.46..."
m_minus,0.444813,0.443045,0.441275,0.440296,0.440584
m_plus,,,,,


In [13]:
data_cleaned.head().T

Unnamed: 0,2022-07-07 01:00:00,2022-07-07 01:00:10,2022-07-07 01:00:20,2022-07-07 01:00:30,2022-07-07 01:00:40
ZscoreBidPrices,"[-10.966892325208963, -11.31504763712036, -11....","[-0.8219228548918058, -0.8601518248867736, -0....","[-1.0449982450788726, -1.08042191440358, -1.11...","[-2.176954394281747, -2.2168985116080178, -2.2...","[-2.9339934066987317, -2.9785153096835373, -3...."
ZscoreBidVolumes,"[-0.8421601936380788, -0.8421601936380788, -0....","[-0.9185437189662375, -0.9185437189662375, -0....","[-0.9838699100999077, -0.4472135954999581, -0....","[-0.8446690915113089, 0.30064493087690647, -0....","[-0.15904595068400093, 0.4526692442544647, 0.4..."
ZscoreAskPrices,"[-14.100290132411525, -13.75213482050013, -13....","[-1.1659835848465152, -1.1277546148515476, -1....","[-1.3638112690012405, -1.3283875996765329, -1....","[-2.5364514502181823, -2.4965073328919116, -2....","[-3.356760412096845, -3.312181920568336, -3.26..."
ZscoreAskVolumes,"[1.2909944487358056, -1.2909944487358056, 0.0,...","[0.13018891098082397, 0.13018891098082397, 1.4...","[-0.06274558051381605, -1.0039292882210538, -1...","[-0.022593461946228327, 1.7848834937520444, -0...","[-0.11665046633795345, -0.11665046633795345, 0..."
Midprice,-12.533591,-0.993953,-1.204405,-2.356703,-3.145377
ConcatLOB,"[-10.966892325208963, -0.8421601936380788, -14...","[-0.8219228548918058, -0.9185437189662375, -1....","[-1.0449982450788726, -0.9838699100999077, -1....","[-2.176954394281747, -0.8446690915113089, -2.5...","[-2.9339934066987317, -0.15904595068400093, -3..."
m_minus,,,,,
m_plus,-3.508181,-3.519689,-3.52905,-3.52637,-3.520625


In [14]:
data_cleaned = data_cleaned.dropna()
data_cleaned.shape

(14620, 8)

In [15]:
data_cleaned["label_3"] = (data_cleaned["m_plus"] - data_cleaned["Midprice"]) / data_cleaned["Midprice"]
data_cleaned["label_4"] = (data_cleaned["m_plus"] - data_cleaned["m_minus"]) / data_cleaned["m_minus"]
data_cleaned.tail().T

Unnamed: 0,2022-07-08 20:49:00,2022-07-08 20:49:10,2022-07-08 20:49:20,2022-07-08 20:49:30,2022-07-08 20:49:40
ZscoreBidPrices,"[0.5403236432116256, 0.533993827214457, 0.5276...","[0.5148131429673626, 0.5084834731767804, 0.502...","[0.508308988326925, 0.5019794332226488, 0.4956...","[0.5144566778223918, 0.5081272836111185, 0.501...","[0.4952662532499047, 0.48893701699421716, 0.48..."
ZscoreBidVolumes,"[-1.780151438736994, -1.780151438736994, -1.56...","[-1.7804654335673455, -1.5631647243093092, -1....","[-1.7810347399100017, -1.7810347399100017, -1....","[-1.7804758105257685, -1.5631590998549802, -1....","[-1.780657235223607, -1.563322801827044, -0.91..."
ZscoreAskPrices,"[0.48326138698953075, 0.48959107836445076, 0.4...","[0.46408225939921693, 0.4704118046719831, 0.47...","[0.4512497277373187, 0.4575791584188261, 0.463...","[0.47005729328714085, 0.4763865631752837, 0.48...","[0.4445392490529601, 0.4508683610962364, 0.457..."
ZscoreAskVolumes,"[-1.5001859900945707, -1.6996184006506394, -1....","[-1.6994233491749975, -1.5000112259854723, -1....","[-1.1005530342258671, -1.2999085199465636, -1....","[-0.5027540704442708, -0.9015190862192276, -1....","[-1.6985852500516185, -1.4992402818304016, -1...."
Midprice,0.511793,0.489448,0.479779,0.492257,0.469903
ConcatLOB,"[0.5403236432116256, -1.780151438736994, 0.483...","[0.5148131429673626, -1.7804654335673455, 0.46...","[0.508308988326925, -1.7810347399100017, 0.451...","[0.5144566778223918, -1.7804758105257685, 0.47...","[0.4952662532499047, -1.780657235223607, 0.444..."
m_minus,0.470042,0.470446,0.470528,0.470974,0.471205
m_plus,0.444813,0.443045,0.441275,0.440296,0.440584
label_3,-0.130872,-0.094807,-0.080254,-0.105557,-0.062393
label_4,-0.053673,-0.058247,-0.06217,-0.065137,-0.064985


In [16]:
alpha_threshold = 0.1
data_cleaned["alpha_3"] = 0
data_cleaned.loc[data_cleaned["label_3"] > alpha_threshold, "alpha_3"] = 1
data_cleaned.loc[data_cleaned["label_3"] < -alpha_threshold, "alpha_3"] = -1

data_cleaned["alpha_4"] = 0
data_cleaned.loc[data_cleaned["label_4"] > alpha_threshold, "alpha_4"] = 1
data_cleaned.loc[data_cleaned["label_4"] < -alpha_threshold, "alpha_4"] = -1

data_cleaned.tail().T

Unnamed: 0,2022-07-08 20:49:00,2022-07-08 20:49:10,2022-07-08 20:49:20,2022-07-08 20:49:30,2022-07-08 20:49:40
ZscoreBidPrices,"[0.5403236432116256, 0.533993827214457, 0.5276...","[0.5148131429673626, 0.5084834731767804, 0.502...","[0.508308988326925, 0.5019794332226488, 0.4956...","[0.5144566778223918, 0.5081272836111185, 0.501...","[0.4952662532499047, 0.48893701699421716, 0.48..."
ZscoreBidVolumes,"[-1.780151438736994, -1.780151438736994, -1.56...","[-1.7804654335673455, -1.5631647243093092, -1....","[-1.7810347399100017, -1.7810347399100017, -1....","[-1.7804758105257685, -1.5631590998549802, -1....","[-1.780657235223607, -1.563322801827044, -0.91..."
ZscoreAskPrices,"[0.48326138698953075, 0.48959107836445076, 0.4...","[0.46408225939921693, 0.4704118046719831, 0.47...","[0.4512497277373187, 0.4575791584188261, 0.463...","[0.47005729328714085, 0.4763865631752837, 0.48...","[0.4445392490529601, 0.4508683610962364, 0.457..."
ZscoreAskVolumes,"[-1.5001859900945707, -1.6996184006506394, -1....","[-1.6994233491749975, -1.5000112259854723, -1....","[-1.1005530342258671, -1.2999085199465636, -1....","[-0.5027540704442708, -0.9015190862192276, -1....","[-1.6985852500516185, -1.4992402818304016, -1...."
Midprice,0.511793,0.489448,0.479779,0.492257,0.469903
ConcatLOB,"[0.5403236432116256, -1.780151438736994, 0.483...","[0.5148131429673626, -1.7804654335673455, 0.46...","[0.508308988326925, -1.7810347399100017, 0.451...","[0.5144566778223918, -1.7804758105257685, 0.47...","[0.4952662532499047, -1.780657235223607, 0.444..."
m_minus,0.470042,0.470446,0.470528,0.470974,0.471205
m_plus,0.444813,0.443045,0.441275,0.440296,0.440584
label_3,-0.130872,-0.094807,-0.080254,-0.105557,-0.062393
label_4,-0.053673,-0.058247,-0.06217,-0.065137,-0.064985


In [17]:
print(data_cleaned["alpha_3"].value_counts())
print(data_cleaned["alpha_4"].value_counts())

alpha_3
-1    5731
 0    5056
 1    3833
Name: count, dtype: int64
alpha_4
-1    6811
 1    4133
 0    3676
Name: count, dtype: int64


In [18]:
data_cleaned["ConcatLOB"].head()

2022-07-07 01:10:00    [-1.3815674160523173, -0.07108593677517168, -1...
2022-07-07 01:10:10    [-1.5729477515229975, 1.9592763064403123, -1.9...
2022-07-07 01:10:20    [-1.4993965021077342, -0.7637205558836064, -1....
2022-07-07 01:10:30    [-1.5814297157199044, -0.12154525233627105, -1...
2022-07-07 01:10:50    [-2.0102524119030707, -0.7442558354537866, -2....
Name: ConcatLOB, dtype: object

In [19]:
def ex_X(r):
    if r.shape[0] < 100:
        return np.nan
    return np.vstack(r.values)

X_data = []
for r in data_cleaned["ConcatLOB"].rolling(window=100):
    X_data.append(ex_X(r))
data_cleaned["X"] = X_data
data_cleaned.head().T

Unnamed: 0,2022-07-07 01:10:00,2022-07-07 01:10:10,2022-07-07 01:10:20,2022-07-07 01:10:30,2022-07-07 01:10:50
ZscoreBidPrices,"[-1.3815674160523173, -1.4202259538409336, -1....","[-1.5729477515229975, -1.611214481204941, -1.6...","[-1.4993965021077342, -1.5373482817914577, -1....","[-1.5814297157199044, -1.6191318955694383, -1....","[-2.0102524119030707, -2.0480358831220955, -2...."
ZscoreBidVolumes,"[-0.07108593677517168, -0.8060422322472848, -0...","[1.9592763064403123, 0.5889984609885429, -0.78...","[-0.7637205558836064, -0.7637205558836064, -0....","[-0.12154525233627105, -0.7438569442979788, -0...","[-0.7442558354537866, -0.7442558354537866, -0...."
ZscoreAskPrices,"[-1.753343148734044, -1.7144578798473746, -1.6...","[-1.9094191740256414, -1.8708450492978508, -1....","[-1.8695077529964945, -1.8312740284586677, -1....","[-1.9506816582012707, -1.9126890756921773, -1....","[-2.3816408736701242, -2.3435899664655193, -2...."
ZscoreAskVolumes,"[0.0013298283873445038, -0.8098654878928129, -...","[-0.8138655439755674, 0.0026338690743545486, 0...","[0.0025989460292240958, -0.8160690531763689, 0...","[-0.8207826816681233, -0.8207826816681233, -0....","[-0.8165538560234376, 0.007491319780031493, -0..."
Midprice,-1.567455,-1.741183,-1.684452,-1.766056,-2.195947
ConcatLOB,"[-1.3815674160523173, -0.07108593677517168, -1...","[-1.5729477515229975, 1.9592763064403123, -1.9...","[-1.4993965021077342, -0.7637205558836064, -1....","[-1.5814297157199044, -0.12154525233627105, -1...","[-2.0102524119030707, -0.7442558354537866, -2...."
m_minus,-3.688054,-3.508181,-3.519689,-3.52905,-3.52637
m_plus,-2.867135,-2.90311,-2.931681,-2.951188,-2.951815
label_3,0.829165,0.66732,0.740436,0.671062,0.344211
label_4,-0.222589,-0.172474,-0.167062,-0.163744,-0.162931


In [20]:
data_cleaned.tail().T

Unnamed: 0,2022-07-08 20:49:00,2022-07-08 20:49:10,2022-07-08 20:49:20,2022-07-08 20:49:30,2022-07-08 20:49:40
ZscoreBidPrices,"[0.5403236432116256, 0.533993827214457, 0.5276...","[0.5148131429673626, 0.5084834731767804, 0.502...","[0.508308988326925, 0.5019794332226488, 0.4956...","[0.5144566778223918, 0.5081272836111185, 0.501...","[0.4952662532499047, 0.48893701699421716, 0.48..."
ZscoreBidVolumes,"[-1.780151438736994, -1.780151438736994, -1.56...","[-1.7804654335673455, -1.5631647243093092, -1....","[-1.7810347399100017, -1.7810347399100017, -1....","[-1.7804758105257685, -1.5631590998549802, -1....","[-1.780657235223607, -1.563322801827044, -0.91..."
ZscoreAskPrices,"[0.48326138698953075, 0.48959107836445076, 0.4...","[0.46408225939921693, 0.4704118046719831, 0.47...","[0.4512497277373187, 0.4575791584188261, 0.463...","[0.47005729328714085, 0.4763865631752837, 0.48...","[0.4445392490529601, 0.4508683610962364, 0.457..."
ZscoreAskVolumes,"[-1.5001859900945707, -1.6996184006506394, -1....","[-1.6994233491749975, -1.5000112259854723, -1....","[-1.1005530342258671, -1.2999085199465636, -1....","[-0.5027540704442708, -0.9015190862192276, -1....","[-1.6985852500516185, -1.4992402818304016, -1...."
Midprice,0.511793,0.489448,0.479779,0.492257,0.469903
ConcatLOB,"[0.5403236432116256, -1.780151438736994, 0.483...","[0.5148131429673626, -1.7804654335673455, 0.46...","[0.508308988326925, -1.7810347399100017, 0.451...","[0.5144566778223918, -1.7804758105257685, 0.47...","[0.4952662532499047, -1.780657235223607, 0.444..."
m_minus,0.470042,0.470446,0.470528,0.470974,0.471205
m_plus,0.444813,0.443045,0.441275,0.440296,0.440584
label_3,-0.130872,-0.094807,-0.080254,-0.105557,-0.062393
label_4,-0.053673,-0.058247,-0.06217,-0.065137,-0.064985


In [21]:
data_cleaned = data_cleaned.dropna()
data_cleaned.shape

(14521, 13)

In [24]:
file_name_ = os.path.join(lob_path, "LOB_labeled.pkl")

data_cleaned = data_cleaned.sort_index()
data_cleaned.to_pickle(file_name_)

In [25]:
data_cleaned.X.values[0].shape

(100, 40)

In [56]:
r.shape

(100,)

# Deep Learning

In [37]:
# load packages
import pandas as pd
import pickle
import numpy as np
import keras
import tensorflow as tf
from keras import backend as K
from keras.models import load_model, Model
from keras.layers import Flatten, Dense, Dropout, Activation, Input, LSTM, Reshape, Conv2D, MaxPooling2D
# from keras.optimizers import Adam
from tensorflow.keras.optimizers import Adam
from keras.layers import LeakyReLU
from keras.utils import np_utils

from sklearn import metrics
import matplotlib.pyplot as plt

# set random seeds
np.random.seed(1)
tf.random.set_seed(2)


In [64]:
def create_deeplob(T=100, NF=40, number_of_lstm=64):
    input_lmd = Input(shape=(T, NF, 1))
    
    # build the convolutional block
    conv_first1 = Conv2D(16, (1, 2), strides=(1, 2))(input_lmd)
    conv_first1 = keras.layers.LeakyReLU(alpha=0.01)(conv_first1)
    conv_first1 = Conv2D(16, (4, 1), padding='same')(conv_first1)
    conv_first1 = keras.layers.LeakyReLU(alpha=0.01)(conv_first1)
    conv_first1 = Conv2D(16, (4, 1), padding='same')(conv_first1)
    conv_first1 = keras.layers.LeakyReLU(alpha=0.01)(conv_first1)

    conv_first1 = Conv2D(16, (1, 2), strides=(1, 2))(conv_first1)
    conv_first1 = keras.layers.LeakyReLU(alpha=0.01)(conv_first1)
    conv_first1 = Conv2D(16, (4, 1), padding='same')(conv_first1)
    conv_first1 = keras.layers.LeakyReLU(alpha=0.01)(conv_first1)
    conv_first1 = Conv2D(16, (4, 1), padding='same')(conv_first1)
    conv_first1 = keras.layers.LeakyReLU(alpha=0.01)(conv_first1)

    conv_first1 = Conv2D(16, (1, 10))(conv_first1)
    conv_first1 = keras.layers.LeakyReLU(alpha=0.01)(conv_first1)
    conv_first1 = Conv2D(16, (4, 1), padding='same')(conv_first1)
    conv_first1 = keras.layers.LeakyReLU(alpha=0.01)(conv_first1)
    conv_first1 = Conv2D(16, (4, 1), padding='same')(conv_first1)
    conv_first1 = keras.layers.LeakyReLU(alpha=0.01)(conv_first1)
    
    # build the inception module
    convsecond_1 = Conv2D(32, (1, 1), padding='same')(conv_first1)
    convsecond_1 = keras.layers.LeakyReLU(alpha=0.01)(convsecond_1)
    convsecond_1 = Conv2D(32, (3, 1), padding='same')(convsecond_1)
    convsecond_1 = keras.layers.LeakyReLU(alpha=0.01)(convsecond_1)

    convsecond_2 = Conv2D(32, (1, 1), padding='same')(conv_first1)
    convsecond_2 = keras.layers.LeakyReLU(alpha=0.01)(convsecond_2)
    convsecond_2 = Conv2D(32, (5, 1), padding='same')(convsecond_2)
    convsecond_2 = keras.layers.LeakyReLU(alpha=0.01)(convsecond_2)

    convsecond_3 = MaxPooling2D((3, 1), strides=(1, 1), padding='same')(conv_first1)
    convsecond_3 = Conv2D(32, (1, 1), padding='same')(convsecond_3)
    convsecond_3 = keras.layers.LeakyReLU(alpha=0.01)(convsecond_3)
    
    convsecond_output = keras.layers.concatenate([convsecond_1, convsecond_2, convsecond_3], axis=3)
    conv_reshape = Reshape((int(convsecond_output.shape[1]), int(convsecond_output.shape[3])))(convsecond_output)
    conv_reshape = keras.layers.Dropout(0.2, noise_shape=(None, 1, int(conv_reshape.shape[2])))(conv_reshape, training=True)

    # build the last LSTM layer
    conv_lstm = LSTM(number_of_lstm)(conv_reshape)

    # build the output layer
    out = Dense(10, activation='relu')(conv_lstm)
    out = Dense(3, activation='softmax')(out)
    # out = Dense(1, activation='tanh')(out)
    model = Model(inputs=input_lmd, outputs=out)
    adam = Adam(learning_rate=0.0001)
    model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])

    return model

In [39]:
number_of_lstm =  64

In [65]:
deeplob = create_deeplob()
deeplob.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 100, 40, 1)  0           []                               
                                ]                                                                 
                                                                                                  
 conv2d_14 (Conv2D)             (None, 100, 20, 16)  48          ['input_2[0][0]']                
                                                                                                  
 leaky_re_lu_14 (LeakyReLU)     (None, 100, 20, 16)  0           ['conv2d_14[0][0]']              
                                                                                                  
 conv2d_15 (Conv2D)             (None, 100, 20, 16)  1040        ['leaky_re_lu_14[0][0]']   

In [41]:
X = data_cleaned["X"].values
Y = data_cleaned["alpha_3"].values
X.shape, Y.shape

((14521,), (14521,))

In [57]:
from sklearn.model_selection import TimeSeriesSplit

tscv = TimeSeriesSplit(gap=prediction_horizon, n_splits=3, max_train_size=11000, test_size=1000)

for i, (train_index, test_index) in enumerate(tscv.split(X)):
    print(f"Fold {i}:")
    print(f"  Train: index=[{train_index[0]}, {train_index[-1]}]")
    print(f"  Test:  index=[{test_index[0]}, {test_index[-1]}]")

Fold 0:
  Train: index=[461, 11460]
  Test:  index=[11521, 12520]
Fold 1:
  Train: index=[1461, 12460]
  Test:  index=[12521, 13520]
Fold 2:
  Train: index=[2461, 13460]
  Test:  index=[13521, 14520]


In [29]:
X[0].shape

(100, 40)

In [30]:
def One1D_to_3D(data):
    N = data.shape[0]
    arr = np.empty((N, 100, 40, 1))
    for i in range(N):
        arr[i, :, :, 0] = data[i]
    return arr

In [60]:
for i, (train_index, test_index) in enumerate(tscv.split(X)):
    print(f"Fold {i}:")
    train_x =  X[train_index]
    train_x = One1D_to_3D(train_x)
    train_y = Y[train_index]
    train_y = np_utils.to_categorical(train_y, 3)
    test_x = X[test_index]
    test_y = Y[test_index]
    test_y = np_utils.to_categorical(test_y, 3)
    test_x = One1D_to_3D(test_x)
    print(train_x.shape, train_y.shape)


Fold 0:
(11000, 100, 40, 1) (11000, 3)
Fold 1:
(11000, 100, 40, 1) (11000, 3)
Fold 2:
(11000, 100, 40, 1) (11000, 3)


In [173]:
train_y[0]

array([0., 1., 0.], dtype=float32)

In [51]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
 
# Create a Sequential model
model = Sequential()
 
# Add a Flatten layer to flatten the input image
model.add(Flatten(input_shape=(100, 40, 1)))
 
# Add two dense layers with 200 units and 'relu' activation function
model.add(Dense(100, activation='relu'))
model.add(Dense(30, activation='relu'))
 
# Add a softmax output layer with 10 units
model.add(Dense(3, activation='softmax'))

adam = Adam(learning_rate=0.0001)
model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_2 (Flatten)         (None, 4000)              0         
                                                                 
 dense_7 (Dense)             (None, 100)               400100    
                                                                 
 dense_8 (Dense)             (None, 30)                3030      
                                                                 
 dense_9 (Dense)             (None, 3)                 93        
                                                                 
Total params: 403,223
Trainable params: 403,223
Non-trainable params: 0
_________________________________________________________________


In [49]:
X.shape, Y.shape

((14521,), (14521,))

In [46]:
t_x = One1D_to_3D(X)
t_y = np_utils.to_categorical(Y, 3)

In [61]:
history = model.fit(train_x, train_y, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [180]:
print("list of layers :",model.layers)
print("\nmodel.layers[1].name :",model.layers[1].name)
print("\nmodel.get_layer('dense_1') :",model.get_layer('dense_22'))

list of layers : [<keras.layers.reshaping.flatten.Flatten object at 0x000001BF1748BA90>, <keras.layers.core.dense.Dense object at 0x000001BF16A65E80>, <keras.layers.core.dense.Dense object at 0x000001BF1676CE80>]

model.layers[1].name : dense_22

model.get_layer('dense_1') : <keras.layers.core.dense.Dense object at 0x000001BF16A65E80>


In [67]:
deeplob = create_deeplob(train_x.shape[1], train_x.shape[2], number_of_lstm)
history_deeplob = deeplob.fit(train_x, train_y, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Model Testing

In [68]:
pred_d = deeplob.predict(test_x)



In [54]:
from keras import metrics

In [69]:
print('categorical_accuracy:', np.mean(metrics.categorical_accuracy(test_y, pred_d)))

categorical_accuracy: 0.405


In [62]:
test_x.shape

(1000, 100, 40, 1)

In [63]:
pred_m = model.predict(test_x)
print('categorical_accuracy:', np.mean(metrics.categorical_accuracy(test_y, pred_m)))

categorical_accuracy: 0.809
