In [7]:
# Import the required library
import pandas as pd
import numpy as np
import ast
import os

In [19]:
def sigmoid(x):
    return 1/(1 + np.exp(-(x.astype(float))))

def string_to_nplist(x):
    if pd.isnull(x):
        return []
    else:
        return np.array(ast.literal_eval(x))

def read_data(file_name, col_names, col_list):
    data = pd.read_csv(file_name, names=col_names, delimiter="|")
    for col in col_list:
        data[col] = data[col].apply(lambda x: string_to_nplist(x))
    data["DateTime"] = pd.to_datetime(data["DateTime"])
    data = data.set_index(["DateTime"])
    return data

def clean_lob(data, cols_need, cols_check, weight_mid_price=0.5, num_level=10):
    lst_valid_samples = []
    mid_prices = []
    for ind, row in data.iterrows():
        if len(row[cols_check[0]]) and len(row[cols_check[1]]):
            if (row[cols_check[0]].shape[0] == num_level) and (row[cols_check[1]].shape[0] == num_level):
                lst_valid_samples.append(ind)
                mid_p = weight_mid_price * row[cols_check[0]][0] + (1 - weight_mid_price) * row[cols_check[1]][0]
                mid_prices.append(mid_p)
    ret_data = pd.DataFrame(index=lst_valid_samples, data=data.loc[lst_valid_samples, cols_need])
    ret_data["Midprice"] = mid_prices
    return ret_data

def my_func(a):
    all_items = np.concatenate(a.values)
    return np.mean(all_items), np.std(all_items)

def zscore_normalization(data, cols_need, freq="5H", min_periods=4*12*60):
    z_score_cols, stat_data = [], []
    for col in cols_need:
        rolling_col = data[col].rolling(window=freq, min_periods=min_periods)
        col_lst_mean_std = [my_func(a) for a in rolling_col]
        mu_col = "Mu" + col
        std_col = "Std" + col
        z_score_col = "Zscore" + col
        tmp_data = pd.DataFrame(data=col_lst_mean_std, columns=[mu_col, std_col], index=data.index)
        tmp_data.index = tmp_data.index.shift(1, freq="H")
        idx_intersect = list(set(tmp_data.index).intersection(set(data.index)))
        tmp_data[col] = np.nan
        tmp_data.loc[idx_intersect, col] = data.loc[idx_intersect, col]
        tmp_data = tmp_data.dropna()
        tmp_data[z_score_col] = (tmp_data[col] - tmp_data[mu_col]) / tmp_data[std_col]
        tmp_data = tmp_data[[z_score_col]]
        stat_data.append(tmp_data)
        z_score_cols.append(z_score_col)
    ret_data = pd.concat(stat_data, axis=1)
    return z_score_cols, ret_data

This function cleans a given Pandas DataFrame containing limit order book (LOB) data. The input parameters are data, which is the Pandas DataFrame, cols_need, which are the columns of the DataFrame to keep, and cols_check, which are the columns to check for the specified number of levels (num_level). The function iterates through the rows of the DataFrame, checks if the specified columns have data and the correct number of levels, calculates the mid price for each valid row, and returns a new DataFrame with the valid samples and the specified columns, including the calculated mid prices.

In [9]:
def clean_lob(data, cols_need, cols_check, weight_mid_price=0.5, num_level=10):
    # Initialize lists to store valid sample indices and mid prices
    lst_valid_samples = []
    mid_prices = []

    # Iterate through the rows of the input DataFrame
    for ind, row in data.iterrows():
        # Check if both columns in cols_check have data
        if len(row[cols_check[0]]) and len(row[cols_check[1]]):
            # Check if both columns have the specified number of levels
            if (row[cols_check[0]].shape[0] == num_level) and (row[cols_check[1]].shape[0] == num_level):
                # Add the index to the list of valid samples
                lst_valid_samples.append(ind)

                # Calculate the mid price for the current row
                mid_p = weight_mid_price * row[cols_check[0]][0] + (1 - weight_mid_price) * row[cols_check[1]][0]
                mid_prices.append(mid_p)

    # Create a new DataFrame with the valid samples and the specified columns
    ret_data = pd.DataFrame(index=lst_valid_samples, data=data.loc[lst_valid_samples, cols_need])

    # Add the mid prices to the new DataFrame
    ret_data["Midprice"] = mid_prices

    # Return the cleaned DataFrame
    return ret_data

This function normalizes specified columns of a given Pandas DataFrame using the z-score method. The input parameters are data, which is the Pandas DataFrame, and cols_need, which are the columns of the DataFrame that need normalization. The function calculates rolling statistics for each column, computes the z-score for each value, and returns a new DataFrame with the z-score columns along with a list of the z-score column names.

In [10]:
def my_func(a):
    all_items = np.concatenate(a.values)
    return np.mean(all_items), np.std(all_items)

In [11]:
def zscore_normalization(data, cols_need, freq="5H", min_periods=4*12*60):
    # Initialize lists to store z-score column names and statistical data
    z_score_cols, stat_data = [], []

    # Iterate through the columns that need normalization
    for col in cols_need:
        # Calculate rolling statistics for the current column
        rolling_col = data[col].rolling(window=freq, min_periods=min_periods)
        col_lst_mean_std = [my_func(a) for a in rolling_col]

        # Create column names for mean, standard deviation, and z-score
        mu_col = "Mu" + col
        std_col = "Std" + col
        z_score_col = "Zscore" + col

        # Create a temporary DataFrame with mean and standard deviation columns
        tmp_data = pd.DataFrame(data=col_lst_mean_std, columns=[mu_col, std_col], index=data.index)
        tmp_data.index = tmp_data.index.shift(1, freq="H")

        # Intersect the indices of the original data and the temporary DataFrame
        idx_intersect = list(set(tmp_data.index).intersection(set(data.index)))

        # Add the original column to the temporary DataFrame and fill with NaN values
        tmp_data[col] = np.nan
        tmp_data.loc[idx_intersect, col] = data.loc[idx_intersect, col]

        # Drop rows with NaN values
        tmp_data = tmp_data.dropna()

        # Calculate the z-score for each value in the column
        tmp_data[z_score_col] = (tmp_data[col] - tmp_data[mu_col]) / tmp_data[std_col]

        # Keep only the z-score column in the temporary DataFrame
        tmp_data = tmp_data[[z_score_col]]

        # Append the temporary DataFrame to the list of statistical data
        stat_data.append(tmp_data)

        # Append the z-score column name to the list of z-score column names
        z_score_cols.append(z_score_col)

    # Concatenate the statistical data DataFrames along the columns axis
    ret_data = pd.concat(stat_data, axis=1)

    # Return the list of z-score column names and the concatenated DataFrame
    return z_score_cols, ret_data

# Read data

In [14]:
lob_path = "./"
file_name = os.path.join(lob_path, "LOB_NQU22-CME_2_1_10_10level.lob")

cols_LOB = ["DateTime","Open","High","Low","Last","Volume","NumTrades","BidVolume","AskVolume","SumBid","SumAsk","BidPrices","BidVolumes","AskPrices","AskVolumes"]

col_list_LOB = ["BidPrices","BidVolumes","AskPrices","AskVolumes"]

In [21]:
data = read_data(file_name, cols_LOB, col_list_LOB)
data.head().T

DateTime,2022-07-07 00:00:00,2022-07-07 00:00:10,2022-07-07 00:00:20,2022-07-07 00:00:30,2022-07-07 00:00:40
Open,11897.5,11892.25,11879.5,11876.25,11879.5
High,11898.25,11892.75,11880.5,11882.5,11882.25
Low,11892.75,11878.25,11876.25,11876.0,11878.75
Last,11893.0,11880.0,11876.75,11880.25,11882.25
Volume,53.0,231.0,85.0,89.0,22.0
NumTrades,51.0,211.0,85.0,89.0,22.0
BidVolume,34.0,166.0,38.0,36.0,8.0
AskVolume,19.0,65.0,47.0,53.0,14.0
SumBid,29.0,22.0,34.0,14.0,14.0
SumAsk,20.0,18.0,24.0,19.0,26.0


In [22]:
z_score_cols, ret_data = zscore_normalization(data, col_list_LOB)
ret_data.head().T

DateTime,2022-07-07 01:00:00,2022-07-07 01:00:10,2022-07-07 01:00:20,2022-07-07 01:00:30,2022-07-07 01:00:40
ZscoreBidPrices,"[-10.966892325208963, -11.31504763712036, -11....","[-0.8219228548918058, -0.8601518248867736, -0....","[-1.0449982450788726, -1.08042191440358, -1.11...","[-2.176954394281747, -2.2168985116080178, -2.2...","[-2.9339934066987317, -2.9785153096835373, -3...."
ZscoreBidVolumes,"[-0.8421601936380788, -0.8421601936380788, -0....","[-0.9185437189662375, -0.9185437189662375, -0....","[-0.9838699100999077, -0.4472135954999581, -0....","[-0.8446690915113089, 0.30064493087690647, -0....","[-0.15904595068400093, 0.4526692442544647, 0.4..."
ZscoreAskPrices,"[-14.100290132411525, -13.75213482050013, -13....","[-1.1659835848465152, -1.1277546148515476, -1....","[-1.3638112690012405, -1.3283875996765329, -1....","[-2.5364514502181823, -2.4965073328919116, -2....","[-3.356760412096845, -3.312181920568336, -3.26..."
ZscoreAskVolumes,"[1.2909944487358056, -1.2909944487358056, 0.0,...","[0.13018891098082397, 0.13018891098082397, 1.4...","[-0.06274558051381605, -1.0039292882210538, -1...","[-0.022593461946228327, 1.7848834937520444, -0...","[-0.11665046633795345, -0.11665046633795345, 0..."


In [23]:
cols_check = [z_score_cols[0], z_score_cols[2]]
data_cleaned = clean_lob(ret_data, z_score_cols, cols_check)
data_cleaned.head().T

Unnamed: 0,2022-07-07 01:00:00,2022-07-07 01:00:10,2022-07-07 01:00:20,2022-07-07 01:00:30,2022-07-07 01:00:40
ZscoreBidPrices,"[-10.966892325208963, -11.31504763712036, -11....","[-0.8219228548918058, -0.8601518248867736, -0....","[-1.0449982450788726, -1.08042191440358, -1.11...","[-2.176954394281747, -2.2168985116080178, -2.2...","[-2.9339934066987317, -2.9785153096835373, -3...."
ZscoreBidVolumes,"[-0.8421601936380788, -0.8421601936380788, -0....","[-0.9185437189662375, -0.9185437189662375, -0....","[-0.9838699100999077, -0.4472135954999581, -0....","[-0.8446690915113089, 0.30064493087690647, -0....","[-0.15904595068400093, 0.4526692442544647, 0.4..."
ZscoreAskPrices,"[-14.100290132411525, -13.75213482050013, -13....","[-1.1659835848465152, -1.1277546148515476, -1....","[-1.3638112690012405, -1.3283875996765329, -1....","[-2.5364514502181823, -2.4965073328919116, -2....","[-3.356760412096845, -3.312181920568336, -3.26..."
ZscoreAskVolumes,"[1.2909944487358056, -1.2909944487358056, 0.0,...","[0.13018891098082397, 0.13018891098082397, 1.4...","[-0.06274558051381605, -1.0039292882210538, -1...","[-0.022593461946228327, 1.7848834937520444, -0...","[-0.11665046633795345, -0.11665046633795345, 0..."
Midprice,-12.533591,-0.993953,-1.204405,-2.356703,-3.145377


In [24]:
def my_con(x):
    a = np.concatenate(x.values)
    lst2 = []
    for i in range(10):
        lst2.extend(a[[i, i+10, i+20, i+30]])
    return lst2

In [25]:
data_cleaned["ConcatLOB"] = data_cleaned[z_score_cols].apply(lambda x: my_con(x), axis=1)
data_cleaned.head().T

Unnamed: 0,2022-07-07 01:00:00,2022-07-07 01:00:10,2022-07-07 01:00:20,2022-07-07 01:00:30,2022-07-07 01:00:40
ZscoreBidPrices,"[-10.966892325208963, -11.31504763712036, -11....","[-0.8219228548918058, -0.8601518248867736, -0....","[-1.0449982450788726, -1.08042191440358, -1.11...","[-2.176954394281747, -2.2168985116080178, -2.2...","[-2.9339934066987317, -2.9785153096835373, -3...."
ZscoreBidVolumes,"[-0.8421601936380788, -0.8421601936380788, -0....","[-0.9185437189662375, -0.9185437189662375, -0....","[-0.9838699100999077, -0.4472135954999581, -0....","[-0.8446690915113089, 0.30064493087690647, -0....","[-0.15904595068400093, 0.4526692442544647, 0.4..."
ZscoreAskPrices,"[-14.100290132411525, -13.75213482050013, -13....","[-1.1659835848465152, -1.1277546148515476, -1....","[-1.3638112690012405, -1.3283875996765329, -1....","[-2.5364514502181823, -2.4965073328919116, -2....","[-3.356760412096845, -3.312181920568336, -3.26..."
ZscoreAskVolumes,"[1.2909944487358056, -1.2909944487358056, 0.0,...","[0.13018891098082397, 0.13018891098082397, 1.4...","[-0.06274558051381605, -1.0039292882210538, -1...","[-0.022593461946228327, 1.7848834937520444, -0...","[-0.11665046633795345, -0.11665046633795345, 0..."
Midprice,-12.533591,-0.993953,-1.204405,-2.356703,-3.145377
ConcatLOB,"[-10.966892325208963, -0.8421601936380788, -14...","[-0.8219228548918058, -0.9185437189662375, -1....","[-1.0449982450788726, -0.9838699100999077, -1....","[-2.176954394281747, -0.8446690915113089, -2.5...","[-2.9339934066987317, -0.15904595068400093, -3..."


In [28]:
file_name_cleaned = os.path.join(lob_path, "LOB_cleaned.pkl")

data_cleaned = data_cleaned.sort_index()
data_cleaned.to_pickle(file_name_cleaned)

In [109]:
data_cleaned["ConcatLOB2"].values[:2]

array([list([-1.3815674160523173, -0.07108593677517168, -1.753343148734044, 0.0013298283873445038, -1.4202259538409336, -0.8060422322472848, -1.7144578798473746, -0.8098654878928129, -1.4588844916295503, -0.8060422322472848, -1.675572610960705, -0.8098654878928129, -1.4975430294181669, -0.8060422322472848, -1.6366873420740355, 0.0013298283873445038, -1.5362015672067832, 1.3988266541690544, -1.597802073187366, 0.0013298283873445038, -1.5748601049953999, -0.8060422322472848, -1.5589168043006965, 0.0013298283873445038, -1.6135186427840165, -0.8060422322472848, -1.5200315354140268, 0.0013298283873445038, -1.6908357183612495, -0.8060422322472848, -1.4811462665273571, 0.8125251446675018, -1.729494256149866, -0.07108593677517168, -1.4422609976406877, 1.6237204609476592, -1.7681527939384825, -0.07108593677517168, -1.403375728754018, 0.0013298283873445038]),
       list([-1.5729477515229975, 1.9592763064403123, -1.9094191740256414, -0.8138655439755674, -1.611214481204941, 0.5889984609885429, -1

In [132]:
data_cleaned.shape

(4469, 13)

In [135]:
roll_100 = data_cleaned["ConcatLOB2"].rolling(window=100, min_periods=100)
s = []
for i, r in enumerate(roll_100):
    if r.shape[0] < 100:
        return np.nan
    return np.vstack(r.values)
i

4468

In [142]:
data_cleaned.head().T

Unnamed: 0,2022-07-07 01:10:00,2022-07-07 01:10:10,2022-07-07 01:10:20,2022-07-07 01:10:30,2022-07-07 01:10:50
ZscoreBidPrices,"[-1.3815674160523173, -1.4202259538409336, -1....","[-1.5729477515229975, -1.611214481204941, -1.6...","[-1.4993965021077342, -1.5373482817914577, -1....","[-1.5814297157199044, -1.6191318955694383, -1....","[-2.0102524119030707, -2.0480358831220955, -2...."
ZscoreBidVolumes,"[-0.07108593677517168, -0.8060422322472848, -0...","[1.9592763064403123, 0.5889984609885429, -0.78...","[-0.7637205558836064, -0.7637205558836064, -0....","[-0.12154525233627105, -0.7438569442979788, -0...","[-0.7442558354537866, -0.7442558354537866, -0...."
ZscoreAskPrices,"[-1.753343148734044, -1.7144578798473746, -1.6...","[-1.9094191740256414, -1.8708450492978508, -1....","[-1.8695077529964945, -1.8312740284586677, -1....","[-1.9506816582012707, -1.9126890756921773, -1....","[-2.3816408736701242, -2.3435899664655193, -2...."
ZscoreAskVolumes,"[0.0013298283873445038, -0.8098654878928129, -...","[-0.8138655439755674, 0.0026338690743545486, 0...","[0.0025989460292240958, -0.8160690531763689, 0...","[-0.8207826816681233, -0.8207826816681233, -0....","[-0.8165538560234376, 0.007491319780031493, -0..."
Midprice,-1.567455,-1.741183,-1.684452,-1.766056,-2.195947
ConcatLOB,"[-1.3815674160523173, -1.4202259538409336, -1....","[-1.5729477515229975, -1.611214481204941, -1.6...","[-1.4993965021077342, -1.5373482817914577, -1....","[-1.5814297157199044, -1.6191318955694383, -1....","[-2.0102524119030707, -2.0480358831220955, -2...."
m_minus,-3.688054,-3.508181,-3.519689,-3.52905,-3.52637
m_plus,-2.867135,-2.90311,-2.931681,-2.951188,-2.951815
label_3,0.829165,0.66732,0.740436,0.671062,0.344211
label_4,-0.222589,-0.172474,-0.167062,-0.163744,-0.162931


In [138]:
def ex_X(r):
    if r.shape[0] < 100:
        return np.nan
    return np.vstack(r.values)

In [141]:
roll_100 = data_cleaned["ConcatLOB2"].rolling(window=100, min_periods=100)
X = [ex_X(x) for x in roll_100]
X[-1]

array([[ 6.57006476, -0.44567514,  6.36390929, ...,  3.08473637,
         6.57019046,  1.34429884],
       [ 6.5942486 , -0.44545989,  6.38806317, ...,  2.64403191,
         6.59435698,  3.59703335],
       [ 6.64067183, -0.4456694 ,  6.43461955, ...,  1.31970827,
         6.64090799,  3.1476487 ],
       ...,
       [ 9.30520717, -0.46142336,  9.15042246, ...,  1.30086554,
         9.35604481,  4.09023227],
       [ 9.59766311, -0.02087805,  9.39772781, ...,  2.18223423,
         9.60328942,  5.46089579],
       [ 9.16045332, -0.90225159,  8.95997769, ...,  3.06359241,
         9.16547929,  1.34275016]])

In [136]:
f

2022-07-07 01:10:00    [-1.3815674160523173, -0.07108593677517168, -1...
Name: ConcatLOB2, dtype: object

In [134]:
np.min(s)

1

In [130]:
r

2022-07-07 13:37:10    [6.570064763621899, -0.4456751381946407, 6.363...
2022-07-07 13:37:20    [6.59424860151226, -0.4454598947271492, 6.3880...
2022-07-07 13:37:30    [6.640671829448539, -0.4456693953876572, 6.434...
2022-07-07 13:37:40    [6.916315375585285, -0.8871311157947338, 6.710...
2022-07-07 13:37:50    [6.825529375153507, -0.44658318570471056, 6.61...
                                             ...                        
2022-07-07 13:53:00    [8.83184448196089, -0.9013984447431412, 8.6305...
2022-07-07 13:53:10    [8.989609419761655, -0.46108631745742445, 8.78...
2022-07-07 13:53:20    [9.305207168373194, -0.46142336035606074, 9.15...
2022-07-07 13:53:30    [9.597663112130787, -0.02087804616710578, 9.39...
2022-07-07 13:53:40    [9.160453323339766, -0.9022515907724464, 8.959...
Name: ConcatLOB2, Length: 100, dtype: object

In [122]:
l = [np.array(a) for a in r.values]
b = np.vstack(r.values)
b

array([[ 6.57006476, -0.44567514,  6.36390929, ...,  3.08473637,
         6.57019046,  1.34429884],
       [ 6.5942486 , -0.44545989,  6.38806317, ...,  2.64403191,
         6.59435698,  3.59703335],
       [ 6.64067183, -0.4456694 ,  6.43461955, ...,  1.31970827,
         6.64090799,  3.1476487 ],
       ...,
       [ 9.30520717, -0.46142336,  9.15042246, ...,  1.30086554,
         9.35604481,  4.09023227],
       [ 9.59766311, -0.02087805,  9.39772781, ...,  2.18223423,
         9.60328942,  5.46089579],
       [ 9.16045332, -0.90225159,  8.95997769, ...,  3.06359241,
         9.16547929,  1.34275016]])

In [127]:
b[99, :]

array([ 9.16045332, -0.90225159,  8.95997769,  0.42790435,  9.13765647,
       -0.02095292,  8.9828112 ,  1.80017307,  9.11485961,  1.74164441,
        9.00564471,  1.80017307,  9.09206276,  2.18229374,  9.02847823,
        1.34275016,  9.0692659 ,  3.50424174,  9.05131174,  0.88532726,
        9.04646904,  2.62294307,  9.07414525,  0.42790435,  9.02367219,
        2.18229374,  9.09697876,  0.42790435,  9.00087533,  2.18229374,
        9.11981227,  1.34275016,  8.97807847,  2.18229374,  9.14264578,
        1.80017307,  8.95528162,  3.06359241,  9.16547929,  1.34275016])

In [128]:
np.transpose(r.values[99])

array([ 9.16045332, -0.90225159,  8.95997769,  0.42790435,  9.13765647,
       -0.02095292,  8.9828112 ,  1.80017307,  9.11485961,  1.74164441,
        9.00564471,  1.80017307,  9.09206276,  2.18229374,  9.02847823,
        1.34275016,  9.0692659 ,  3.50424174,  9.05131174,  0.88532726,
        9.04646904,  2.62294307,  9.07414525,  0.42790435,  9.02367219,
        2.18229374,  9.09697876,  0.42790435,  9.00087533,  2.18229374,
        9.11981227,  1.34275016,  8.97807847,  2.18229374,  9.14264578,
        1.80017307,  8.95528162,  3.06359241,  9.16547929,  1.34275016])

In [39]:
data_cleaned["ConcatLOB"] = data_cleaned[z_score_cols].apply(lambda x: np.concatenate(x.values), axis=1)
print(data_cleaned["ConcatLOB"].tail(1).values[0].shape)
print(data_cleaned["ConcatLOB"].tail(1).values[0])

(40,)
[ 6.95855473  6.93591802  6.9132813   6.89064458  6.86800786  6.84537115
  6.82273443  6.80009771  6.77746099  6.75482427 -0.91506907 -0.04382868
  1.26303192  1.69865211  1.26303192  2.13427231  2.56989251  2.56989251
  3.8767531   2.56989251  6.75709565  6.77977151  6.80244737  6.82512323
  6.84779908  6.87047494  6.8931508   6.91582666  6.93850252  6.96117838
 -0.5003192   0.85766126  0.40500111  0.85766126  1.31032142  0.85766126
  1.31032142  0.85766126  1.31032142  1.76298157]


In [143]:
prediction_horizon = 60
data_cleaned["m_minus"] = data_cleaned["Midprice"].rolling(window=prediction_horizon).mean()
data_cleaned["m_plus"] = data_cleaned["m_minus"].shift(-prediction_horizon)
data_cleaned.tail().T

Unnamed: 0,2022-07-07 13:53:00,2022-07-07 13:53:10,2022-07-07 13:53:20,2022-07-07 13:53:30,2022-07-07 13:53:40
ZscoreBidPrices,"[8.83184448196089, 8.809025348507737, 8.786206...","[8.989609419761655, 8.966792542780565, 8.94397...","[9.305207168373194, 9.282396680183234, 9.25958...","[9.597663112130787, 9.574859361553315, 9.55205...","[9.160453323339766, 9.137656467243026, 9.11485..."
ZscoreBidVolumes,"[-0.9013984447431412, -0.46087827963873806, 1....","[-0.46108631745742445, -0.020554864029647874, ...","[-0.46142336035606074, 0.4197210896168101, 0.4...","[-0.02087804616710578, -0.46150050111565893, 0...","[-0.9022515907724464, -0.020952924852332314, 1..."
ZscoreAskPrices,"[8.630504017191216, 8.65335950701698, 8.676214...","[8.788631155261942, 8.811484617859849, 8.83433...","[9.150422458430542, 9.173269386651288, 9.19611...","[9.397727808720106, 9.420567987925734, 9.44340...","[8.959977692989005, 8.982811203763553, 9.00564..."
ZscoreAskVolumes,"[-0.02753235696350738, 1.3460263404381427, 0.4...","[1.3458437378939903, 1.3458437378939903, 1.345...","[0.8868034713814793, 2.2597015272397125, 8.666...","[-0.9440582272606038, -0.029064796883633642, 1...","[0.4279043547223779, 1.8001730668491398, 1.800..."
Midprice,8.731174,8.88912,9.227815,9.497695,9.060216
ConcatLOB,"[8.83184448196089, 8.809025348507737, 8.786206...","[8.989609419761655, 8.966792542780565, 8.94397...","[9.305207168373194, 9.282396680183234, 9.25958...","[9.597663112130787, 9.574859361553315, 9.55205...","[9.160453323339766, 9.137656467243026, 9.11485..."
m_minus,8.574187,8.594837,8.616163,8.635488,8.652895
m_plus,,,,,
label_3,0.049384,0.027225,-0.014392,-0.046316,-0.004318
label_4,0.068597,0.062396,0.055575,0.048904,0.042552


In [44]:
data_cleaned.head().T

Unnamed: 0,2022-07-07 01:00:00,2022-07-07 01:00:10,2022-07-07 01:00:20,2022-07-07 01:00:30,2022-07-07 01:00:40
ZscoreBidPrices,"[-10.966892325208963, -11.31504763712036, -11....","[-0.8219228548918058, -0.8601518248867736, -0....","[-1.0449982450788726, -1.08042191440358, -1.11...","[-2.176954394281747, -2.2168985116080178, -2.2...","[-2.9339934066987317, -2.9785153096835373, -3...."
ZscoreBidVolumes,"[-0.8421601936380788, -0.8421601936380788, -0....","[-0.9185437189662375, -0.9185437189662375, -0....","[-0.9838699100999077, -0.4472135954999581, -0....","[-0.8446690915113089, 0.30064493087690647, -0....","[-0.15904595068400093, 0.4526692442544647, 0.4..."
ZscoreAskPrices,"[-14.100290132411525, -13.75213482050013, -13....","[-1.1659835848465152, -1.1277546148515476, -1....","[-1.3638112690012405, -1.3283875996765329, -1....","[-2.5364514502181823, -2.4965073328919116, -2....","[-3.356760412096845, -3.312181920568336, -3.26..."
ZscoreAskVolumes,"[1.2909944487358056, -1.2909944487358056, 0.0,...","[0.13018891098082397, 0.13018891098082397, 1.4...","[-0.06274558051381605, -1.0039292882210538, -1...","[-0.022593461946228327, 1.7848834937520444, -0...","[-0.11665046633795345, -0.11665046633795345, 0..."
Midprice,-12.533591,-0.993953,-1.204405,-2.356703,-3.145377
ConcatLOB,"[-10.966892325208963, -11.31504763712036, -11....","[-0.8219228548918058, -0.8601518248867736, -0....","[-1.0449982450788726, -1.08042191440358, -1.11...","[-2.176954394281747, -2.2168985116080178, -2.2...","[-2.9339934066987317, -2.9785153096835373, -3...."
m_minus,,,,,
m_plus,-3.508181,-3.519689,-3.52905,-3.52637,-3.520625


In [144]:
data_cleaned = data_cleaned.dropna()
data_cleaned.shape

(4350, 13)

In [57]:
data_cleaned["label_3"] = (data_cleaned["m_plus"] - data_cleaned["Midprice"]) / data_cleaned["Midprice"]
data_cleaned["label_4"] = (data_cleaned["m_plus"] - data_cleaned["m_minus"]) / data_cleaned["m_minus"]
data_cleaned.tail().T

Unnamed: 0,2022-07-07 13:53:00,2022-07-07 13:53:10,2022-07-07 13:53:20,2022-07-07 13:53:30,2022-07-07 13:53:40
ZscoreBidPrices,"[8.83184448196089, 8.809025348507737, 8.786206...","[8.989609419761655, 8.966792542780565, 8.94397...","[9.305207168373194, 9.282396680183234, 9.25958...","[9.597663112130787, 9.574859361553315, 9.55205...","[9.160453323339766, 9.137656467243026, 9.11485..."
ZscoreBidVolumes,"[-0.9013984447431412, -0.46087827963873806, 1....","[-0.46108631745742445, -0.020554864029647874, ...","[-0.46142336035606074, 0.4197210896168101, 0.4...","[-0.02087804616710578, -0.46150050111565893, 0...","[-0.9022515907724464, -0.020952924852332314, 1..."
ZscoreAskPrices,"[8.630504017191216, 8.65335950701698, 8.676214...","[8.788631155261942, 8.811484617859849, 8.83433...","[9.150422458430542, 9.173269386651288, 9.19611...","[9.397727808720106, 9.420567987925734, 9.44340...","[8.959977692989005, 8.982811203763553, 9.00564..."
ZscoreAskVolumes,"[-0.02753235696350738, 1.3460263404381427, 0.4...","[1.3458437378939903, 1.3458437378939903, 1.345...","[0.8868034713814793, 2.2597015272397125, 8.666...","[-0.9440582272606038, -0.029064796883633642, 1...","[0.4279043547223779, 1.8001730668491398, 1.800..."
Midprice,8.731174,8.88912,9.227815,9.497695,9.060216
ConcatLOB,"[8.83184448196089, 8.809025348507737, 8.786206...","[8.989609419761655, 8.966792542780565, 8.94397...","[9.305207168373194, 9.282396680183234, 9.25958...","[9.597663112130787, 9.574859361553315, 9.55205...","[9.160453323339766, 9.137656467243026, 9.11485..."
m_minus,8.574187,8.594837,8.616163,8.635488,8.652895
m_plus,9.162353,9.131124,9.09501,9.0578,9.021094
label_3,0.049384,0.027225,-0.014392,-0.046316,-0.004318
label_4,0.068597,0.062396,0.055575,0.048904,0.042552


In [58]:
alpha_threshold = 0.1
data_cleaned["alpha_3"] = 0
data_cleaned.loc[data_cleaned["label_3"] > alpha_threshold, "alpha_3"] = 1
data_cleaned.loc[data_cleaned["label_3"] < -alpha_threshold, "alpha_3"] = -1

data_cleaned["alpha_4"] = 0
data_cleaned.loc[data_cleaned["label_4"] > alpha_threshold, "alpha_4"] = 1
data_cleaned.loc[data_cleaned["label_4"] < -alpha_threshold, "alpha_4"] = -1

data_cleaned.tail().T

Unnamed: 0,2022-07-07 13:53:00,2022-07-07 13:53:10,2022-07-07 13:53:20,2022-07-07 13:53:30,2022-07-07 13:53:40
ZscoreBidPrices,"[8.83184448196089, 8.809025348507737, 8.786206...","[8.989609419761655, 8.966792542780565, 8.94397...","[9.305207168373194, 9.282396680183234, 9.25958...","[9.597663112130787, 9.574859361553315, 9.55205...","[9.160453323339766, 9.137656467243026, 9.11485..."
ZscoreBidVolumes,"[-0.9013984447431412, -0.46087827963873806, 1....","[-0.46108631745742445, -0.020554864029647874, ...","[-0.46142336035606074, 0.4197210896168101, 0.4...","[-0.02087804616710578, -0.46150050111565893, 0...","[-0.9022515907724464, -0.020952924852332314, 1..."
ZscoreAskPrices,"[8.630504017191216, 8.65335950701698, 8.676214...","[8.788631155261942, 8.811484617859849, 8.83433...","[9.150422458430542, 9.173269386651288, 9.19611...","[9.397727808720106, 9.420567987925734, 9.44340...","[8.959977692989005, 8.982811203763553, 9.00564..."
ZscoreAskVolumes,"[-0.02753235696350738, 1.3460263404381427, 0.4...","[1.3458437378939903, 1.3458437378939903, 1.345...","[0.8868034713814793, 2.2597015272397125, 8.666...","[-0.9440582272606038, -0.029064796883633642, 1...","[0.4279043547223779, 1.8001730668491398, 1.800..."
Midprice,8.731174,8.88912,9.227815,9.497695,9.060216
ConcatLOB,"[8.83184448196089, 8.809025348507737, 8.786206...","[8.989609419761655, 8.966792542780565, 8.94397...","[9.305207168373194, 9.282396680183234, 9.25958...","[9.597663112130787, 9.574859361553315, 9.55205...","[9.160453323339766, 9.137656467243026, 9.11485..."
m_minus,8.574187,8.594837,8.616163,8.635488,8.652895
m_plus,9.162353,9.131124,9.09501,9.0578,9.021094
label_3,0.049384,0.027225,-0.014392,-0.046316,-0.004318
label_4,0.068597,0.062396,0.055575,0.048904,0.042552


In [59]:
print(data_cleaned["alpha_3"].value_counts())
print(data_cleaned["alpha_4"].value_counts())


alpha_3
-1    1809
 1    1462
 0    1198
Name: count, dtype: int64
alpha_4
-1    2067
 1    1676
 0     726
Name: count, dtype: int64


In [65]:
import pandas as pd

# create sample DataFrame with datetime index
df = pd.DataFrame({'col1': [1, 2, 3], 'col2': [4, 5, 6]}, index=pd.date_range('2022-05-01', periods=3))
print(df)

            col1  col2
2022-05-01     1     4
2022-05-02     2     5
2022-05-03     3     6


In [66]:
# shift the index by one day
df.index = df.index.shift(1, freq='D')

# print the resulting DataFrame
print(df)

            col1  col2
2022-05-02     1     4
2022-05-03     2     5
2022-05-04     3     6


In [None]:
datt groupby by date 
for d in days:
    for i in range(ind_0(d), ind_n(d)-100, step=1):
        sample = X[i: i+100]
        ...

# Deep Learning

In [4]:
# load packages
import pandas as pd
import pickle
import numpy as np
import keras
import tensorflow as tf
from keras import backend as K
from keras.models import load_model, Model
from keras.layers import Flatten, Dense, Dropout, Activation, Input, LSTM, Reshape, Conv2D, MaxPooling2D
# from keras.optimizers import Adam
from tensorflow.keras.optimizers import Adam
from keras.layers import LeakyReLU
from keras.utils import np_utils

from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt

# set random seeds
np.random.seed(1)
tf.random.set_seed(2)


In [5]:
def create_deeplob(T=100, NF=40, number_of_lstm=64):
    input_lmd = Input(shape=(T, NF, 1))
    
    # build the convolutional block
    conv_first1 = Conv2D(16, (1, 2), strides=(1, 2))(input_lmd)
    conv_first1 = keras.layers.LeakyReLU(alpha=0.01)(conv_first1)
    conv_first1 = Conv2D(16, (4, 1), padding='same')(conv_first1)
    conv_first1 = keras.layers.LeakyReLU(alpha=0.01)(conv_first1)
    conv_first1 = Conv2D(16, (4, 1), padding='same')(conv_first1)
    conv_first1 = keras.layers.LeakyReLU(alpha=0.01)(conv_first1)

    conv_first1 = Conv2D(16, (1, 2), strides=(1, 2))(conv_first1)
    conv_first1 = keras.layers.LeakyReLU(alpha=0.01)(conv_first1)
    conv_first1 = Conv2D(16, (4, 1), padding='same')(conv_first1)
    conv_first1 = keras.layers.LeakyReLU(alpha=0.01)(conv_first1)
    conv_first1 = Conv2D(16, (4, 1), padding='same')(conv_first1)
    conv_first1 = keras.layers.LeakyReLU(alpha=0.01)(conv_first1)

    conv_first1 = Conv2D(16, (1, 10))(conv_first1)
    conv_first1 = keras.layers.LeakyReLU(alpha=0.01)(conv_first1)
    conv_first1 = Conv2D(16, (4, 1), padding='same')(conv_first1)
    conv_first1 = keras.layers.LeakyReLU(alpha=0.01)(conv_first1)
    conv_first1 = Conv2D(16, (4, 1), padding='same')(conv_first1)
    conv_first1 = keras.layers.LeakyReLU(alpha=0.01)(conv_first1)
    
    # build the inception module
    convsecond_1 = Conv2D(32, (1, 1), padding='same')(conv_first1)
    convsecond_1 = keras.layers.LeakyReLU(alpha=0.01)(convsecond_1)
    convsecond_1 = Conv2D(32, (3, 1), padding='same')(convsecond_1)
    convsecond_1 = keras.layers.LeakyReLU(alpha=0.01)(convsecond_1)

    convsecond_2 = Conv2D(32, (1, 1), padding='same')(conv_first1)
    convsecond_2 = keras.layers.LeakyReLU(alpha=0.01)(convsecond_2)
    convsecond_2 = Conv2D(32, (5, 1), padding='same')(convsecond_2)
    convsecond_2 = keras.layers.LeakyReLU(alpha=0.01)(convsecond_2)

    convsecond_3 = MaxPooling2D((3, 1), strides=(1, 1), padding='same')(conv_first1)
    convsecond_3 = Conv2D(32, (1, 1), padding='same')(convsecond_3)
    convsecond_3 = keras.layers.LeakyReLU(alpha=0.01)(convsecond_3)
    
    convsecond_output = keras.layers.concatenate([convsecond_1, convsecond_2, convsecond_3], axis=3)
    conv_reshape = Reshape((int(convsecond_output.shape[1]), int(convsecond_output.shape[3])))(convsecond_output)
    conv_reshape = keras.layers.Dropout(0.2, noise_shape=(None, 1, int(conv_reshape.shape[2])))(conv_reshape, training=True)

    # build the last LSTM layer
    conv_lstm = LSTM(number_of_lstm)(conv_reshape)

    # build the output layer
    out = Dense(10, activation='relu')(conv_lstm)
    out = Dense(3, activation='softmax')(out)
    model = Model(inputs=input_lmd, outputs=out)
    adam = Adam(learning_rate=0.0001)
    model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])

    return model

In [6]:
deeplob = create_deeplob()
deeplob.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 100, 40, 1)  0           []                               
                                ]                                                                 
                                                                                                  
 conv2d_14 (Conv2D)             (None, 100, 20, 16)  48          ['input_2[0][0]']                
                                                                                                  
 leaky_re_lu_14 (LeakyReLU)     (None, 100, 20, 16)  0           ['conv2d_14[0][0]']              
                                                                                                  
 conv2d_15 (Conv2D)             (None, 100, 20, 16)  1040        ['leaky_re_lu_14[0][0]']   

In [None]:
trainX

In [None]:
number_of_lstm = 64

In [None]:
deeplob = create_deeplob(trainX.shape[1], trainX.shape[2], number_of_lstm)
deeplob.summary()