In [32]:
import pandas as pd
import numpy as np
import ast
import os

In [33]:
def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))``.

    Parameters
    ----------
    x : scalar, sequence, numpy.ndarray or pandas.Series
        Input values. Objects exposing ``astype`` (arrays, Series) are cast
        to float with it, so a Series in gives a Series out; anything else
        is converted with ``np.asarray``.

    Returns
    -------
    NumPy float / array, or a Series when a Series was passed, with values
    in ``[0, 1]``.
    """
    values = x.astype(float) if hasattr(x, "astype") else np.asarray(x, dtype=float)
    # logaddexp(0, -v) is log(1 + exp(-v)) computed without overflow, so very
    # negative inputs no longer trigger an overflow RuntimeWarning.
    return np.exp(-np.logaddexp(0.0, -values))

def string_to_nplist(x):
    """Parse the text form of a Python list literal into a NumPy array.

    Parameters
    ----------
    x : str or null
        Text such as ``"[1.5, 2.5]"``; it is evaluated with
        ``ast.literal_eval``. Values that ``pd.isnull`` reports as null
        (``None``, ``NaN``) are treated as a missing cell.

    Returns
    -------
    numpy.ndarray
        The parsed values, or an empty array when ``x`` is null.
    """
    if pd.isnull(x):
        # Return an array (not a list) so every result has the same type:
        # a plain list would fail in array arithmetic such as `value - mean`.
        return np.array([])
    return np.array(ast.literal_eval(x))

def read_data(file_name, col_names, col_list):
    """Load a pipe-delimited file into a frame indexed by its DateTime column.

    Parameters
    ----------
    file_name : path or file-like
        Source handed to ``pd.read_csv``.
    col_names : list of str
        Column names to assign; must include ``"DateTime"``.
    col_list : list of str
        Columns whose cells are converted with ``string_to_nplist``.

    Returns
    -------
    pandas.DataFrame
        The loaded data with ``"DateTime"`` parsed by ``pd.to_datetime`` and
        moved into the index.
    """
    frame = pd.read_csv(file_name, sep="|", names=col_names)
    for list_col in col_list:
        frame[list_col] = frame[list_col].apply(string_to_nplist)
    frame["DateTime"] = pd.to_datetime(frame["DateTime"])
    return frame.set_index(["DateTime"])

def clean_lob(data, cols_need, cols_check, weight_mid_price=0.5, num_level=10):
    """Keep rows whose two checked columns are complete and add a mid-price.

    A row is kept when both ``cols_check[0]`` and ``cols_check[1]`` hold a
    non-empty sequence of exactly ``num_level`` entries. For each kept row::

        Midprice = w * first[0] + (1 - w) * second[0]

    where ``first`` / ``second`` are the row's values in ``cols_check[0]`` /
    ``cols_check[1]`` and ``w`` is ``weight_mid_price``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose checked columns hold one sequence (list or array) per row.
    cols_need : list of str
        Columns copied into the result.
    cols_check : sequence of str
        The two columns used for the completeness test and the mid-price.
    weight_mid_price : float, default 0.5
        Weight given to the first entry of ``cols_check[0]``.
    num_level : int, default 10
        Required number of entries in each checked column.

    Returns
    -------
    pandas.DataFrame
        Copy of the kept rows restricted to ``cols_need``, plus a
        ``"Midprice"`` column. Row labels are those of ``data``.
    """
    first_col, second_col = cols_check[0], cols_check[1]
    keep_mask = []
    mid_prices = []
    for first_levels, second_levels in zip(data[first_col], data[second_col]):
        # len() works for lists and arrays alike, so an empty list placeholder
        # is rejected here without needing a `.shape` attribute.
        complete = (
            num_level > 0
            and len(first_levels) == num_level
            and len(second_levels) == num_level
        )
        keep_mask.append(complete)
        if complete:
            mid_prices.append(
                weight_mid_price * first_levels[0]
                + (1 - weight_mid_price) * second_levels[0]
            )
    # Select by position, not by label: with repeated index labels a
    # label-based lookup returns extra rows and the result cannot be built.
    ret_data = data.loc[np.asarray(keep_mask, dtype=bool), cols_need].copy()
    ret_data["Midprice"] = mid_prices
    return ret_data

def my_func(a):
    """Return ``(mean, std)`` of all elements pooled from the arrays in ``a``.

    ``a.values`` must be a non-empty sequence of arrays; they are joined end
    to end before the statistics are taken. The standard deviation uses
    NumPy's default (``ddof=0``).
    """
    pooled = np.concatenate(a.values)
    pooled_mean = np.mean(pooled)
    pooled_std = np.std(pooled)
    return pooled_mean, pooled_std

def zscore_normalization(data, cols_need, freq="5H", min_periods=4*12*60):
    """Z-score list-valued columns against lagged rolling statistics.

    For every column in ``cols_need``:

    1. iterate over the time windows of length ``freq`` and pool all array
       entries of a window into one mean and one std (``my_func``);
    2. move those statistics one hour forward, so the statistics applied at
       timestamp ``t`` were computed from the window ending at ``t - 1h``;
    3. keep the moved timestamps that also exist in ``data`` and compute
       ``(value - mean) / std`` for each row's array.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a datetime-like index (needed for a time-based window)
        whose ``cols_need`` columns hold one array per row.
    cols_need : list of str
        Columns to normalise.
    freq : str, default "5H"
        Window length passed to ``rolling``.
    min_periods : int, default 4*12*60
        Passed to ``rolling``. NOTE(review): the windows are obtained by
        iterating the Rolling object, which appears to yield every window
        regardless of ``min_periods`` (results start one hour after the
        first row) - confirm whether short warm-up windows should be dropped.

    Returns
    -------
    (list of str, pandas.DataFrame)
        Names of the ``"Zscore" + col`` columns and a frame holding them.
        A zero std in a window gives inf/NaN entries; this is not guarded.
    """
    lag = pd.Timedelta(hours=1)
    z_score_cols, stat_data = [], []
    for col in cols_need:
        rolling_col = data[col].rolling(window=freq, min_periods=min_periods)
        col_lst_mean_std = [my_func(a) for a in rolling_col]
        mu_col = "Mu" + col
        std_col = "Std" + col
        z_score_col = "Zscore" + col
        tmp_data = pd.DataFrame(data=col_lst_mean_std, columns=[mu_col, std_col], index=data.index)
        # A Timedelta gives the same one-hour move as the "H" alias without
        # relying on alias spelling, which newer pandas versions deprecate.
        tmp_data.index = tmp_data.index.shift(1, freq=lag)
        # Align the raw arrays onto the moved timestamps in one step. This
        # avoids writing arrays into a float NaN column (an incompatible-dtype
        # setitem); timestamps missing from `data` become NaN, dropped below.
        tmp_data[col] = data[col].reindex(tmp_data.index)
        tmp_data = tmp_data.dropna()
        tmp_data[z_score_col] = (tmp_data[col] - tmp_data[mu_col]) / tmp_data[std_col]
        tmp_data = tmp_data[[z_score_col]]
        stat_data.append(tmp_data)
        z_score_cols.append(z_score_col)
    ret_data = pd.concat(stat_data, axis=1)
    return z_score_cols, ret_data

In [37]:
# Column layout of the .lob file, in file order.
cols_LOB = [
    "DateTime", "Open", "High", "Low", "Last", "Volume", "NumTrades",
    "BidVolume", "AskVolume", "SumBid", "SumAsk",
    "BidPrices", "BidVolumes", "AskPrices", "AskVolumes",
]
# Columns stored as list literals that must be parsed into arrays.
col_list_LOB = ["BidPrices", "BidVolumes", "AskPrices", "AskVolumes"]

lob_path = "./"
file_name = os.path.join(lob_path, "LOB_NQU22-CME_2_1_10_10level.lob")

# Number of leading rows kept for this experiment.
max_rows = 5000

data = read_data(file_name, cols_LOB, col_list_LOB)
print(data.shape)
# Truncate by position: selecting by the first N labels would return extra
# rows if a timestamp ever repeats.
data = data.iloc[:max_rows].copy()
print(data.shape)

(15614, 14)
(5000, 14)


In [38]:
# Normalise the four book columns, then keep only rows whose bid and ask
# price ladders are complete.
z_score_cols, ret_data = zscore_normalization(data, col_list_LOB)

# z_score_cols follows the order of col_list_LOB, so positions 0 and 2 are
# the z-scored bid prices and ask prices.
bid_price_col, ask_price_col = z_score_cols[0], z_score_cols[2]
cols_check = [bid_price_col, ask_price_col]
data_cleaned = clean_lob(ret_data, z_score_cols, cols_check)
print(data_cleaned.head())

                                                       ZscoreBidPrices   
2022-07-07 01:00:00  [-10.966892325208963, -11.31504763712036, -11....  \
2022-07-07 01:00:10  [-0.8219228548918058, -0.8601518248867736, -0....   
2022-07-07 01:00:20  [-1.0449982450788726, -1.08042191440358, -1.11...   
2022-07-07 01:00:30  [-2.176954394281747, -2.2168985116080178, -2.2...   
2022-07-07 01:00:40  [-2.9339934066987317, -2.9785153096835373, -3....   

                                                      ZscoreBidVolumes   
2022-07-07 01:00:00  [-0.8421601936380788, -0.8421601936380788, -0....  \
2022-07-07 01:00:10  [-0.9185437189662375, -0.9185437189662375, -0....   
2022-07-07 01:00:20  [-0.9838699100999077, -0.4472135954999581, -0....   
2022-07-07 01:00:30  [-0.8446690915113089, 0.30064493087690647, -0....   
2022-07-07 01:00:40  [-0.15904595068400093, 0.4526692442544647, 0.4...   

                                                       ZscoreAskPrices   
2022-07-07 01:00:00  [-14.1002901324

In [39]:
# Join the z-scored arrays of each row, in z_score_cols order, into a single
# feature vector per row.
lob_vectors = data_cleaned[z_score_cols].apply(lambda row: np.concatenate(row.values), axis=1)
data_cleaned["ConcatLOB"] = lob_vectors

# Inspect the last row's vector: its shape first, then its values.
last_vector = data_cleaned["ConcatLOB"].iloc[-1]
print(last_vector.shape)
print(last_vector)

(40,)
[ 6.95855473  6.93591802  6.9132813   6.89064458  6.86800786  6.84537115
  6.82273443  6.80009771  6.77746099  6.75482427 -0.91506907 -0.04382868
  1.26303192  1.69865211  1.26303192  2.13427231  2.56989251  2.56989251
  3.8767531   2.56989251  6.75709565  6.77977151  6.80244737  6.82512323
  6.84779908  6.87047494  6.8931508   6.91582666  6.93850252  6.96117838
 -0.5003192   0.85766126  0.40500111  0.85766126  1.31032142  0.85766126
  1.31032142  0.85766126  1.31032142  1.76298157]


In [43]:
# Horizon length, counted in rows of data_cleaned.
prediction_horizon = 60

# m_minus: mean Midprice over the current row and the rows before it
# (prediction_horizon rows in total; NaN until enough rows exist).
trailing_mean = data_cleaned["Midprice"].rolling(window=prediction_horizon).mean()
data_cleaned["m_minus"] = trailing_mean

# m_plus: the same mean read prediction_horizon rows ahead, i.e. the mean of
# the next prediction_horizon rows (NaN at the end of the frame).
data_cleaned["m_plus"] = trailing_mean.shift(-prediction_horizon)
data_cleaned.tail().T

Unnamed: 0,2022-07-07 14:03:00,2022-07-07 14:03:10,2022-07-07 14:03:20,2022-07-07 14:03:30,2022-07-07 14:03:40
ZscoreBidPrices,"[6.799000262128179, 6.776366442992226, 6.75373...","[7.115983684469852, 7.0933495957224615, 7.0707...","[7.161523027513145, 7.138888280765313, 7.11625...","[7.365495256027578, 7.342859866605833, 7.32022...","[6.958554733968971, 6.935918016269945, 6.91328..."
ZscoreBidVolumes,"[-0.04256691743525577, -0.04256691743525577, 0...","[0.8286413568259139, -0.9141822834083645, 0.82...","[0.39287845815094125, 1.2643252140569359, 0.82...","[-0.4790590604583265, -0.04339654444435253, 0....","[-0.9150690710981855, -0.04382867650062303, 1...."
ZscoreAskPrices,"[6.597189952622149, 6.6198626422075355, 6.6425...","[6.914713844349253, 6.9373867995408505, 6.9600...","[6.96044047708142, 6.983114384785773, 7.005788...","[7.164756844426476, 7.187431387379555, 7.21010...","[6.75709564773253, 6.779771507022835, 6.802447..."
ZscoreAskVolumes,"[-0.4994746915413527, -0.04730541159182592, -0...","[-0.04777996356818906, 0.8564917008029097, 1.7...","[-0.9522690966736874, -0.047766973557974976, -...","[-0.04783539300781414, 0.4045867187555654, 0.8...","[-0.5003191971745847, 0.8576612639915548, 0.40..."
Midprice,6.698095,7.015349,7.060982,7.265126,6.857825
ConcatLOB,"[6.799000262128179, 6.776366442992226, 6.75373...","[7.115983684469852, 7.0933495957224615, 7.0707...","[7.161523027513145, 7.138888280765313, 7.11625...","[7.365495256027578, 7.342859866605833, 7.32022...","[6.958554733968971, 6.935918016269945, 6.91328..."
m_minus,9.162353,9.131124,9.09501,9.0578,9.021094
m_plus,,,,,


In [44]:
# First rows, transposed so each timestamp is a column.
data_cleaned.head().transpose()

Unnamed: 0,2022-07-07 01:00:00,2022-07-07 01:00:10,2022-07-07 01:00:20,2022-07-07 01:00:30,2022-07-07 01:00:40
ZscoreBidPrices,"[-10.966892325208963, -11.31504763712036, -11....","[-0.8219228548918058, -0.8601518248867736, -0....","[-1.0449982450788726, -1.08042191440358, -1.11...","[-2.176954394281747, -2.2168985116080178, -2.2...","[-2.9339934066987317, -2.9785153096835373, -3...."
ZscoreBidVolumes,"[-0.8421601936380788, -0.8421601936380788, -0....","[-0.9185437189662375, -0.9185437189662375, -0....","[-0.9838699100999077, -0.4472135954999581, -0....","[-0.8446690915113089, 0.30064493087690647, -0....","[-0.15904595068400093, 0.4526692442544647, 0.4..."
ZscoreAskPrices,"[-14.100290132411525, -13.75213482050013, -13....","[-1.1659835848465152, -1.1277546148515476, -1....","[-1.3638112690012405, -1.3283875996765329, -1....","[-2.5364514502181823, -2.4965073328919116, -2....","[-3.356760412096845, -3.312181920568336, -3.26..."
ZscoreAskVolumes,"[1.2909944487358056, -1.2909944487358056, 0.0,...","[0.13018891098082397, 0.13018891098082397, 1.4...","[-0.06274558051381605, -1.0039292882210538, -1...","[-0.022593461946228327, 1.7848834937520444, -0...","[-0.11665046633795345, -0.11665046633795345, 0..."
Midprice,-12.533591,-0.993953,-1.204405,-2.356703,-3.145377
ConcatLOB,"[-10.966892325208963, -11.31504763712036, -11....","[-0.8219228548918058, -0.8601518248867736, -0....","[-1.0449982450788726, -1.08042191440358, -1.11...","[-2.176954394281747, -2.2168985116080178, -2.2...","[-2.9339934066987317, -2.9785153096835373, -3...."
m_minus,,,,,
m_plus,-3.508181,-3.519689,-3.52905,-3.52637,-3.520625


In [45]:
# Remove every row that has a missing value in any column; this drops the
# leading rows without m_minus and the trailing rows without m_plus.
data_cleaned = data_cleaned.dropna(axis=0, how="any")
data_cleaned.shape

(4469, 8)

In [57]:
# Relative change of the forward mean (m_plus) with respect to
#   label_3: the current Midprice
#   label_4: the trailing mean (m_minus)
# NOTE(review): Midprice is built from z-scored prices here, so it can be
# negative or close to zero; dividing by it may make these ratios unstable -
# confirm this is intended.
forward_mean = data_cleaned["m_plus"]
current_mid = data_cleaned["Midprice"]
trailing_mean = data_cleaned["m_minus"]
data_cleaned["label_3"] = forward_mean.sub(current_mid).div(current_mid)
data_cleaned["label_4"] = forward_mean.sub(trailing_mean).div(trailing_mean)
data_cleaned.tail().T

Unnamed: 0,2022-07-07 13:53:00,2022-07-07 13:53:10,2022-07-07 13:53:20,2022-07-07 13:53:30,2022-07-07 13:53:40
ZscoreBidPrices,"[8.83184448196089, 8.809025348507737, 8.786206...","[8.989609419761655, 8.966792542780565, 8.94397...","[9.305207168373194, 9.282396680183234, 9.25958...","[9.597663112130787, 9.574859361553315, 9.55205...","[9.160453323339766, 9.137656467243026, 9.11485..."
ZscoreBidVolumes,"[-0.9013984447431412, -0.46087827963873806, 1....","[-0.46108631745742445, -0.020554864029647874, ...","[-0.46142336035606074, 0.4197210896168101, 0.4...","[-0.02087804616710578, -0.46150050111565893, 0...","[-0.9022515907724464, -0.020952924852332314, 1..."
ZscoreAskPrices,"[8.630504017191216, 8.65335950701698, 8.676214...","[8.788631155261942, 8.811484617859849, 8.83433...","[9.150422458430542, 9.173269386651288, 9.19611...","[9.397727808720106, 9.420567987925734, 9.44340...","[8.959977692989005, 8.982811203763553, 9.00564..."
ZscoreAskVolumes,"[-0.02753235696350738, 1.3460263404381427, 0.4...","[1.3458437378939903, 1.3458437378939903, 1.345...","[0.8868034713814793, 2.2597015272397125, 8.666...","[-0.9440582272606038, -0.029064796883633642, 1...","[0.4279043547223779, 1.8001730668491398, 1.800..."
Midprice,8.731174,8.88912,9.227815,9.497695,9.060216
ConcatLOB,"[8.83184448196089, 8.809025348507737, 8.786206...","[8.989609419761655, 8.966792542780565, 8.94397...","[9.305207168373194, 9.282396680183234, 9.25958...","[9.597663112130787, 9.574859361553315, 9.55205...","[9.160453323339766, 9.137656467243026, 9.11485..."
m_minus,8.574187,8.594837,8.616163,8.635488,8.652895
m_plus,9.162353,9.131124,9.09501,9.0578,9.021094
label_3,0.049384,0.027225,-0.014392,-0.046316,-0.004318
label_4,0.068597,0.062396,0.055575,0.048904,0.042552


In [58]:
# Turn each continuous label into a class: 1 above the threshold, -1 below
# its negative, 0 otherwise.
alpha_threshold = 0.1
for label_col, alpha_col in (("label_3", "alpha_3"), ("label_4", "alpha_4")):
    data_cleaned[alpha_col] = 0
    data_cleaned.loc[data_cleaned[label_col] > alpha_threshold, alpha_col] = 1
    data_cleaned.loc[data_cleaned[label_col] < -alpha_threshold, alpha_col] = -1

data_cleaned.tail().T

Unnamed: 0,2022-07-07 13:53:00,2022-07-07 13:53:10,2022-07-07 13:53:20,2022-07-07 13:53:30,2022-07-07 13:53:40
ZscoreBidPrices,"[8.83184448196089, 8.809025348507737, 8.786206...","[8.989609419761655, 8.966792542780565, 8.94397...","[9.305207168373194, 9.282396680183234, 9.25958...","[9.597663112130787, 9.574859361553315, 9.55205...","[9.160453323339766, 9.137656467243026, 9.11485..."
ZscoreBidVolumes,"[-0.9013984447431412, -0.46087827963873806, 1....","[-0.46108631745742445, -0.020554864029647874, ...","[-0.46142336035606074, 0.4197210896168101, 0.4...","[-0.02087804616710578, -0.46150050111565893, 0...","[-0.9022515907724464, -0.020952924852332314, 1..."
ZscoreAskPrices,"[8.630504017191216, 8.65335950701698, 8.676214...","[8.788631155261942, 8.811484617859849, 8.83433...","[9.150422458430542, 9.173269386651288, 9.19611...","[9.397727808720106, 9.420567987925734, 9.44340...","[8.959977692989005, 8.982811203763553, 9.00564..."
ZscoreAskVolumes,"[-0.02753235696350738, 1.3460263404381427, 0.4...","[1.3458437378939903, 1.3458437378939903, 1.345...","[0.8868034713814793, 2.2597015272397125, 8.666...","[-0.9440582272606038, -0.029064796883633642, 1...","[0.4279043547223779, 1.8001730668491398, 1.800..."
Midprice,8.731174,8.88912,9.227815,9.497695,9.060216
ConcatLOB,"[8.83184448196089, 8.809025348507737, 8.786206...","[8.989609419761655, 8.966792542780565, 8.94397...","[9.305207168373194, 9.282396680183234, 9.25958...","[9.597663112130787, 9.574859361553315, 9.55205...","[9.160453323339766, 9.137656467243026, 9.11485..."
m_minus,8.574187,8.594837,8.616163,8.635488,8.652895
m_plus,9.162353,9.131124,9.09501,9.0578,9.021094
label_3,0.049384,0.027225,-0.014392,-0.046316,-0.004318
label_4,0.068597,0.062396,0.055575,0.048904,0.042552


In [59]:
# Class balance of both label variants.
for alpha_col in ("alpha_3", "alpha_4"):
    print(data_cleaned[alpha_col].value_counts())


alpha_3
-1    1809
 1    1462
 0    1198
Name: count, dtype: int64
alpha_4
-1    2067
 1    1676
 0     726
Name: count, dtype: int64


In [16]:
import pandas as pd

# Build a small demo frame on a three-day DatetimeIndex.
demo_index = pd.date_range('2022-05-01', periods=3)
demo_values = {'col1': [1, 2, 3], 'col2': [4, 5, 6]}
df = pd.DataFrame(demo_values, index=demo_index)
print(df)

            col1  col2
2022-05-01     1     4
2022-05-02     2     5
2022-05-03     3     6


In [17]:
# Move every index label one day forward; the row values stay where they are.
# Re-running this cell shifts the index again.
shifted_index = df.index.shift(periods=1, freq='D')
df.index = shifted_index

# Show the frame with its shifted index.
print(df)

            col1  col2
2022-05-02     1     4
2022-05-03     2     5
2022-05-04     3     6


In [None]:
# TODO: group the data by date, then slide a 100-row window within each day.
# Sketch: for every day, take consecutive 100-row samples from X.
# `days`, `ind_0`, `ind_n` and `X` are not defined in the cells shown here.
for d in days:
    # range() takes positional arguments only; `step=1` as a keyword raises
    # TypeError, so the step is passed positionally.
    for i in range(ind_0(d), ind_n(d)-100, 1):
        sample = X[i: i+100]
        ...