In [1]:
import pandas as pd
import ast
import numpy as np

In [2]:
cols_LOB = ["DateTime","Open","High","Low","Last","Volume","NumTrades","BidVolume","AskVolume","SumBid","SumAsk","BidPrices","BidVolumes","AskPrices","AskVolumes"]

col_list_LOB = ["BidPrices","BidVolumes","AskPrices","AskVolumes"]

In [3]:
def sigmoid(x):
    return 1/(1 + np.exp(-(x.astype(float))))

def string_to_nplist(x):
    if pd.isnull(x):
        return []
    else:
        return np.array(ast.literal_eval(x))

In [4]:
def read_data(file_name, col_names, col_list):
    data = pd.read_csv(file_name, names=col_names, delimiter="|")
    for col in col_list:
        data[col] = data[col].apply(lambda x: string_to_nplist(x))
    data["DateTime"] = pd.to_datetime(data["DateTime"])
    data.set_index(["DateTime"], inplace=True)
    return data

def clean_lob(data, weight_mid_price=0.5, cols_need=["BidPrices","BidVolumes","AskPrices","AskVolumes"], num_level=10):
    lst_valid_samples = []
    mid_prices = []
    for ind, row in data.iterrows():
        if len(row["BidPrices"]) and len(row["AskPrices"]):
            if (row["BidPrices"].shape[0] == num_level) and (row["AskPrices"].shape[0] == num_level):
                lst_valid_samples.append(ind)
                mid_p = weight_mid_price * row["BidPrices"][0] + (1 - weight_mid_price) * row["AskPrices"][0]
                mid_prices.append(mid_p)
    ret_data = pd.DataFrame(index=lst_valid_samples, data=data.loc[lst_valid_samples, cols_need])
    ret_data["Midprice"] = mid_prices
    return ret_data

In [5]:
lob_path = "./"
file_name = lob_path + "LOB_NQU22-CME_2_1_10_10level.lob"

data = read_data(file_name, cols_LOB, col_list_LOB)
print(data.shape)
data.head().T

(15614, 14)


DateTime,2022-07-07 00:00:00,2022-07-07 00:00:10,2022-07-07 00:00:20,2022-07-07 00:00:30,2022-07-07 00:00:40
Open,11897.5,11892.25,11879.5,11876.25,11879.5
High,11898.25,11892.75,11880.5,11882.5,11882.25
Low,11892.75,11878.25,11876.25,11876.0,11878.75
Last,11893.0,11880.0,11876.75,11880.25,11882.25
Volume,53.0,231.0,85.0,89.0,22.0
NumTrades,51.0,211.0,85.0,89.0,22.0
BidVolume,34.0,166.0,38.0,36.0,8.0
AskVolume,19.0,65.0,47.0,53.0,14.0
SumBid,29.0,22.0,34.0,14.0,14.0
SumAsk,20.0,18.0,24.0,19.0,26.0


In [7]:
data = data.loc[data.index[:2000]]
data.shape

(2000, 14)

In [9]:
def zscore_normalization(data, freq="5D", min_periods=4*12*60):
    data["AvgBidPrices"] = data["BidPrices"].apply(lambda x: np.mean(x))
    data["AvgBidVolumes"] = data["BidVolumes"].apply(lambda x: np.mean(x))
    data["AvgAskPrices"] = data["AskPrices"].apply(lambda x: np.mean(x))
    data["AvgAskVolumes"] = data["AskVolumes"].apply(lambda x: np.mean(x))
    data["MuBidPrice"] = data["AvgBidPrices"].rolling(window=freq, min_periods=min_periods).mean()
    data["STDBidPrice"] = data["AvgBidPrices"].rolling(window=freq, min_periods=min_periods).std()
    data["MuBidVolume"] = data["AvgBidVolumes"].rolling(window=freq, min_periods=min_periods).mean()
    data["STDBidVolume"] = data["AvgBidVolumes"].rolling(window=freq, min_periods=min_periods).std()
    data["MuAskPrice"] = data["AvgAskPrices"].rolling(window=freq, min_periods=min_periods).mean()
    data["STDAskPrice"] = data["AvgAskPrices"].rolling(window=freq, min_periods=min_periods).std()
    data["MuAskVolume"] = data["AvgAskVolumes"].rolling(window=freq, min_periods=min_periods).mean()
    data["STDAskVolume"] = data["AvgAskVolumes"].rolling(window=freq, min_periods=min_periods).std()
    data["ZscoreBidPrices"] = (data["BidPrices"] - data["MuBidPrice"]) / data["STDBidPrice"]
    data["ZscoreBidVolumes"] = (data["BidVolumes"] - data["MuBidVolume"]) / data["STDBidVolume"]
    data["ZscoreAskPrices"] = (data["AskPrices"] - data["MuAskPrice"]) / data["STDAskPrice"]
    data["ZscoreAskVolumes"] = (data["AskVolumes"] - data["MuAskVolume"]) / data["STDAskVolume"]
    data["ConcatLOB"] =  data[["ZscoreAskPrices", "ZscoreAskVolumes", "ZscoreBidPrices", "ZscoreBidVolumes"]].apply(lambda x: func_concat_deeplob(x), axis=1)

In [24]:
def my_func(a):
    all_items = np.concatenate(a.values)
    return np.mean(all_items), np.std(all_items)

def zscore_normalization(data, cols_need, freq="5D", min_periods=4*12*60):
    z_score_cols, stat_data = [], []
    for col in cols_need:
        rolling_col = data[col].rolling(window=freq, min_periods=min_periods)
        col_lst_mean_std = [my_func(a) for a in rolling_col]
        mu_col = "Mu" + col
        std_col = "Std" + col
        z_score_col = "Zscore" + col
        tmp_data = pd.DataFrame(data=col_lst_mean_std, columns=[mu_col, std_col], index=data.index)
        tmp_data.index = tmp_data.index.shift(1, freq="D")
        idx_intersect = list(set(tmp_data.index).intersection(set(data.index)))
        tmp_data[col] = np.nan
        tmp_data.loc[idx_intersect, col] = data.loc[idx_intersect, col]
        tmp_data[z_score_col] = (tmp_data[col] - tmp_data[mu_col]) / tmp_data[std_col]
        stat_data.append(tmp_data)
        z_score_cols.append(z_score_col)
    ret_data = pd.concat(stat_data, axis=1)
    return z_score_cols, ret_data


In [25]:
# data["ConcatLOB"] = data[z_score_cols].apply(lambda x: np.concatenate(x.values), axis=1)

In [27]:
z_score_cols, ret_data = zscore_normalization(data, col_list_LOB)
ret_data.head().T

DateTime,2022-07-08 00:00:00,2022-07-08 00:00:10,2022-07-08 00:00:20,2022-07-08 00:00:30,2022-07-08 00:00:40
MuBidPrices,11891.625,11885.125,11881.875,11881.125,11880.975
StdBidPrices,0.71807,6.539543,7.057428,6.258744,5.615214
BidPrices,,,,,
ZscoreBidPrices,,,,,
MuBidVolumes,2.9,2.55,2.833333,2.475,2.26
StdBidVolumes,2.256103,1.687454,1.86339,1.746246,1.634748
BidVolumes,,,,,
ZscoreBidVolumes,,,,,
MuAskPrices,11894.375,11887.875,11884.625,11883.875,11883.825
StdAskPrices,0.71807,6.539543,7.057428,6.258744,5.608086


In [30]:
ret_data.tail().T

DateTime,2022-07-08 05:41:20,2022-07-08 05:41:30,2022-07-08 05:41:40,2022-07-08 05:41:50,2022-07-08 05:42:00
MuBidPrices,11888.974248,11888.990348,11889.005943,11889.022211,11889.038688
StdBidPrices,22.802807,22.808447,22.813393,22.819282,22.825474
BidPrices,,,,,
ZscoreBidPrices,,,,,
MuBidVolumes,2.193337,2.193691,2.194294,2.194297,2.1952
StdBidVolumes,1.49811,1.499805,1.50148,1.501264,1.50406
BidVolumes,,,,,
ZscoreBidVolumes,,,,,
MuAskPrices,11891.857202,11891.873235,11891.889127,11891.905378,11891.921862
StdAskPrices,22.821305,22.826838,22.832177,22.838027,22.844215


In [31]:
ret_data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2000 entries, 2022-07-08 00:00:00 to 2022-07-08 05:42:00
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   MuBidPrices       2000 non-null   float64
 1   StdBidPrices      2000 non-null   float64
 2   BidPrices         0 non-null      float64
 3   ZscoreBidPrices   0 non-null      float64
 4   MuBidVolumes      2000 non-null   float64
 5   StdBidVolumes     2000 non-null   float64
 6   BidVolumes        0 non-null      float64
 7   ZscoreBidVolumes  0 non-null      float64
 8   MuAskPrices       2000 non-null   float64
 9   StdAskPrices      2000 non-null   float64
 10  AskPrices         0 non-null      float64
 11  ZscoreAskPrices   0 non-null      float64
 12  MuAskVolumes      2000 non-null   float64
 13  StdAskVolumes     2000 non-null   float64
 14  AskVolumes        0 non-null      float64
 15  ZscoreAskVolumes  0 non-null      float64
dtypes: flo

In [16]:
import pandas as pd

# create sample DataFrame with datetime index
df = pd.DataFrame({'col1': [1, 2, 3], 'col2': [4, 5, 6]}, index=pd.date_range('2022-05-01', periods=3))
print(df)

            col1  col2
2022-05-01     1     4
2022-05-02     2     5
2022-05-03     3     6


In [17]:
# shift the index by one day
df.index = df.index.shift(1, freq='D')

# print the resulting DataFrame
print(df)

            col1  col2
2022-05-02     1     4
2022-05-03     2     5
2022-05-04     3     6


In [None]:
data_cleaned = clean_lob(data)

data_cleaned.head()

In [34]:
import pandas as pd

# create sample DataFrame
df = pd.DataFrame({'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]})

# define custom function that returns two values
def my_func(series):
    # compute sum and mean of series
    return series.sum(), 0

# apply custom function to rolling window of size 3 to column A
df_rolling = df['A'].rolling(window=3).apply(lambda x: pd.Series(my_func(x)))

# extract sum column from the resulting DataFrame
df_sum = df_rolling[0]

# print the resulting DataFrame
print(df_sum)

TypeError: cannot convert the series to <class 'float'>

In [None]:
datt groupby by date 
for d in days:
    for i in range(ind_0(d), ind_n(d)-100, step=1):
        sample = X[i: i+100]
        ...