In [1]:
import pandas as pd
import ast
import numpy as np

In [2]:
cols_LOB = ["DateTime","Open","High","Low","Last","Volume","NumTrades","BidVolume","AskVolume","SumBid","SumAsk","BidPrices","BidVolumes","AskPrices","AskVolumes"]

col_list_LOB = ["BidPrices","BidVolumes","AskPrices","AskVolumes"]

In [3]:
def sigmoid(x):
    return 1/(1 + np.exp(-(x.astype(float))))

def string_to_nplist(x):
    if pd.isnull(x):
        return []
    else:
        return np.array(ast.literal_eval(x))

In [4]:
def read_data(file_name, col_names, col_list):
    data = pd.read_csv(file_name, names=col_names, delimiter="|")
    for col in col_list:
        data[col] = data[col].apply(lambda x: string_to_nplist(x))
    data["DateTime"] = pd.to_datetime(data["DateTime"])
    data.set_index(["DateTime"], inplace=True)
    return data

def clean_lob(data, weight_mid_price=0.5, cols_need=["BidPrices","BidVolumes","AskPrices","AskVolumes"], num_level=10):
    lst_valid_samples = []
    mid_prices = []
    for ind, row in data.iterrows():
        if len(row["BidPrices"]) and len(row["AskPrices"]):
            if (row["BidPrices"].shape[0] == num_level) and (row["AskPrices"].shape[0] == num_level):
                lst_valid_samples.append(ind)
                mid_p = weight_mid_price * row["BidPrices"][0] + (1 - weight_mid_price) * row["AskPrices"][0]
                mid_prices.append(mid_p)
    ret_data = pd.DataFrame(index=lst_valid_samples, data=data.loc[lst_valid_samples, cols_need])
    ret_data["Midprice"] = mid_prices
    return ret_data

In [10]:
lob_path = "./"
file_name = lob_path + "LOB_NQU22-CME_2_1_10_10level.lob"

data = read_data(file_name, cols_LOB, col_list_LOB)
data.head()

Unnamed: 0_level_0,Open,High,Low,Last,Volume,NumTrades,BidVolume,AskVolume,SumBid,SumAsk,BidPrices,BidVolumes,AskPrices,AskVolumes
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2022-07-07 00:00:00,11897.5,11898.25,11892.75,11893.0,53.0,51.0,34.0,19.0,29.0,20.0,"[11892.75, 11892.5, 11892.25, 11892.0, 11891.7...","[1, 2, 2, 3, 1, 4, 1, 9, 3, 3]","[11893.25, 11893.5, 11893.75, 11894.0, 11894.2...","[1, 2, 3, 1, 1, 2, 2, 2, 3, 3]"
2022-07-07 00:00:10,11892.25,11892.75,11878.25,11880.0,231.0,211.0,166.0,65.0,22.0,18.0,"[11879.75, 11879.5, 11879.25, 11879.0, 11878.7...","[1, 2, 2, 3, 2, 2, 3, 3, 2, 2]","[11880.25, 11880.5, 11880.75, 11881.0, 11881.2...","[1, 1, 2, 2, 3, 3, 1, 2, 1, 2]"
2022-07-07 00:00:20,11879.5,11880.5,11876.25,11876.75,85.0,85.0,38.0,47.0,34.0,24.0,"[11876.5, 11876.25, 11876.0, 11875.75, 11875.5...","[1, 1, 6, 1, 3, 5, 4, 7, 2, 4]","[11877.0, 11877.25, 11877.5, 11877.75, 11878.0...","[2, 4, 1, 1, 3, 2, 5, 1, 4, 1]"
2022-07-07 00:00:30,11876.25,11882.5,11876.0,11880.25,89.0,89.0,36.0,53.0,14.0,19.0,"[11880.0, 11879.75, 11879.5, 11879.25, 11879.0...","[2, 2, 1, 1, 2, 1, 1, 2, 1, 1]","[11880.5, 11880.75, 11881.0, 11881.25, 11881.5...","[3, 4, 1, 2, 1, 1, 1, 1, 4, 1]"
2022-07-07 00:00:40,11879.5,11882.25,11878.75,11882.25,22.0,22.0,8.0,14.0,14.0,26.0,"[11881.5, 11881.25, 11881.0, 11880.75, 11880.5...","[1, 2, 2, 1, 1, 2, 1, 1, 1, 2]","[11882.5, 11882.75, 11883.0, 11883.25, 11883.5...","[5, 1, 1, 4, 1, 4, 2, 2, 2, 4]"


In [11]:
data_cleaned = clean_lob(data)

data_cleaned.head()

Unnamed: 0,BidPrices,BidVolumes,AskPrices,AskVolumes,Midprice
2022-07-07 00:00:00,"[11892.75, 11892.5, 11892.25, 11892.0, 11891.7...","[1, 2, 2, 3, 1, 4, 1, 9, 3, 3]","[11893.25, 11893.5, 11893.75, 11894.0, 11894.2...","[1, 2, 3, 1, 1, 2, 2, 2, 3, 3]",11893.0
2022-07-07 00:00:10,"[11879.75, 11879.5, 11879.25, 11879.0, 11878.7...","[1, 2, 2, 3, 2, 2, 3, 3, 2, 2]","[11880.25, 11880.5, 11880.75, 11881.0, 11881.2...","[1, 1, 2, 2, 3, 3, 1, 2, 1, 2]",11880.0
2022-07-07 00:00:20,"[11876.5, 11876.25, 11876.0, 11875.75, 11875.5...","[1, 1, 6, 1, 3, 5, 4, 7, 2, 4]","[11877.0, 11877.25, 11877.5, 11877.75, 11878.0...","[2, 4, 1, 1, 3, 2, 5, 1, 4, 1]",11876.75
2022-07-07 00:00:30,"[11880.0, 11879.75, 11879.5, 11879.25, 11879.0...","[2, 2, 1, 1, 2, 1, 1, 2, 1, 1]","[11880.5, 11880.75, 11881.0, 11881.25, 11881.5...","[3, 4, 1, 2, 1, 1, 1, 1, 4, 1]",11880.25
2022-07-07 00:00:40,"[11881.5, 11881.25, 11881.0, 11880.75, 11880.5...","[1, 2, 2, 1, 1, 2, 1, 1, 1, 2]","[11882.5, 11882.75, 11883.0, 11883.25, 11883.5...","[5, 1, 1, 4, 1, 4, 2, 2, 2, 4]",11882.0


In [39]:
def zscore_normalization(data, freq="5D", min_periods=4*12*60):
    data["AvgBidPrices"] = data["BidPrices"].apply(lambda x: np.mean(x))
    data["AvgBidVolumes"] = data["BidVolumes"].apply(lambda x: np.mean(x))
    data["AvgAskPrices"] = data["AskPrices"].apply(lambda x: np.mean(x))
    data["AvgAskVolumes"] = data["AskVolumes"].apply(lambda x: np.mean(x))
    data["MuBidPrice"] = data["AvgBidPrices"].rolling(window=freq, min_periods=min_periods).mean()
    data["STDBidPrice"] = data["AvgBidPrices"].rolling(window=freq, min_periods=min_periods).std()
    data["MuBidVolume"] = data["AvgBidVolumes"].rolling(window=freq, min_periods=min_periods).mean()
    data["STDBidVolume"] = data["AvgBidVolumes"].rolling(window=freq, min_periods=min_periods).std()
    data["MuAskPrice"] = data["AvgAskPrices"].rolling(window=freq, min_periods=min_periods).mean()
    data["STDAskPrice"] = data["AvgAskPrices"].rolling(window=freq, min_periods=min_periods).std()
    data["MuAskVolume"] = data["AvgAskVolumes"].rolling(window=freq, min_periods=min_periods).mean()
    data["STDAskVolume"] = data["AvgAskVolumes"].rolling(window=freq, min_periods=min_periods).std()
    data["ZscoreBidPrices"] = (data["BidPrices"] - data["MuBidPrice"]) / data["STDBidPrice"]
    data["ZscoreBidVolumes"] = (data["BidVolumes"] - data["MuBidVolume"]) / data["STDBidVolume"]
    data["ZscoreAskPrices"] = (data["AskPrices"] - data["MuAskPrice"]) / data["STDAskPrice"]
    data["ZscoreAskVolumes"] = (data["AskVolumes"] - data["MuAskVolume"]) / data["STDAskVolume"]
    data["ConcatLOB"] =  data[["ZscoreAskPrices", "ZscoreAskVolumes", "ZscoreBidPrices", "ZscoreBidVolumes"]].apply(lambda x: func_concat_deeplob(x), axis=1)

def my_func(a):
    all_items = np.concatenate(a.values)
    return np.mean(all_items), np.std(all_items)

In [48]:
def my_func(a):
    all_items = np.concatenate(a.values)
    return np.mean(all_items), np.std(all_items)

def zscore_normalization(data, cols_need, freq="5D", min_periods=4*12*60):
    z_score_cols = []
    for col in cols_need:
        rolling_col = data[col].rolling(window=freq, min_periods=min_periods)
        col_lst_mean_std = [my_func(a) for a in rolling_col]
        mu_col = "Mu" + col
        std_col = "Std" + col
        z_score_col = "Zscore" + col
        data[[mu_col, std_col]] = col_lst_mean_std
        data[[mu_col, std_col]] = data[[mu_col, std_col]].index.shift(-1, freq='D')
        data[z_score_col] = (data[col] - data[mu_col]) / data[std_col]
        z_score_cols.append(z_score_col)
    data["ConcatLOB"] = data[z_score_cols].apply(lambda x: np.concatenate(x.values), axis=1)


In [49]:
zscore_normalization(data, col_list_LOB)
data.head().T

In [34]:
import pandas as pd

# create sample DataFrame
df = pd.DataFrame({'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]})

# define custom function that returns two values
def my_func(series):
    # compute sum and mean of series
    return series.sum(), 0

# apply custom function to rolling window of size 3 to column A
df_rolling = df['A'].rolling(window=3).apply(lambda x: pd.Series(my_func(x)))

# extract sum column from the resulting DataFrame
df_sum = df_rolling[0]

# print the resulting DataFrame
print(df_sum)

TypeError: cannot convert the series to <class 'float'>

In [None]:
datt groupby by date 
for d in days:
    for i in range(ind_0(d), ind_n(d)-100, step=1):
        sample = X[i: i+100]
        ...