In [1]:
import polars as pl



In [2]:
import numpy as np
from numba import njit
import polars as pl
from hftbacktest import LOCAL_EVENT, EXCH_EVENT

@njit
def generate_order_latency_nb(data, order_latency, mul_entry, offset_entry, mul_resp, offset_resp):
    """Fill ``order_latency`` in place with timestamps derived from feed latency.

    For each row ``i`` of ``data`` the feed latency is ``local_ts - exch_ts``.
    The order entry latency is ``mul_entry * feed_latency + offset_entry`` and
    the order response latency is ``mul_resp * feed_latency + offset_resp``.

    Fields written to ``order_latency[i]``:
        req_ts:  the row's ``local_ts``.
        exch_ts: ``req_ts`` plus the entry latency.
        resp_ts: ``exch_ts`` plus the response latency.

    Args:
        data: record array exposing ``exch_ts`` and ``local_ts`` fields.
        order_latency: record array with ``req_ts``, ``exch_ts`` and ``resp_ts``
            fields, indexed for every row of ``data``.
        mul_entry, offset_entry: linear coefficients of the entry latency.
        mul_resp, offset_resp: linear coefficients of the response latency.

    Returns:
        None. The result is stored in ``order_latency``.
    """
    n_rows = len(data)
    for i in range(n_rows):
        row = data[i]
        sent_at = row.local_ts
        lag = sent_at - row.exch_ts

        # Both legs scale the same observed feed latency.
        arrived_at = sent_at + (mul_entry * lag + offset_entry)
        acked_at = arrived_at + (mul_resp * lag + offset_resp)

        order_latency[i].req_ts = sent_at
        order_latency[i].exch_ts = arrived_at
        order_latency[i].resp_ts = acked_at

def generate_order_latency(feed_file, output_file = None, mul_entry = 1, offset_entry = 0, mul_resp = 1, offset_resp = 0):
    """Build an order-latency array from the feed latency found in a feed file.

    The ``data`` array stored in ``feed_file`` is reduced to the rows whose
    ``ev`` field has both the ``EXCH_EVENT`` and ``LOCAL_EVENT`` bits set. Those
    rows are grouped on ``local_ts`` in windows of 1,000,000,000 integer units
    (one second if the timestamps are nanoseconds -- TODO confirm), keeping the
    last ``exch_ts`` and ``local_ts`` of each window. The order latency for each
    window is then computed by ``generate_order_latency_nb``.

    Args:
        feed_file: path of an ``.npz`` file holding a ``data`` array with at
            least ``ev``, ``exch_ts`` and ``local_ts`` fields.
        output_file: when not ``None``, the result is also written there with
            ``np.savez_compressed`` under the key ``data``.
        mul_entry, offset_entry: linear coefficients applied to the feed latency
            to obtain the order entry latency.
        mul_resp, offset_resp: linear coefficients applied to the feed latency
            to obtain the order response latency.

    Returns:
        A structured array with ``req_ts``, ``exch_ts``, ``resp_ts`` and
        ``_padding`` fields (all ``i8``), one row per window.
    """
    # np.load returns an NpzFile that keeps the archive open; read the array
    # inside a context manager so the file handle is released right away.
    with np.load(feed_file) as npz:
        data = npz['data']
    df = pl.DataFrame(data)

    df = df.filter(
        (pl.col('ev') & EXCH_EVENT == EXCH_EVENT) & (pl.col('ev') & LOCAL_EVENT == LOCAL_EVENT)
    ).with_columns(
        pl.col('local_ts').alias('ts')
    ).group_by_dynamic(
        'ts', every='1000000000i'
    ).agg(
        pl.col('exch_ts').last(),
        pl.col('local_ts').last()
    ).drop('ts')

    data = df.to_numpy(structured=True)

    # `_padding` is never written by the latency kernel and stays zero.
    order_latency = np.zeros(len(data), dtype=[('req_ts', 'i8'), ('exch_ts', 'i8'), ('resp_ts', 'i8'), ('_padding', 'i8')])
    generate_order_latency_nb(data, order_latency, mul_entry, offset_entry, mul_resp, offset_resp)

    if output_file is not None:
        np.savez_compressed(output_file, data=order_latency)

    return order_latency

## Getting started from Tardis.dev data

Few vendors offer tick-by-tick full market depth data along with snapshot and trade data, and Tardis.dev is among them.

<div class="alert alert-info">
    
**Note:** Some data may have an issue with the exchange timestamp. Ideally, the exchange timestamp should reflect the moment the event occurs at the matching engine. However, some data uses the server's data sent timestamp instead of the matching engine timestamp.

</div>

In [3]:
import os
import subprocess

import numpy as np
import polars as pl
from hftbacktest import EXCH_EVENT, LOCAL_EVENT
from hftbacktest.data.utils import tardis

# Define the year and the range of months you want to process
year = 2020
months = range(1, 8)  # From January (01) to July (07); range() excludes the stop value

# Order latency is modelled as a linear function of the feed latency:
# entry = mul_entry * feed_latency + offset_entry, and likewise for the response.
mul_entry = 4
offset_entry = 0

mul_resp = 3
offset_resp = 0

# Define base URLs for trades and book data with placeholders for year, month, and day
trade_base_url = "https://datasets.tardis.dev/v1/binance-futures/trades/{year}/{month:02d}/01/ETHUSDT.csv.gz"
book_base_url = "https://datasets.tardis.dev/v1/binance-futures/incremental_book_L2/{year}/{month:02d}/01/ETHUSDT.csv.gz"

# Loop through each month
for month in months:
    # Format URLs for the 1st day of the current month
    trade_url = trade_base_url.format(year=year, month=month)
    book_url = book_base_url.format(year=year, month=month)

    # Define the filenames for trade and book data
    trade_filename = f"ETHUSDT_trades_{year}{month:02d}01.csv.gz"
    book_filename = f"ETHUSDT_book_{year}{month:02d}01.csv.gz"

    # Download the trade and book data for the 1st day of the current month.
    # check=True stops the run on a failed download instead of passing an
    # empty or partial file on to the conversion step.
    subprocess.run(["wget", trade_url, "-O", trade_filename], check=True)
    subprocess.run(["wget", book_url, "-O", book_filename], check=True)

    # Convert the downloaded files using tardis (trade file first, then book file)
    output_filename = f"ethusdt_{year}{month:02d}01.npz"
    _ = tardis.convert(
        [trade_filename, book_filename],
        output_filename=output_filename,
        buffer_size=200_000_000
    )

    # Derive the order latency from the converted feed data and save it.
    feed_file = f'feed_latency_{year}{month:02d}01.npz'
    order_latency = generate_order_latency(
        output_filename,
        output_file=feed_file,
        mul_entry=mul_entry,
        offset_entry=offset_entry,
        mul_resp=mul_resp,
        offset_resp=offset_resp
    )

    # Sanity check: count rows whose entry or response latency is not positive.
    # A bare expression inside a loop displays nothing, so report explicitly.
    df_order_latency = pl.DataFrame(order_latency)
    order_entry_latency = df_order_latency['exch_ts'] - df_order_latency['req_ts']
    order_resp_latency = df_order_latency['resp_ts'] - df_order_latency['exch_ts']
    non_positive_entry = int((order_entry_latency <= 0).sum())
    non_positive_resp = int((order_resp_latency <= 0).sum())
    if non_positive_entry or non_positive_resp:
        print(
            f"Warning: {feed_file} has {non_positive_entry} non-positive entry latencies "
            f"and {non_positive_resp} non-positive response latencies"
        )

--2024-09-23 21:49:43--  https://datasets.tardis.dev/v1/binance-futures/trades/2020/01/01/ETHUSDT.csv.gz
Resolving datasets.tardis.dev (datasets.tardis.dev)... 2606:4700:4400::6812:28cd, 2606:4700:4400::ac40:9333, 172.64.147.51, ...
Connecting to datasets.tardis.dev (datasets.tardis.dev)|2606:4700:4400::6812:28cd|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 612298 (598K) [text/csv]
Saving to: ‘ETHUSDT_trades_20200101.csv.gz’

     0K .......... .......... .......... .......... ..........  8% 62.8M 0s
    50K .......... .......... .......... .......... .......... 16%  913K 0s
   100K .......... .......... .......... .......... .......... 25% 10.9M 0s
   150K .......... .......... .......... .......... .......... 33% 2.00M 0s
   200K .......... .......... .......... .......... .......... 41% 5.15M 0s
   250K .......... .......... .......... .......... .......... 50% 10.4M 0s
   300K .......... .......... .......... .......... .......... 58% 8.23M 0s
   350K .

Reading ETHUSDT_trades_20200101.csv.gz
Reading ETHUSDT_book_20200101.csv.gz
Correcting the latency
Correcting the event order
Saving to ethusdt_20200101.npz


FileNotFoundError: [Errno 2] No such file or directory: 'ethusdt_{year}{month:02d}01.npz'

It is recommended to input trade files before depth files. When a trade event and a depth event share the same timestamp, the sorting process prioritizes the event from the first input file. Listing the trade file first therefore places a trade event ahead of the depth event it causes, which could provide a more realistic fill during backtesting.