In [113]:
import polars as pl
import mnemosyne as ms
import lz4.frame
import boto3
from datetime import date as Date 
from pprint import pprint
import json
from pathlib import Path
from tqdm.auto import tqdm

In [114]:
def read_hyperliquid_l2book_lz4(symbol_file: Path, symbol: str) -> pl.DataFrame:
    """Load one LZ4-compressed Hyperliquid ``l2Book`` file into a long-format frame.

    The file is read and decompressed fully in memory, then parsed as
    newline-delimited JSON. Every price level of every record becomes one
    output row.

    Args:
        symbol_file: Path of the LZ4-frame-compressed file to read.
        symbol: Coin name that every record's ``raw.data.coin`` must equal.

    Returns:
        A DataFrame with one row per (record, side, level) and the columns

        * ``time`` -- the record's top-level ``time`` field cast to Datetime,
        * ``price`` -- the level's ``px`` as Float64,
        * ``csize`` -- running sum of ``sz`` from the first level of that side
          down to and including this level (Float64),
        * ``depth`` -- zero-based index of the level within its side (Int16),
        * ``num_orders_at_level`` -- the level's ``n`` (Int16),
        * ``created_time`` -- ``raw.data.time`` read as epoch milliseconds,
        * ``is_bid`` -- True for rows from ``levels[0]``, False for ``levels[1]``.

    Raises:
        AssertionError: If a record has ``ver_num != 1``, does not carry exactly
            two level lists, belongs to a different coin than ``symbol``, or
            comes from a channel other than ``'l2Book'``.
    """
    columns = {
        'time': [],
        'timestamp': [],
        'is_bid': [],
        'price': [],
        'csize': [],
        'depth': [],
        'num_orders_at_level': [],
    }

    with open(symbol_file, 'rb') as file:
        payload = lz4.frame.decompress(file.read())

    for line in payload.strip().split(b'\n'):
        if not line:  # Skip empty lines
            continue
        record = json.loads(line)
        book = record['raw']['data']
        levels = book['levels']

        assert record['ver_num'] == 1, f"unexpected ver_num {record['ver_num']!r}"
        assert len(levels) == 2, f"expected 2 level lists, got {len(levels)}"
        assert book['coin'] == symbol, f"expected coin {symbol!r}, got {book['coin']!r}"
        assert record['raw']['channel'] == 'l2Book', f"unexpected channel {record['raw']['channel']!r}"

        # The first level list is treated as the bid side and the second as the
        # ask side. NOTE(review): confirm this ordering against the feed docs.
        for side, side_levels in enumerate(levels):
            is_bid = side == 0
            cumulative_size = 0.0  # restarts for each side of each record
            for depth, level in enumerate(side_levels):
                cumulative_size += float(level['sz'])
                columns['time'].append(record['time'])
                columns['timestamp'].append(book['time'])
                columns['is_bid'].append(is_bid)
                columns['price'].append(level['px'])
                columns['depth'].append(depth)
                columns['csize'].append(cumulative_size)
                columns['num_orders_at_level'].append(level['n'])

    return pl.DataFrame(columns).select(
        pl.col('time').cast(pl.Datetime),
        pl.col('price').cast(pl.Float64),
        pl.col('csize').cast(pl.Float64),
        pl.col('depth').cast(pl.Int16),
        pl.col('num_orders_at_level').cast(pl.Int16),
        pl.from_epoch(pl.col('timestamp'), time_unit='ms').alias('created_time'),
        # Appended last so existing column positions are unchanged. Without it
        # bid and ask rows (which share depth indices) cannot be told apart.
        pl.col('is_bid').cast(pl.Boolean),
    )

In [115]:
# Look up where the Hyperliquid perp L2 dataset keeps its raw and hive data.
dataset_type = ms.DatasetType.HyperliquidPerpL2
# NOTE(review): both lookups are given an empty string; presumably that selects
# the dataset's base directory rather than a sub-path -- confirm against mnemosyne.
raw_path, hive_path = (
    Path(location)
    for location in (dataset_type.raw_data_path(""), dataset_type.hive_path(""))
)
print(raw_path, hive_path)

/bigdata/mnemosyne/hyperliquid/raw/futures/market_data /bigdata/mnemosyne/hyperliquid/lossless/l2


In [116]:
# Submit this list to Rust.
date = Date(2025, 9, 30)

# Files are looked up as <raw_path>/<YYYYMMDD>/<hour>/l2Book/<symbol>.lz4,
# with the hour written without zero padding.
date_dir = raw_path / date.strftime('%Y%m%d')

symbol_frames = []
for hour in tqdm(range(24)):
    hour_path = date_dir / str(hour) / 'l2Book'
    # is_file must be *called*: the bare bound method is always truthy, which
    # silently let directories through. Sorting keeps row order deterministic
    # across runs, since glob order depends on the filesystem.
    symbol_files = sorted(
        entry
        for entry in hour_path.glob('*')
        if entry.is_file() and entry.suffix == '.lz4'
    )
    for symbol_file in symbol_files:
        # The file stem doubles as the coin name the reader validates against.
        symbol_frames.append(read_hyperliquid_l2book_lz4(symbol_file, symbol_file.stem))

# Kept separate from the accumulator list so `date_df` is only ever a DataFrame.
date_df = pl.concat(symbol_frames)

  0%|          | 0/24 [00:00<?, ?it/s]

In [126]:
# Persist the day's book rows without the `created_time` column,
# brotli-compressed, as a single parquet file.
(
    date_df
    .drop('created_time')
    .write_parquet('~/Documents/market.parquet', compression='brotli')
)