In [29]:
import mnemosyne as ms
from datetime import datetime as Datetime, date as Date
from timedelta_isoformat import timedelta as Timedelta # Import this particular timedelta! Supports proper to-string
import polars as pl
from datetime import date as Date

# Data flow

`lossless` -> `gridded` -> (`returns_engine` and `metadata_engine`)

1. `peg_symbol`: could be `USDC` (much less data) or `USDT`
2. `dataset_type`: futures or spot
3. `backend_grid_interval`: the granularity at which the lossless data is gridded; `5s` and `10m` grids are currently cached.

All data are recorded from `2022-01-01` onwards to (roughly) today. Let's experiment with `10m` grids

- 5-second gridded data is `~200G`

In [30]:
# Which raw dataset to use; swap in ms.DatasetType.BinanceSpotTrades for spot.
dataset_type = ms.DatasetType.BinanceUmPerpTrades
# Grid resolution. Only the 5s and 10m grids are precomputed and cached.
grid_interval = Timedelta(minutes=10)
# 'USDT' is a good default.
peg_symbol = 'USDT'

# The two dates we will inspect.
sample_dates = [
    Date(2024, 1, 1),
    Date(2024, 11, 27),
]

All datasets inherit from `ByDateDataset`, which supports:

1. `compute(recompute: bool)` which computes the dataset in-place. Will try to use cached dataset if `recompute=False`. 
2. `dataset[[date1, date2]]` will return a `LazyFrame` 

In [31]:
# Build the gridded dataset from the configuration chosen above.
grid_dataset = ms.binance.BinanceLastTradesGrid(
    dataset_type=dataset_type,
    grid_interval=grid_interval,
    peg_symbol=peg_symbol,
    parquet_names='*.parquet',
    num_workers=2,
)

# compute() works in-place and returns nothing. With recompute=False it tries
# the cached dataset first; if nothing is cached yet this takes a **long** time.
grid_dataset.compute(recompute=False)

# 1. Common `ByDateDataset` operations

`dataset.universe()` will always return a dataframe (not a lazyframe, so no `.collect()` is needed) of the (symbol, date) pairs on which the dataset is defined

In [32]:
# Eager dataframe of the (symbol, date) pairs covered by the dataset; no .collect() needed.
grid_dataset.universe()

symbol,date
str,date
"""AGLD""",2024-03-17
"""APE""",2024-03-17
"""REN""",2024-03-17
"""JUP""",2024-03-17
"""AXS""",2024-03-17
…,…
"""DIA""",2025-10-24
"""JST""",2025-10-24
"""ALPINE""",2025-10-24
"""RIVER""",2025-10-24


`dataset[[date1, date2]]` (note that the index is a date list!) will return the slice of dataset on these dates

In [33]:
# Indexing with a list of dates gives a LazyFrame for those dates; collect() materializes it.
grid_dataset[sample_dates].collect()

symbol,date,time,peg_symbol,open,high,low,close,volume,trade_count,last_event_time,taker_buy_volume,taker_sell_volume,vwap_taker_buy,vwap_taker_sell,vwap_price
enum,date,datetime[μs],str,f64,f64,f64,f64,f64,u64,datetime[μs],f64,f64,f64,f64,f64
"""MTL""",2024-01-01,2024-01-01 00:10:00,"""USDT""",1.5666,1.573,1.5631,1.5707,99239.0,1714,2024-01-01 00:09:58.233,47067.0,52172.0,1.567926,1.567795,1.567857
"""DUSK""",2024-01-01,2024-01-01 00:10:00,"""USDT""",0.18705,0.18773,0.18662,0.18762,426256.0,1631,2024-01-01 00:09:58.500,196691.0,229565.0,0.187152,0.187159,0.187156
"""1000LUNC""",2024-01-01,2024-01-01 00:10:00,"""USDT""",0.13877,0.13968,0.13858,0.13959,5.101784e6,3846,2024-01-01 00:09:59.248,2.788563e6,2.313221e6,0.139275,0.139338,0.139303
"""YGG""",2024-01-01,2024-01-01 00:10:00,"""USDT""",0.4261,0.4307,0.426,0.4303,470887.0,2051,2024-01-01 00:09:58.587,295372.0,175515.0,0.428078,0.428826,0.428357
"""NKN""",2024-01-01,2024-01-01 00:10:00,"""USDT""",0.11213,0.11284,0.11201,0.11284,483070.0,1337,2024-01-01 00:09:57.907,251653.0,231417.0,0.112396,0.1124,0.112398
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""INJ""",2024-11-27,2024-11-28 00:00:00,"""USDT""",29.815,29.898,29.779,29.8,26810.1,3813,2024-11-27 23:59:59.844,14621.3,12188.8,29.835396,29.821177,29.828932
"""GTC""",2024-11-27,2024-11-28 00:00:00,"""USDT""",0.939,0.94,0.937,0.939,69263.9,313,2024-11-27 23:59:57.189,28952.4,40311.5,0.93948,0.939148,0.939287
"""XVS""",2024-11-27,2024-11-28 00:00:00,"""USDT""",9.885,9.885,9.857,9.88,3026.5,403,2024-11-27 23:59:59.920,1324.7,1701.8,9.869658,9.87046,9.870109
"""COS""",2024-11-27,2024-11-28 00:00:00,"""USDT""",0.011962,0.011985,0.011945,0.011953,7.949783e6,1015,2024-11-27 23:59:58.303,2.923972e6,5.025811e6,0.011966,0.011966,0.011966


`.lazyframe()` returns the full dataset. Collecting on it will instantiate a huge dataframe

As an example, let's compute the number of unique symbols per date. The operation below will take very long on `5s` grid

In [34]:
# Lazy handle on the full dataset; nothing is loaded until collect().
grid_database = grid_dataset.lazyframe()

# Count the distinct symbols on each date, collect with the streaming engine,
# then order the (eager) result by date.
num_symbols_per_date = (
    grid_database
    .group_by('date')
    .agg(pl.col('symbol').unique().count().alias('num_unique_symbols'))
    .collect(engine='streaming')
    .sort('date')
)

num_symbols_per_date

date,num_unique_symbols
date,u64
2022-01-01,136
2022-01-02,136
2022-01-03,136
2022-01-04,136
2022-01-05,136
…,…
2025-10-22,524
2025-10-23,525
2025-10-24,526
2025-10-25,526


# Returns engine

The returns engine **consumes a gridded dataset** to support returns queries. 

We usually use the `5s` grid so that e.g. `10m` returns will actually make sense (it doesn't make that much sense to compute `10m` returns on `10m` vwap prices)

Define a `query_lf` with `symbol, date, time` columns and execute a query

In [40]:
from mnemosyne.engines import ReturnsEngine

# 5-second spot grid that backs the returns engine.
backend_grid_dataset = ms.binance.BinanceLastTradesGrid(
    peg_symbol='USDT',
    grid_interval=Timedelta(seconds=5),
    dataset_type=ms.DatasetType.BinanceSpotTrades,
)

# The engine uses this lazyframe as its backend. `backend_fair_expr` is what
# returns are computed over: pl.col('vwap_price') is the default value, and
# pl.col('open') can be given instead for open-to-open returns.
returns_engine = ReturnsEngine(
    backend_grid_dataset.lazyframe(),
    backend_fair_expr=pl.col('vwap_price'),
)

# Next, define a query lazyframe with 'symbol', 'date' and 'time' columns
# (call .lazy() on a dataframe to make it lazy).
# Here we take the grid points of the 5s backend dataset on 2025-01-01.
query_lf = backend_grid_dataset[[Date(2025, 1, 1)]].select('symbol', 'date', 'time')
display(query_lf.head().collect())

symbol,date,time
enum,date,datetime[μs]
"""JUP""",2025-01-01,2025-01-01 00:00:05
"""INJ""",2025-01-01,2025-01-01 00:00:05
"""STEEM""",2025-01-01,2025-01-01 00:00:05
"""HIVE""",2025-01-01,2025-01-01 00:00:05
"""REI""",2025-01-01,2025-01-01 00:00:05


### Single-mark query

In [41]:
# query() appends the return column to the query lazyframe
# (the output below also shows `max_tick_to_query_lag`).
df_with_returns = returns_engine.query(
    query_lf,
    # Where each return window starts.
    start_time_expr=pl.col('time'),
    # Length of the return window.
    mark_duration=Timedelta(minutes=10),
    # If no trade is found within this tolerance (10m here), the return is NaN.
    tick_lag_tolerance=Timedelta(minutes=10),
    # Optional flags that add more information to the result:
    # append_lag=True,
    # append_query_tick_times=True,
    # append_start_end_fairs=True,
).collect()
df_with_returns



symbol,date,time,max_tick_to_query_lag,return
enum,date,datetime[μs],duration[μs],f64
"""JUP""",2025-01-01,2025-01-01 00:00:05,22s 734798µs,0.003655
"""INJ""",2025-01-01,2025-01-01 00:00:05,38s 480131µs,0.004077
"""STEEM""",2025-01-01,2025-01-01 00:00:05,147539µs,0.039094
"""HIVE""",2025-01-01,2025-01-01 00:00:05,285121µs,-0.025055
"""REI""",2025-01-01,2025-01-01 00:00:05,2m 7s 852805µs,0.005787
…,…,…,…,…
"""BEAMX""",2025-01-01,2025-01-02 00:00:00,12s 120570µs,-0.004826
"""DOT""",2025-01-01,2025-01-02 00:00:00,3s 128505µs,0.00413
"""ZK""",2025-01-01,2025-01-02 00:00:00,3s 382198µs,-0.002047
"""WIN""",2025-01-01,2025-01-02 00:00:00,1s 712479µs,0.002558


In [42]:
# Same query as before, but the query points come from `grid_dataset`
# rather than from the returns engine's own backend dataset.
other_query_lf = grid_dataset[[Date(2025, 1, 1)]].select('symbol', 'date', 'time')

other_df_with_returns = returns_engine.query(
    other_query_lf,
    # Where each return window starts.
    start_time_expr=pl.col('time'),
    # Length of the return window.
    mark_duration=Timedelta(minutes=10),
    # If no trade is found within this tolerance (10m here), the return is NaN.
    tick_lag_tolerance=Timedelta(minutes=10),
    # Optional flags that add more information to the result:
    # append_lag=True,
    # append_query_tick_times=True,
    # append_start_end_fairs=True,
).collect()

other_df_with_returns

symbol,date,time,max_tick_to_query_lag,return
enum,date,datetime[μs],duration[μs],f64
"""MKR""",2025-01-01,2025-01-01 00:10:00,1m 54s 481419µs,0.000669
"""STORJ""",2025-01-01,2025-01-01 00:10:00,12s 635638µs,0.000631
"""ETHFI""",2025-01-01,2025-01-01 00:10:00,10s 819222µs,0.003157
"""ADA""",2025-01-01,2025-01-01 00:10:00,2s 859622µs,0.001846
"""OP""",2025-01-01,2025-01-01 00:10:00,4s 327719µs,0.001705
…,…,…,…,…
"""IOTA""",2025-01-01,2025-01-02 00:00:00,3s 326499µs,0.020597
"""IMX""",2025-01-01,2025-01-02 00:00:00,1m 3s 987844µs,0.001479
"""XAI""",2025-01-01,2025-01-02 00:00:00,3s 250708µs,0.000864
"""AI""",2025-01-01,2025-01-02 00:00:00,23s 967507µs,-0.006245


### Multi-mark query

This is much more efficient than calling `.query` multiple times. When fetching multiple marks at once, define a dictionary:

1. Dictionary key = string name of the returns column
2. Values are two-tuples. The first element defines the start time expression. The second element defines the mark duration. 

In [9]:
# Each entry is name -> (start-time expression, mark duration).
# The name becomes the returns column, prefixed with `return_` in the output below.
mark_exprs = dict(
    now_to_p10m=(pl.col('time'), Timedelta(minutes=10)),
    p1m_to_p11m=(pl.col('time') + Timedelta(minutes=1), Timedelta(minutes=10)),
    m10m_to_now=(pl.col('time') - Timedelta(minutes=10), Timedelta(minutes=10)),
    m20m_to_now=(pl.col('time') - Timedelta(minutes=20), Timedelta(minutes=20)),
    m30m_to_now=(pl.col('time') - Timedelta(minutes=30), Timedelta(minutes=30)),
)

# One batched call fetches every mark at once; the result stays lazy until collect().
lf_with_multi_returns = returns_engine.query_batch(
    query_lf,
    mark_exprs,
    append_lag=False,
)
lf_with_multi_returns.collect()

symbol,date,time,return_now_to_p10m,return_p1m_to_p11m,return_m10m_to_now,return_m20m_to_now,return_m30m_to_now
enum,date,datetime[μs],f64,f64,f64,f64,f64
"""INJ""",2025-01-01,2025-01-01 00:00:05,,0.003577,,,
"""BTC""",2025-01-01,2025-01-01 00:00:05,0.000014,-0.001187,0.00028,0.001141,-0.001785
"""SUI""",2025-01-01,2025-01-01 00:00:05,,0.000802,-0.001325,,
"""MANTA""",2025-01-01,2025-01-01 00:00:05,,,-0.000011,,
"""ETH""",2025-01-01,2025-01-01 00:00:05,0.001224,0.000863,-0.000799,0.001092,-0.001756
…,…,…,…,…,…,…,…
"""OM""",2025-01-01,2025-01-02 00:00:00,-0.002452,,,-0.002209,
"""BNB""",2025-01-01,2025-01-02 00:00:00,-0.00126,-0.00075,0.000312,0.000496,0.000628
"""XRP""",2025-01-01,2025-01-02 00:00:00,0.01787,0.021373,0.007401,0.006486,-0.000901
"""PENGU""",2025-01-01,2025-01-02 00:00:00,0.033905,0.02755,-0.006377,-0.005107,-0.005468


# Metadata engine

The metadata engine **consumes a gridded dataset** to support metadata queries. 

**Still experimental**: we only have one set of metadata for now. Only this particular `metadata_engine` can be instantiated

In [16]:
from mnemosyne.engines import MetadataEngine

# 5-second spot grid that backs the metadata engine.
# NOTE(review): this rebinds `grid_dataset`, which an earlier cell already bound
# to the grid configured there. Any cell run after this one that reads
# `grid_dataset` sees this 5s spot grid instead; confirm that is intended.
grid_dataset = ms.binance.BinanceLastTradesGrid(
    peg_symbol='USDT',
    grid_interval=Timedelta(seconds=5),
    dataset_type=ms.DatasetType.BinanceSpotTrades,
)

metadata_engine = MetadataEngine(
    path='/data/midas/mnemosyne/notebooks/workflows/dev/metadata',
    num_workers=1,
    backend_dataset=grid_dataset,
)

# Build the query from the dataset defined in this cell, so the cell does not
# depend on `backend_grid_dataset` from the returns-engine cell. Both datasets
# are constructed with identical arguments.
query_lf = grid_dataset[[Date(2025, 1, 1)]].select('symbol', 'date', 'time')
display(query_lf.head().collect())

symbol,date,time
enum,date,datetime[μs]
"""INJ""",2025-01-01,2025-01-01 00:00:05
"""BTC""",2025-01-01,2025-01-01 00:00:05
"""SUI""",2025-01-01,2025-01-01 00:00:05
"""MANTA""",2025-01-01,2025-01-01 00:00:05
"""ETH""",2025-01-01,2025-01-01 00:00:05


In [19]:
# Append the metadata columns to the query lazyframe, using the `time` and
# `symbol` columns as the time and symbol expressions, then materialize it.
metadata_engine.append_metadata(
    query_lf,
    time_expr=pl.col('time'),
    symbol_expr=pl.col('symbol'),
).collect()


symbol,date,time,date_right,last_event_time,liquidity_1d,sqrtliq_1d,excess_buy_ratio_1d,trade_count_1d,liquidity_7d,sqrtliq_7d,excess_buy_ratio_7d,trade_count_7d,daily_returns_drift_7d_lookback,daily_vol_7d_lookback,vol_ssize_7d_lookback,daily_returns_drift_30d_lookback,daily_vol_30d_lookback,vol_ssize_30d_lookback,liquidity_1d_q,liquidity_7d_q,daily_vol_7d_lookback_q,daily_vol_30d_lookback_q
enum,date,datetime[μs],date,datetime[μs],f64,f64,f64,u64,f64,f64,f64,u64,f64,f64,u64,f64,f64,u64,f64,f64,f64,f64
"""1000CAT""",2025-01-01,2025-01-01 00:05:50,2024-12-31,2024-12-31 23:59:40.185,1.5255e7,3905.82702,-0.060271,72104,9.6338e7,9815.206139,-0.023955,488894,-0.026503,0.063133,1008,-0.037767,0.099495,2104,0.777778,0.718346,0.77261,0.873385
"""1000CAT""",2025-01-01,2025-01-01 00:09:30,2024-12-31,2024-12-31 23:59:40.185,1.5255e7,3905.82702,-0.060271,72104,9.6338e7,9815.206139,-0.023955,488894,-0.026503,0.063133,1008,-0.037767,0.099495,2104,0.777778,0.718346,0.77261,0.873385
"""1000CAT""",2025-01-01,2025-01-01 00:11:30,2024-12-31,2024-12-31 23:59:40.185,1.5255e7,3905.82702,-0.060271,72104,9.6338e7,9815.206139,-0.023955,488894,-0.026503,0.063133,1008,-0.037767,0.099495,2104,0.777778,0.718346,0.77261,0.873385
"""1000CAT""",2025-01-01,2025-01-01 00:12:35,2024-12-31,2024-12-31 23:59:40.185,1.5255e7,3905.82702,-0.060271,72104,9.6338e7,9815.206139,-0.023955,488894,-0.026503,0.063133,1008,-0.037767,0.099495,2104,0.777778,0.718346,0.77261,0.873385
"""1000CAT""",2025-01-01,2025-01-01 00:13:50,2024-12-31,2024-12-31 23:59:40.185,1.5255e7,3905.82702,-0.060271,72104,9.6338e7,9815.206139,-0.023955,488894,-0.026503,0.063133,1008,-0.037767,0.099495,2104,0.777778,0.718346,0.77261,0.873385
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""ZRO""",2025-01-01,2025-01-01 23:35:45,2025-01-01,2025-01-01 22:59:50.021522,6.8494e6,2617.133937,0.033897,43717,7.0170e7,8376.752704,-0.04779,461713,-0.006253,0.044372,1008,-0.008267,0.06806,4320,0.647668,0.670984,0.19171,0.217617
"""ZRO""",2025-01-01,2025-01-01 23:42:00,2025-01-01,2025-01-01 22:59:50.021522,6.8494e6,2617.133937,0.033897,43717,7.0170e7,8376.752704,-0.04779,461713,-0.006253,0.044372,1008,-0.008267,0.06806,4320,0.647668,0.670984,0.19171,0.217617
"""ZRO""",2025-01-01,2025-01-01 23:42:15,2025-01-01,2025-01-01 22:59:50.021522,6.8494e6,2617.133937,0.033897,43717,7.0170e7,8376.752704,-0.04779,461713,-0.006253,0.044372,1008,-0.008267,0.06806,4320,0.647668,0.670984,0.19171,0.217617
"""ZRO""",2025-01-01,2025-01-01 23:44:20,2025-01-01,2025-01-01 22:59:50.021522,6.8494e6,2617.133937,0.033897,43717,7.0170e7,8376.752704,-0.04779,461713,-0.006253,0.044372,1008,-0.008267,0.06806,4320,0.647668,0.670984,0.19171,0.217617


## Lossless raw dataset

Tick-level dataset directly parsed from Binance. We only have parquet-level access. 
One single date has `~100M` rows

In [26]:
# Eagerly read every `data.parquet` under the `date=<first sample date>` directory
# of the dataset's source path. Presumably hive partitioning is what supplies the
# partition columns (e.g. `date`, `symbol`) seen in the output; confirm against the layout.
pl.read_parquet(
    grid_dataset.src_path / f'date={sample_dates[0].strftime("%Y-%m-%d")}/**/data.parquet',
    hive_partitioning=True,
)

id,price,qty,quote_qty,time,is_buyer_maker,peg_symbol,date,symbol
i64,f64,f64,f64,datetime[μs],bool,str,date,str
107529795,0.013342,394.0,5.256748,2024-01-01 00:00:03.257,false,"""USDT""",2024-01-01,"""1000BONK"""
107529796,0.013342,1138.0,15.183196,2024-01-01 00:00:03.257,false,"""USDT""",2024-01-01,"""1000BONK"""
107529797,0.013342,1018.0,13.582156,2024-01-01 00:00:03.257,false,"""USDT""",2024-01-01,"""1000BONK"""
107529798,0.013342,1450.0,19.3459,2024-01-01 00:00:03.257,false,"""USDT""",2024-01-01,"""1000BONK"""
107529799,0.013341,1533.0,20.451753,2024-01-01 00:00:03.293,true,"""USDT""",2024-01-01,"""1000BONK"""
…,…,…,…,…,…,…,…,…
197509429,0.3743,613.9,229.78277,2024-01-01 23:59:56.932,false,"""USDT""",2024-01-01,"""ZRX"""
197509430,0.3743,613.9,229.78277,2024-01-01 23:59:56.932,false,"""USDT""",2024-01-01,"""ZRX"""
197509431,0.3743,86.1,32.22723,2024-01-01 23:59:56.932,false,"""USDT""",2024-01-01,"""ZRX"""
197509432,0.3744,1162.8,435.35232,2024-01-01 23:59:56.932,false,"""USDT""",2024-01-01,"""ZRX"""
