In [28]:
import mnemosyne as ms
from mnemosyne.engines import ReturnsEngine 
from datetime import datetime as Datetime, date as Date 
import polars as pl
from pathlib import Path
from tqdm.auto import tqdm 
from mnemosyne.dataset import ByDateDataset

In [None]:
# ---- Backend dataset selection ----
dataset_type = ms.DatasetType.BinanceSpotTrades
backend_grid_interval = '5s'
peg_symbol = 'USDC'

# ---- Analysis grid settings ----
returns_interval = '10m'   # spacing of the timestamps at which returns are queried
index_interval = '1d'      # spacing of the coarser index / metadata grid
# NOTE(review): start_date / end_date are not referenced by any code visible in
# this part of the notebook, and they do not match the 2025-10 cutoff applied to
# the universe below -- confirm whether they are still needed.
start_date = Date(2025, 1, 1)
end_date = Date(2025, 2, 1)

# Extra keyword arguments forwarded to ReturnsEngine (currently none).
returns_engine_kwargs = dict()

# Column expressions describing the backend database.
# The backend database needs to be time-sorted!
database_exprs = dict(
    volume=pl.col('volume'),
    vwap_price=pl.col('vwap_price'),
    time=pl.col('last_time'),   # timestamp by which to merge in the database
    symbol=pl.col('symbol'),    # join_asof partition
)

# Extra keyword arguments forwarded to every ReturnsEngine.query call.
returns_query_kwargs = dict(filter_by_query_dates=True)

# ---- Build the lazy backend database and the returns engine on top of it ----
universe_dataset = ms.binance.BinanceLastTradesGrid(
    peg_symbol=peg_symbol,
    grid_interval=backend_grid_interval,
    dataset_type=dataset_type,
    parquet_names='*.parquet',
    num_workers=2,
)
backend_db = universe_dataset.lazyframe()
returns_engine = ReturnsEngine(backend_db, **returns_engine_kwargs)

# ---- Universe, restricted to recent dates to keep things manageable for now ----
universe_from_cutoff = universe_dataset.universe().filter(
    pl.col('date') >= Date(2025, 10, 10)
)
backend_universe_df = universe_dataset.cast_symbol_col_to_enum(universe_from_cutoff)


In [None]:
def _explode_time_grid(universe_df, interval):
    """Expand each row of `universe_df` into one row per grid timestamp of its date.

    For every row, timestamps run from `date` (inclusive) to `date` + 1 day
    (exclusive), spaced by `interval`, and are written to a new `time` column.
    """
    day_start = pl.col('date')
    day_end = day_start.dt.offset_by('1d')
    grid = pl.datetime_ranges(day_start, day_end, interval=interval, closed='left')
    return universe_df.with_columns(time=grid).explode('time')


# Backend rows from 2025-10-08 onward; this starts earlier than the 2025-10-10
# cutoff applied to the universe (presumably to leave look-back room -- confirm).
relevant_backend_db = backend_db.filter(pl.col('date') >= Date(2025, 10, 8))

# [symbol, date, time] rows with "time" spaced every `returns_interval`
returns_query_df = _explode_time_grid(backend_universe_df, returns_interval)

# Same layout, but with "time" spaced every `index_interval`
index_df = _explode_time_grid(backend_universe_df, index_interval)

# Ask the returns engine for one return per (symbol, time) query row.
returns_lazy = returns_engine.query(
    returns_query_df.lazy(),
    start_time_expr=pl.col('time'),
    mark_duration=returns_interval,
    tick_lag_tolerance=returns_interval,
    append_lag=False,
    **returns_query_kwargs,
)
index_with_returns = returns_lazy.sort('symbol', 'time').collect()

index_with_returns



symbol,date,time,return
enum,date,datetime[μs],f64
"""0G""",2025-10-10,2025-10-10 00:00:00,
"""0G""",2025-10-10,2025-10-10 00:10:00,-0.005006
"""0G""",2025-10-10,2025-10-10 00:20:00,0.004643
"""0G""",2025-10-10,2025-10-10 00:30:00,-0.003466
"""0G""",2025-10-10,2025-10-10 00:40:00,-0.004252
…,…,…,…
"""ZRO""",2025-10-23,2025-10-23 23:10:00,-0.001755
"""ZRO""",2025-10-23,2025-10-23 23:20:00,-0.000586
"""ZRO""",2025-10-23,2025-10-23 23:30:00,-0.000587
"""ZRO""",2025-10-23,2025-10-23 23:40:00,0.002347


In [76]:
def metadata_by_symbol_index():
    """
    Yield the aggregation expressions computed per (symbol, grid_timestamp).

    Intended for the backend database gridded at `index_interval`, before it is
    joined with returns: unpack the yielded expressions inside a
    `group_by('symbol', 'grid_timestamp').agg(...)`.

    Yields, in order, expressions aliased as:
        liquidity        -- sum of vwap_price * volume
        sqrtliq          -- `liquidity` raised to the power 0.5
        excess_buy_ratio -- sum(taker_buy_volume - taker_sell_volume) / sum(volume)
        trade_count      -- sum of trade_count
    """
    notional = pl.col('vwap_price') * pl.col('volume')
    total_liquidity = notional.sum().alias('liquidity')

    net_taker_buy = pl.col('taker_buy_volume') - pl.col('taker_sell_volume')
    buy_imbalance = net_taker_buy.sum() / pl.col('volume').sum()

    # TODO: engineer buy_quote_fill_ratio, sell_quote_fill_ratio
    yield from (
        total_liquidity,
        total_liquidity.pow(0.5).alias('sqrtliq'),
        buy_imbalance.alias('excess_buy_ratio'),
        pl.col('trade_count').sum().alias('trade_count'),
    )

# Metadata columns that later get a cross-sectional quantile companion column.
quantile_expand_metadata = ['liquidity', 'trade_count']

# Bucket label on the `index_interval` grid: truncate down to the bucket start,
# then shift forward by one interval, so each row is labelled with the END of
# the bucket it falls in.
grid_label = pl.col('time').dt.truncate(index_interval).dt.offset_by(index_interval)

# Per-(symbol, bucket) aggregations, plus the last event timestamp seen in the bucket.
symbol_grid_aggs = list(metadata_by_symbol_index())
symbol_grid_aggs.append(database_exprs['time'].last().alias('last_event_time'))

# This is gridded by index_interval
db_with_metadata = (
    relevant_backend_db
    .with_columns(grid_timestamp=grid_label)
    .group_by('symbol', 'grid_timestamp')
    .agg(symbol_grid_aggs)
    .sort('symbol', 'grid_timestamp')
    .collect()
)
db_with_metadata

symbol,grid_timestamp,liquidity,sqrtliq,excess_buy_ratio,trade_count,last_event_time
enum,datetime[μs],f64,f64,f64,u64,datetime[μs]
"""0G""",2025-10-09 00:00:00,1.3311e6,1153.73357,-0.0413,13460,2025-10-08 23:59:50
"""0G""",2025-10-10 00:00:00,2.6487e6,1627.492467,-0.033652,16628,2025-10-09 23:59:20
"""0G""",2025-10-11 00:00:00,4.9708e6,2229.540186,-0.02316,29980,2025-10-10 23:59:50
"""0G""",2025-10-12 00:00:00,2.8867e6,1699.024373,-0.012222,14318,2025-10-11 23:59:15
"""0G""",2025-10-13 00:00:00,4.7497e6,2179.39064,0.024975,13897,2025-10-12 23:59:40
…,…,…,…,…,…,…
"""ZRO""",2025-10-20 00:00:00,764441.93692,874.323703,0.020932,5336,2025-10-19 23:55:05
"""ZRO""",2025-10-21 00:00:00,597768.82142,773.155108,0.054337,4840,2025-10-20 23:54:40
"""ZRO""",2025-10-22 00:00:00,1.0173e6,1008.619844,0.038815,7446,2025-10-21 23:59:45
"""ZRO""",2025-10-23 00:00:00,696259.88789,834.421888,0.050314,5330,2025-10-22 23:57:35


In [None]:
# Duplicate over `grid_timestamp`! Granularity is over returns_interval
returns_with_index_grid_timestamp = index_with_returns.join_asof(
    db_with_metadata.select('grid_timestamp', 'symbol'),
    left_on='time',
    right_on='grid_timestamp',
    by='symbol',
    strategy='backward'
).sort('symbol', 'grid_timestamp')

index_gridded_returns=returns_with_index_grid_timestamp.group_by('symbol', 'grid_timestamp').agg(
    returns_drift=pl.col('return').mean(),
    volatility=pl.col('return').std(), 
    vol_ssize=pl.col('return').count()
)

index_grid_metadata_with_vol = index_gridded_returns.join(
    db_with_metadata, 
    on=['symbol', 'grid_timestamp']
)

# Compute things like (drift, volatility, liquidity, excess_buy_ratio, trade_count) quantile
index_grid_metadata_with_vol.group_by('grid_timestamp').agg([
    pl.all(),
    *[
        (pl.col(c).rank('average') / pl.col(c).count()).alias(f'{c}_q')
        for c in quantile_expand_metadata + ['returns_drift', 'volatility']
    ]
]).explode(pl.all().exclude('grid_timestamp'))

  returns_with_index_grid_timestamp = index_with_returns.join_asof(


symbol,grid_timestamp,returns_drift,volatility,vol_ssize,liquidity,sqrtliq,excess_buy_ratio,trade_count,last_event_time
enum,datetime[μs],f64,f64,u64,f64,f64,f64,u64,datetime[μs]
"""0G""",2025-10-10 00:00:00,-0.001066,0.021807,143,2.6487e6,1627.492467,-0.033652,16628,2025-10-09 23:59:20
"""0G""",2025-10-11 00:00:00,0.000131,0.009138,144,4.9708e6,2229.540186,-0.02316,29980,2025-10-10 23:59:50
"""0G""",2025-10-12 00:00:00,0.000361,0.006728,144,2.8867e6,1699.024373,-0.012222,14318,2025-10-11 23:59:15
"""0G""",2025-10-13 00:00:00,0.000218,0.006485,144,4.7497e6,2179.39064,0.024975,13897,2025-10-12 23:59:40
"""0G""",2025-10-14 00:00:00,-0.000464,0.006961,144,3.3068e6,1818.452166,-0.005451,12802,2025-10-13 23:58:55
…,…,…,…,…,…,…,…,…,…
"""ZRO""",2025-10-19 00:00:00,1.6948e-7,0.004103,144,235095.13855,484.866104,-0.058784,2355,2025-10-18 23:56:25
"""ZRO""",2025-10-20 00:00:00,0.000088,0.004275,140,764441.93692,874.323703,0.020932,5336,2025-10-19 23:55:05
"""ZRO""",2025-10-21 00:00:00,-0.000274,0.005819,142,597768.82142,773.155108,0.054337,4840,2025-10-20 23:54:40
"""ZRO""",2025-10-22 00:00:00,0.000006,0.004533,142,1.0173e6,1008.619844,0.038815,7446,2025-10-21 23:59:45
