In [1]:
import mnemosyne as ms
from mnemosyne.engines import ReturnsEngine, MetadataEngine
from datetime import datetime as Datetime, date as Date
from timedelta_isoformat import timedelta as Timedelta
import polars as pl
from pathlib import Path
from tqdm.auto import tqdm 
from mnemosyne.dataset import ByDateDataset

# Class-based example

In [2]:
# Backend grid the metadata engine reads from: Binance spot trades, USDT peg,
# 5-second grid interval, parquet files selected by the '*.parquet' pattern.
backend_dataset = ms.binance.BinanceLastTradesGrid(
    dataset_type=ms.DatasetType.BinanceSpotTrades,
    peg_symbol='USDT',
    grid_interval=Timedelta(seconds=5),
    parquet_names='*.parquet',
)

# Metadata engine over that backend, using a single worker.
# NOTE(review): `path` is an absolute, machine-specific directory; consider
# making it configurable before sharing this notebook.
me = MetadataEngine(
    backend_dataset=backend_dataset,
    path='/home/nlyu/Code/midas/mnemosyne/notebooks/workflows/dev/metadata',
    num_workers=1,
)

In [None]:
# Run the metadata computation in 30-day batches.
# NOTE(review): recompute=False presumably reuses already-computed partitions; confirm against MetadataEngine.
me.compute(days_per_batch=30, recompute=False)

INFO:mnemosyne.dataset.interface:Computing 1392 partitions in 47 batches (30 days/batch) with 1 workers


  0%|          | 0/47 [00:00<?, ?it/s]

# Debugging dev

In [115]:
def default_metadata(returns_interval: Timedelta):
    """Build the default metadata expressions, keyed by lookback window.

    Args:
        returns_interval: Spacing of the returns grid. It determines how many
            return observations make up one day, which is the factor used to
            scale the per-interval mean and standard deviation of returns.

    Returns:
        A dict with two entries, each mapping a lookback ``Timedelta`` to a
        list of aliased polars expressions:

        * ``'by_symbol_index'``: for 1-day and 7-day windows, the summed
          ``vwap_price * volume`` (liquidity), its square root, the excess-buy
          ratio and the summed trade count.
        * ``'accum_returns'``: for 7-day and 30-day windows, the day-scaled
          mean return, day-scaled return standard deviation and the return
          sample count.
    """
    # How many returns intervals fit in one day.
    intervals_per_day = Timedelta(days=1) / returns_interval

    # Aggregations over the backend trade columns.
    liquidity = (pl.col('vwap_price') * pl.col('volume')).sum()
    sqrtliq = liquidity.pow(0.5)
    net_taker_buy = (pl.col('taker_buy_volume') - pl.col('taker_sell_volume')).sum()
    excess_buy_ratio = (net_taker_buy / pl.col('volume').sum())
    trade_count = pl.col('trade_count').sum()

    # Aggregations over the gridded `return` column: the mean scales linearly
    # with the number of intervals, the standard deviation with its square root.
    returns_drift = pl.col('return').mean() * intervals_per_day
    volatility = pl.col('return').std() * intervals_per_day ** 0.5
    vol_ssize = pl.col('return').count().alias('vol_ssize')

    def _index_exprs(tag):
        # Trade-based expressions, aliased with the window tag (e.g. '1d').
        return [
            liquidity.alias(f'liquidity_{tag}'),
            sqrtliq.alias(f'sqrtliq_{tag}'),
            excess_buy_ratio.alias(f'excess_buy_ratio_{tag}'),
            trade_count.alias(f'trade_count_{tag}'),
        ]

    def _returns_exprs(tag):
        # Returns-based expressions, aliased with the window tag (e.g. '7d').
        return [
            returns_drift.alias(f'daily_returns_drift_{tag}_lookback'),
            volatility.alias(f'daily_vol_{tag}_lookback'),
            vol_ssize.alias(f'vol_ssize_{tag}_lookback'),
        ]

    return {
        'by_symbol_index': {
            Timedelta(days=days): _index_exprs(f'{days}d') for days in (1, 7)
        },
        'accum_returns': {
            Timedelta(days=days): _returns_exprs(f'{days}d') for days in (7, 30)
        },
    }

# Backend configuration for the dev walkthrough: Binance spot trades,
# USDC peg, 5-second backend grid.
dataset_type = ms.DatasetType.BinanceSpotTrades
backend_grid_interval = Timedelta(seconds=5)
peg_symbol = 'USDC'

backend_dataset = ms.binance.BinanceLastTradesGrid(
    dataset_type=dataset_type,
    peg_symbol=peg_symbol,
    grid_interval=backend_grid_interval,
    parquet_names='*.parquet',
)

In [134]:
# Arguments and preparation
# Extra keyword arguments forwarded to ReturnsEngine(...) and to returns_engine.query(...).
returns_engine_kwargs = {}
returns_query_kwargs = {
    'filter_by_query_dates': True,
}

# Class initialization argument (whole dataset).
# Should support initializing a ReturnsEngine.
# Supplies both (1) the symbol-date universe (via .universe()) and (2) the source for metadata computation.
backend_dataset = backend_dataset  # intentional no-op: marks the name as an input of this section

# Some more arguments. Users should pass these in manually.
last_event_time_expr = pl.col('last_event_time')

# returns_interval should ideally partition grid_interval.
# Vol and drift are "essentially" for returns over this range.
returns_interval = Timedelta(minutes=10)

# Applied on grid_interval grids (coarser than the returns grid) before joining with returns.
# Directly applied to backend_dataset's db (match schema there), grouped by symbol and grid_timestamp.
# Indexed by the rolling window duration over which these are calculated.
metadata_exprs = default_metadata(returns_interval)

# Regex column selection: these columns later receive a `_q` rank/count companion column.
quantile_expand_exprs = pl.col('^daily_vol.*$', '^liquidity.*$')

grid_interval = Timedelta(hours=1)
# grid_interval **must** be larger than returns_interval; fail fast if one is edited without the other.
assert grid_interval > returns_interval, 'grid_interval must be larger than returns_interval'

In [123]:
# Initialization: build the lazy backend frame and the returns engine that later queries use
backend_db = backend_dataset.lazyframe()
returns_engine = ReturnsEngine(backend_db, **returns_engine_kwargs)

# Longest lookback window among the returns-based and the index-based metadata
max_returns_lookback = max(metadata_exprs['accum_returns'])
max_metadata_lookback = max(metadata_exprs['by_symbol_index'])

# Symbol-date universe: the returns & index grids are constructed off this dataframe
backend_universe_df = backend_dataset.cast_symbol_col_to_enum(backend_dataset.universe())

# One row per (symbol, date, tick), ticks spaced `returns_interval` apart from
# the date itself up to, but excluding, the date one day later.
returns_grid = (
    backend_universe_df
    .with_columns(
        pl.datetime_ranges(
            start=pl.col('date'),
            end=pl.col('date').dt.offset_by('1d'),
            interval=returns_interval,
            closed='left',
        ).alias('returns_grid_time')
    )
    .explode('returns_grid_time')
    .sort('symbol', 'returns_grid_time')
    .lazy()
)

In [124]:
# Now, argument for subset metadata computation
start_date = Date(2025, 1, 1)
end_date = Date(2025, 2, 1) # Inclusive on the left, exclusive on the right

In [125]:
## Step 1: compute returns on the `returns_interval` grid
# Rows are [symbol, date, returns_grid_time], with times spaced `returns_interval` apart.
# Dates are kept from (start_date - max_returns_lookback) through end_date, both ends inclusive.
returns_query = (
    returns_grid
    .filter(
        pl.col('date').is_between(
            start_date - max_returns_lookback,
            pl.lit(end_date),
            closed='both',
        )
    )
    .sort('symbol', 'returns_grid_time')
)

# Query the returns engine at every grid time; `returns_interval` serves both
# as the mark duration and as the tick lag tolerance.
index_with_returns = (
    returns_engine
    .query(
        returns_query,
        start_time_expr=pl.col('returns_grid_time'),
        mark_duration=returns_interval,
        tick_lag_tolerance=returns_interval,
        append_lag=True,
        **returns_query_kwargs,
    )
    .sort('symbol', 'returns_grid_time')
)

index_with_returns.collect()

symbol,date,returns_grid_time,max_tick_to_query_lag,return
enum,date,datetime[μs],duration[μs],f64
"""1000CAT""",2024-12-19,2024-12-19 00:00:00,,
"""1000CAT""",2024-12-19,2024-12-19 00:10:00,,
"""1000CAT""",2024-12-19,2024-12-19 00:20:00,,
"""1000CAT""",2024-12-19,2024-12-19 00:30:00,,
"""1000CAT""",2024-12-19,2024-12-19 00:40:00,,
…,…,…,…,…
"""ZRO""",2025-02-01,2025-02-01 23:10:00,25s 988620µs,-0.00233
"""ZRO""",2025-02-01,2025-02-01 23:20:00,25s 988620µs,-0.004871
"""ZRO""",2025-02-01,2025-02-01 23:30:00,38s 663478µs,0.00513
"""ZRO""",2025-02-01,2025-02-01 23:40:00,58s 505111µs,0.0003


In [129]:
# Rolling returns statistics: one horizontally concatenated block of columns
# per lookback window in metadata_exprs['accum_returns'], next to the
# (symbol, returns_grid_time) key columns.
returns_metadata = (
    pl.concat(
        [
            index_with_returns.select('symbol', 'returns_grid_time'),
            *(
                index_with_returns
                .rolling(pl.col('returns_grid_time'), period=interval, closed='left', group_by='symbol')
                .agg(cols)
                .sort('symbol', 'returns_grid_time')
                .drop('symbol', 'returns_grid_time')
                for interval, cols in metadata_exprs['accum_returns'].items()
            ),
        ],
        how='horizontal',
    )
    .with_columns(
        # Add grid_interval to make grid_time point-in-time as well
        grid_time=pl.col('returns_grid_time').dt.truncate(grid_interval) + grid_interval
    )
    .drop('returns_grid_time')
    # We can filter early here since here's a direct join to the final result
    .filter(pl.col('grid_time').is_between(start_date, end_date, closed='left'))
    # Keep the last row that falls into each (symbol, grid_time) bucket
    .group_by('symbol', 'grid_time')
    .agg(pl.all().last())
    .sort('symbol', 'grid_time')
)
returns_metadata.collect()

symbol,grid_time,daily_returns_drift_7d_lookback,daily_vol_7d_lookback,vol_ssize_7d_lookback,daily_returns_drift_30d_lookback,daily_vol_30d_lookback,vol_ssize_30d_lookback
enum,datetime[μs],f64,f64,u64,f64,f64,u64
"""1000CAT""",2025-01-01 00:00:00,-0.024929,0.067075,853,-0.007298,0.102369,1547
"""1000CAT""",2025-01-01 01:00:00,-0.021115,0.066939,856,-0.006412,0.102215,1553
"""1000CAT""",2025-01-01 02:00:00,-0.023647,0.066973,854,-0.00706,0.102167,1557
"""1000CAT""",2025-01-01 03:00:00,-0.027245,0.066641,854,-0.007542,0.102005,1563
"""1000CAT""",2025-01-01 04:00:00,-0.027395,0.066389,849,-0.007754,0.101975,1564
…,…,…,…,…,…,…,…
"""ZRO""",2025-01-31 19:00:00,-0.002194,0.061163,980,-0.008682,0.063703,4141
"""ZRO""",2025-01-31 20:00:00,-0.001208,0.061129,980,-0.008981,0.0637,4141
"""ZRO""",2025-01-31 21:00:00,0.001459,0.061115,980,-0.009111,0.063726,4141
"""ZRO""",2025-01-31 22:00:00,0.000057,0.06118,980,-0.009494,0.063752,4141


In [130]:
# Collect rolling database of metadata.
# Only collect index-gridded entries.
# Backend rows are kept from (start_date - max_metadata_lookback) through
# end_date inclusive, so the longest rolling window has its history available.
# The sort uses the configurable `last_event_time_expr` (not a hard-coded
# column name) so the sort key always matches the key rolling() runs on.
inrange_db = backend_db.filter(
    (pl.col('date') >= start_date - max_metadata_lookback) & 
    (pl.col('date') <= end_date)
).sort('symbol', last_event_time_expr)

# One horizontally concatenated block of columns per lookback window in
# metadata_exprs['by_symbol_index'], next to the (symbol, event time) key columns.
rolling_metadata = pl.concat([
    inrange_db.select('symbol', last_event_time_expr)
] + [
    inrange_db.rolling(
        last_event_time_expr, period=interval, closed='left', group_by='symbol'
    ).agg(cols).sort('symbol', last_event_time_expr).drop('symbol', last_event_time_expr)
    for interval, cols in metadata_exprs['by_symbol_index'].items()
], how='horizontal').with_columns(
    # Add grid_interval to make grid_time point-in-time as well
    grid_time = last_event_time_expr.dt.truncate(grid_interval) + grid_interval
).filter(
    # We can filter early here since here's a direct join to the final result
    pl.col('grid_time').is_between(start_date, end_date, closed='left')
# Keep the last row that falls into each (symbol, grid_time) bucket
).group_by('symbol', 'grid_time').agg(pl.all().last()).sort('symbol', 'grid_time')
rolling_metadata.collect()

symbol,grid_time,last_event_time,liquidity_1d,sqrtliq,excess_buy_ratio_1d,trade_count_1d,liquidity_7d,sqrtliq_7d,excess_buy_ratio_7d,trade_count_7d
enum,datetime[μs],datetime[μs],f64,f64,f64,u64,f64,f64,f64,u64
"""1000CAT""",2025-01-01 00:00:00,2024-12-31 23:59:18.049,214760.802636,463.42292,-0.013242,1612,1.2542e6,1119.910153,-0.063638,9013
"""1000CAT""",2025-01-01 01:00:00,2025-01-01 00:59:47.577389,217760.182704,466.647814,-0.007948,1615,1.2526e6,1119.202457,-0.062124,8999
"""1000CAT""",2025-01-01 02:00:00,2025-01-01 01:59:39.017357,217977.38804,466.880486,-0.01082,1617,1.2439e6,1115.300266,-0.065258,8964
"""1000CAT""",2025-01-01 03:00:00,2025-01-01 02:44:41.683038,217640.013302,466.519039,-0.012695,1618,1.2403e6,1113.708652,-0.066678,8955
"""1000CAT""",2025-01-01 04:00:00,2025-01-01 03:59:10.187902,216397.397557,465.185337,-0.016817,1622,1.2357e6,1111.599964,-0.065782,8912
…,…,…,…,…,…,…,…,…,…,…
"""ZRO""",2025-01-31 19:00:00,2025-01-31 18:59:21.951842,656664.31342,810.348267,-0.045236,3604,3.4926e6,1868.862888,0.010385,21958
"""ZRO""",2025-01-31 20:00:00,2025-01-31 19:59:59.483243,661811.90783,813.518228,-0.059075,3639,3.4776e6,1864.844928,0.006648,21868
"""ZRO""",2025-01-31 21:00:00,2025-01-31 20:58:59.996442,674722.98676,821.415234,-0.062523,3710,3.4652e6,1861.508267,0.006254,21763
"""ZRO""",2025-01-31 22:00:00,2025-01-31 21:57:07.919238,670567.31308,818.881745,-0.065919,3708,3.4587e6,1859.756358,0.005963,21719


In [150]:
# Join the index-based and returns-based metadata on (symbol, grid_time), then
# add a `<col>_q` column for every column selected by quantile_expand_exprs:
# its average rank divided by its count, evaluated within each grid_time.
final_metadata = (
    rolling_metadata
    .join(returns_metadata, on=['symbol', 'grid_time'])
    .with_columns(
        (quantile_expand_exprs.rank('average') / quantile_expand_exprs.count())
        .name.suffix('_q')
        .over('grid_time')
    )
    .sort('symbol', last_event_time_expr)
    .collect(engine='streaming')
)

  final_metadata = final_metadata.collect(engine='streaming')


In [151]:
# Display the collected metadata frame.
final_metadata

symbol,grid_time,last_event_time,liquidity_1d,sqrtliq,excess_buy_ratio_1d,trade_count_1d,liquidity_7d,sqrtliq_7d,excess_buy_ratio_7d,trade_count_7d,daily_returns_drift_7d_lookback,daily_vol_7d_lookback,vol_ssize_7d_lookback,daily_returns_drift_30d_lookback,daily_vol_30d_lookback,vol_ssize_30d_lookback,liquidity_1d_q,liquidity_7d_q,daily_vol_7d_lookback_q,daily_vol_30d_lookback_q
enum,datetime[μs],datetime[μs],f64,f64,f64,u64,f64,f64,f64,u64,f64,f64,u64,f64,f64,u64,f64,f64,f64,f64
"""1000CAT""",2025-01-01 00:00:00,2024-12-31 23:59:18.049,214760.802636,463.42292,-0.013242,1612,1.2542e6,1119.910153,-0.063638,9013,-0.024929,0.067075,853,-0.007298,0.102369,1547,0.309735,0.212389,0.80531,0.902655
"""1000CAT""",2025-01-01 01:00:00,2025-01-01 00:59:47.577389,217760.182704,466.647814,-0.007948,1615,1.2526e6,1119.202457,-0.062124,8999,-0.021115,0.066939,856,-0.006412,0.102215,1553,0.307018,0.219298,0.798246,0.885965
"""1000CAT""",2025-01-01 02:00:00,2025-01-01 01:59:39.017357,217977.38804,466.880486,-0.01082,1617,1.2439e6,1115.300266,-0.065258,8964,-0.023647,0.066973,854,-0.00706,0.102167,1557,0.307018,0.219298,0.798246,0.903509
"""1000CAT""",2025-01-01 03:00:00,2025-01-01 02:44:41.683038,217640.013302,466.519039,-0.012695,1618,1.2403e6,1113.708652,-0.066678,8955,-0.027245,0.066641,854,-0.007542,0.102005,1563,0.303571,0.214286,0.8125,0.901786
"""1000CAT""",2025-01-01 04:00:00,2025-01-01 03:59:10.187902,216397.397557,465.185337,-0.016817,1622,1.2357e6,1111.599964,-0.065782,8912,-0.027395,0.066389,849,-0.007754,0.101975,1564,0.297297,0.198198,0.810811,0.900901
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""ZRO""",2025-01-31 19:00:00,2025-01-31 18:59:21.951842,656664.31342,810.348267,-0.045236,3604,3.4926e6,1868.862888,0.010385,21958,-0.002194,0.061163,980,-0.008682,0.063703,4141,0.491935,0.467742,0.403226,0.403226
"""ZRO""",2025-01-31 20:00:00,2025-01-31 19:59:59.483243,661811.90783,813.518228,-0.059075,3639,3.4776e6,1864.844928,0.006648,21868,-0.001208,0.061129,980,-0.008981,0.0637,4141,0.479675,0.463415,0.406504,0.406504
"""ZRO""",2025-01-31 21:00:00,2025-01-31 20:58:59.996442,674722.98676,821.415234,-0.062523,3710,3.4652e6,1861.508267,0.006254,21763,0.001459,0.061115,980,-0.009111,0.063726,4141,0.479675,0.463415,0.406504,0.406504
"""ZRO""",2025-01-31 22:00:00,2025-01-31 21:57:07.919238,670567.31308,818.881745,-0.065919,3708,3.4587e6,1859.756358,0.005963,21719,0.000057,0.06118,980,-0.009494,0.063752,4141,0.479675,0.463415,0.406504,0.406504


In [152]:
# Inspect the column names and dtypes of the collected frame.
final_metadata.schema

Schema([('symbol',
         Enum(categories=['0G', '1000CAT', '1000CHEEMS', '1000SATS', '1MBABYDOGE', '2Z', 'A', 'A2Z', 'AAVE', 'ACH', 'ACT', 'ACX', 'ADA', 'AEVO', 'AIXBT', 'ALGO', 'ALT', 'ANIME', 'APE', 'API3', 'APT', 'AR', 'ARB', 'ARKM', 'ASTER', 'ATOM', 'AUCTION', 'AUD', 'AVAX', 'AVNT', 'AXS', 'BABY', 'BANANA', 'BANANAS31', 'BARD', 'BB', 'BCH', 'BEAMX', 'BERA', 'BFUSD', 'BIGTIME', 'BIO', 'BLUR', 'BMT', 'BNB', 'BNX', 'BOME', 'BONK', 'BROCCOLI714', 'BTC', 'BTT', 'BTTC', 'C', 'CAKE', 'CATI', 'CETUS', 'CFX', 'CGPT', 'CHESS', 'CHZ', 'CKB', 'COMP', 'COOKIE', 'COW', 'CRV', 'CVC', 'CVX', 'CYBER', 'DF', 'DOGE', 'DOGS', 'DOLO', 'DOT', 'DYDX', 'EDEN', 'EGLD', 'EIGEN', 'ENA', 'ENJ', 'ENS', 'ENSO', 'EOS', 'EPIC', 'ERA', 'ETC', 'ETH', 'ETHFI', 'EUL', 'EUR', 'EURI', 'FDUSD', 'FET', 'FF', 'FIL', 'FLOKI', 'FLUX', 'FORM', 'FRONT', 'FTM', 'FUN', 'GALA', 'GMT', 'GMX', 'GPS', 'GRT', 'GUN', 'HAEDAL', 'HBAR', 'HEI', 'HEMI', 'HIVE', 'HMSTR', 'HOLO', 'HOME', 'HUMA', 'HYPER', 'ICP', 'IDEX', 'ILV', 'IMX', 'IN