In [12]:
import mnemosyne as ms
from mnemosyne.engines import ReturnsEngine 
from datetime import datetime as Datetime, date as Date, timedelta as Timedelta
import polars as pl
from pathlib import Path
from tqdm.auto import tqdm 
from mnemosyne.dataset import ByDateDataset

In [19]:
def default_metadata():
    """Build the default metadata expression spec.

    Returns:
        A two-level dict. The outer keys are ``'by_symbol_index'`` and
        ``'accum_returns'``; each maps a rolling-window ``Timedelta`` to the
        list of polars expressions to evaluate over that window.
    """
    one_day = Timedelta(days=1)
    vol_sampling_timeframe = Timedelta(days=7)
    # Window length expressed in days (7.0 for the default timeframe).
    num_days = vol_sampling_timeframe / one_day

    # --- Expressions over the backend columns (vwap_price, volume, ...) ---
    notional = pl.col('vwap_price') * pl.col('volume')
    liquidity = notional.sum().alias('liquidity')
    sqrtliq = liquidity.pow(0.5).alias('sqrtliq')
    net_taker_buy = pl.col('taker_buy_volume') - pl.col('taker_sell_volume')
    excess_buy_ratio = (net_taker_buy.sum() / pl.col('volume').sum()).alias('excess_buy_ratio')
    trade_count = pl.col('trade_count').sum().alias('trade_count')

    # --- Expressions over the `return` column ---
    # NOTE(review): mean is divided by num_days and std by sqrt(num_days);
    # confirm this is the intended normalisation for the sampled returns.
    ret = pl.col('return')
    returns_drift = (ret.mean() / num_days).alias('daily_returns_drift')
    volatility = (ret.std() / num_days ** 0.5).alias('daily_vol')
    vol_ssize = ret.count().alias('vol_ssize')

    by_symbol_index = {
        one_day: [liquidity, sqrtliq, excess_buy_ratio, trade_count],
    }
    accum_returns = {
        vol_sampling_timeframe: [returns_drift, volatility, vol_ssize],
    }
    return {
        'by_symbol_index': by_symbol_index,
        'accum_returns': accum_returns,
    }

# Backend data source: Binance spot trades on a 5-second grid, pegged to USDC.
dataset_type = ms.DatasetType.BinanceSpotTrades
backend_grid_interval = '5s'
peg_symbol = 'USDC'

backend_dataset = ms.binance.BinanceLastTradesGrid(
    dataset_type=dataset_type,
    grid_interval=backend_grid_interval,
    peg_symbol=peg_symbol,
    parquet_names='*.parquet',
)

In [20]:
# Arguments and preparation
returns_engine_kwargs = {}
returns_query_kwargs = {
    'filter_by_query_dates': True,
}

# Class initialization argument (whole dataset): `backend_dataset`, built in the cell above.
# Should support initializing a ReturnsEngine
# Supplies both (1) symbol-date universe (via .universe()) and (2) source for metadata computation

# Some more arguments. Users should pass these in manually
last_event_time_expr = pl.col('last_event_time')

# This is applied on grid_interval (coarser compared to returns) grids pre joining with returns.
# Directly applied to backend_dataset's db (match schema there), grouped by symbol and grid_timestamp
# Indexed by rolling window duration over which these are calculated
metadata_exprs = default_metadata()
quantile_expand_metadata_names = ['liquidity', 'trade_count']

# returns_interval should ideally partition grid_interval.
# Vol and drift are "essentially" for returns over this range
returns_interval = Timedelta(minutes=10)

# This is the granularity at which metadata are calculated
grid_interval = Timedelta(days=1)  # grid_interval **must** be larger than returns_interval!

# Enforce the documented invariant instead of relying on the comment alone.
assert grid_interval > returns_interval, 'grid_interval must be larger than returns_interval'

In [28]:
# Split the metadata spec into its two families of expressions.
bysymbol_index_metadata_exprs = metadata_exprs['by_symbol_index']
returns_metadata_exprs = metadata_exprs['accum_returns']
# Keys are rolling-window durations; the largest one bounds how far back returns are needed.
max_returns_lookback = max(returns_metadata_exprs)

# Initialization: prepare for later queries by initializing returns_engine
backend_db = backend_dataset.lazyframe()
returns_engine = ReturnsEngine(backend_db, **returns_engine_kwargs)

# Expand every universe row into `returns_interval`-spaced timestamps covering
# [date, date + 1 day), then order by symbol and time.
backend_universe_df = backend_dataset.cast_symbol_col_to_enum(backend_dataset.universe())
returns_grid = (
    backend_universe_df
    .with_columns(
        pl.datetime_ranges(
            start=pl.col('date'),
            end=pl.col('date').dt.offset_by('1d'),
            interval=returns_interval,
            closed='left',
        ).alias('returns_grid_time')
    )
    .explode('returns_grid_time')
    .sort('symbol', 'returns_grid_time')
    .lazy()
)
returns_grid.collect().shape

(11976048, 3)

In [29]:
# Date window for the subset metadata computation.
# The window is half-open: start_date is included, end_date is excluded.
start_date, end_date = Date(2025, 1, 1), Date(2025, 2, 1)

In [31]:
# Materialise and display the filtered returns grid.
# NOTE(review): in the visible notebook `returns_query` is only assigned in the
# "Step 1" cell further down, so this cell breaks a fresh top-to-bottom run —
# move it below that cell.
returns_query.collect()

symbol,date,returns_grid_time
enum,date,datetime[μs]
"""1000CAT""",2024-12-25,2024-12-25 00:00:00
"""1000CAT""",2024-12-25,2024-12-25 00:10:00
"""1000CAT""",2024-12-25,2024-12-25 00:20:00
"""1000CAT""",2024-12-25,2024-12-25 00:30:00
"""1000CAT""",2024-12-25,2024-12-25 00:40:00
…,…,…
"""ZRO""",2025-02-01,2025-02-01 23:10:00
"""ZRO""",2025-02-01,2025-02-01 23:20:00
"""ZRO""",2025-02-01,2025-02-01 23:30:00
"""ZRO""",2025-02-01,2025-02-01 23:40:00


In [30]:
## Step 1: compute returns_interval gridded returns
# [symbol, date, time] with `returns_interval` interspaced "time"
# The date range is widened backwards by the longest returns lookback window;
# both bounds are inclusive here.
returns_query = (
    returns_grid
    .filter(
        pl.col('date').is_between(
            start_date - max_returns_lookback,
            end_date,
            closed='both',
        )
    )
    .sort('symbol', 'returns_grid_time')
)

# NOTE(review): the recorded output of this cell is a SchemaError ("expected
# `String`, got `duration[μs]`"). No expression built in this cell passes a
# duration where a string is required, so the failure presumably originates
# inside ReturnsEngine.query — confirm which types it expects for
# `mark_duration` / `tick_lag_tolerance`.
index_with_returns = (
    returns_engine
    .query(
        returns_query,
        start_time_expr=pl.col('returns_grid_time'),
        mark_duration=returns_interval,
        tick_lag_tolerance=returns_interval,
        append_lag=True,
        **returns_query_kwargs,
    )
    .sort('symbol', 'returns_grid_time')
)

index_with_returns.collect()

SchemaError: invalid series dtype: expected `String`, got `duration[μs]` for series with name `literal`

In [None]:
## Step2: fetch database metadata
# Keep one extra `grid_interval` of rows on either side of [start_date, end_date].
# The bounds use plain Python date arithmetic: `Expr.dt.offset_by` only accepts a
# duration *string* (or string expression), so handing it a `Timedelta` raises
# "SchemaError: expected `String`, got `duration[μs]`".
db_with_metadata = backend_db.filter(
    (pl.col('date') >= start_date - grid_interval) &
    (pl.col('date') <= end_date + grid_interval)
).with_columns(
    # Offset so that grid_time=t row contains info accessible **at time t**
    # (`dt.truncate` accepts a timedelta; the shift is done by adding a duration literal).
    # NOTE(review): adding a fixed duration matches `offset_by` for tz-naive
    # timestamps — confirm `last_event_time` carries no time zone.
    grid_time=last_event_time_expr.dt.truncate(grid_interval) + pl.lit(grid_interval)
).group_by('symbol', 'grid_time').agg(
    # Per-symbol metadata expressions registered for the `grid_interval` window
    # (was the undefined name `metadata_exprs_per_symbol_index`).
    [*bysymbol_index_metadata_exprs[grid_interval], last_event_time_expr.last().alias('last_event_time')]
).sort('symbol', 'grid_time')

# Duplicate over `grid_time`! Granularity is over returns_interval
# Join with (coarser) index grid to obtain index grid groups
# NOTE(review): the original statement was left unfinished (`index_with_returns.`),
# which is a syntax error. The backward as-of join below is reconstructed from the
# recorded output of this frame (metadata repeated across each day's returns,
# `grid_time` null before the first metadata row) — confirm it matches the intent,
# and whether a `tolerance` is wanted to avoid carrying stale metadata forward.
returns_with_index_grid = index_with_returns.join_asof(
    db_with_metadata,
    left_on='returns_grid_time',
    right_on='grid_time',
    by='symbol',
    strategy='backward',
)

index_grid_metadata = (
    returns_with_index_grid
    # Summarise the fine-grained returns per (symbol, grid_time).
    .group_by('symbol', 'grid_time')
    .agg(
        pl.col('return').mean().alias('returns_drift'),
        pl.col('return').std().alias('volatility'),
        pl.col('return').count().alias('vol_ssize'),  # Number of non-null samples
    )
    # Join returns-based metadata with metadata we computed previously (symbol-index level)
    # last_event_time comes from db_with_metadata
    .join(db_with_metadata, on=['symbol', 'grid_time'])
    # Within each grid_time, rank every symbol and scale by the group's count
    # to get a cross-sectional quantile column `<name>_q`.
    .group_by('grid_time')
    .agg([
        pl.all(),
        *[
            (pl.col(name).rank('average') / pl.col(name).count()).alias(f'{name}_q')
            for name in quantile_expand_metadata_names + ['returns_drift', 'volatility']
        ],
    ])
    # Undo the list aggregation: back to one row per (grid_time, symbol).
    .explode(pl.all().exclude('grid_time'))
    .sort('symbol', 'grid_time')
    # Drop the padding rows: keep start_date <= grid_time < end_date.
    .filter(pl.col('grid_time').is_between(start_date, end_date, closed='left'))
)

In [25]:
# Materialise and display the returns grid joined with the per-symbol metadata.
# NOTE(review): this cell's execution count (25) is older than the upstream cells
# (28-30), so the output shown below predates the current upstream code — re-run
# after a kernel restart.
returns_with_index_grid.collect()

  returns_with_index_grid.collect()


symbol,date,returns_grid_time,max_tick_to_query_lag,return,grid_time,liquidity,sqrtliq,excess_buy_ratio,trade_count,last_event_time
enum,date,datetime[μs],duration[μs],f64,datetime[μs],f64,f64,f64,u64,datetime[μs]
"""1000CAT""",2024-12-31,2024-12-31 00:00:00,,,,,,,,
"""1000CAT""",2024-12-31,2024-12-31 00:10:00,,,,,,,,
"""1000CAT""",2024-12-31,2024-12-31 00:20:00,6m 2s 143ms,,,,,,,
"""1000CAT""",2024-12-31,2024-12-31 00:30:00,6m 2s 143ms,-0.00607,,,,,,
"""1000CAT""",2024-12-31,2024-12-31 00:40:00,5m 28s 768ms,0.002826,,,,,,
…,…,…,…,…,…,…,…,…,…,…
"""ZRO""",2025-02-02,2025-02-02 23:10:00,33s 690446µs,0.019807,2025-02-02 00:00:00,316412.07427,562.505177,-0.142262,2302,2025-02-01 23:59:44.202991
"""ZRO""",2025-02-02,2025-02-02 23:20:00,33s 690446µs,-0.022039,2025-02-02 00:00:00,316412.07427,562.505177,-0.142262,2302,2025-02-01 23:59:44.202991
"""ZRO""",2025-02-02,2025-02-02 23:30:00,17s 159931µs,0.014181,2025-02-02 00:00:00,316412.07427,562.505177,-0.142262,2302,2025-02-01 23:59:44.202991
"""ZRO""",2025-02-02,2025-02-02 23:40:00,38s 447202µs,0.010321,2025-02-02 00:00:00,316412.07427,562.505177,-0.142262,2302,2025-02-01 23:59:44.202991


In [24]:
# Materialise and display the per-(grid_time, symbol) metadata with quantile columns.
# NOTE(review): this cell's execution count (24) is older than the upstream cells
# (28-30), so the output shown below predates the current upstream code — re-run
# after a kernel restart.
index_grid_metadata.collect()



grid_time,symbol,returns_drift,volatility,vol_ssize,liquidity,sqrtliq,excess_buy_ratio,trade_count,last_event_time,liquidity_q,trade_count_q,returns_drift_q,volatility_q
datetime[μs],enum,f64,f64,u64,f64,f64,f64,u64,datetime[μs],f64,f64,f64,f64
2025-01-01 00:00:00,"""1000CAT""",-0.000248,0.004545,117,215708.3999,464.444184,-0.017691,1614,2024-12-31 23:59:18.049,0.315789,0.377193,0.035088,0.719298
2025-01-02 00:00:00,"""1000CAT""",0.000101,0.003936,131,95842.97356,309.585164,0.016945,1040,2025-01-01 23:52:38.379022,0.122807,0.22807,0.27193,0.54386
2025-01-03 00:00:00,"""1000CAT""",0.000611,0.005214,134,128623.064756,358.640579,0.176573,1220,2025-01-02 23:59:45.891355,0.114035,0.236842,0.77193,0.850877
2025-01-04 00:00:00,"""1000CAT""",-0.000354,0.004723,130,252062.335275,502.0581,-0.01835,1874,2025-01-03 23:58:19.093973,0.245614,0.280702,0.061404,0.877193
2025-01-05 00:00:00,"""1000CAT""",-0.00004,0.004473,121,123504.089262,351.431486,-0.062253,944,2025-01-04 23:57:40.851917,0.175439,0.219298,0.473684,0.868421
…,…,…,…,…,…,…,…,…,…,…,…,…,…
2025-01-27 00:00:00,"""ZRO""",0.000161,0.007195,144,299587.70247,547.346054,-0.029462,2241,2025-01-26 23:59:13.368413,0.467742,0.467742,0.846774,0.395161
2025-01-28 00:00:00,"""ZRO""",-0.000789,0.004959,138,810045.97199,900.02554,-0.087889,5602,2025-01-27 23:54:53.186411,0.475806,0.451613,0.379032,0.556452
2025-01-29 00:00:00,"""ZRO""",0.000229,0.006524,138,555960.14533,745.62735,0.221771,2790,2025-01-28 23:59:55.578232,0.5,0.427419,0.467742,0.443548
2025-01-30 00:00:00,"""ZRO""",0.000432,0.004047,140,375670.84145,612.919931,-0.071833,2620,2025-01-29 23:57:31.351612,0.435484,0.395161,0.677419,0.427419
