# Returns engine development

In [1]:
import mnemosyne as ms 
import polars as pl
from datetime import date as Date

# Query-side grid configuration: these settings select the grid that the
# sample queries below are drawn from.
peg_symbol = 'USDT'
grid_interval = '10m'
dstype = ms.DatasetType.BinanceSpotTrades

# Dataset handle built from the settings above.
dataset = ms.binance.BinanceLastTradesGrid(
    peg_symbol=peg_symbol,
    grid_interval=grid_interval,
    dataset_type=dstype,
    parquet_names='*.parquet',
    num_workers=4,
)

# Collect a 20-partition slice (indices 30..49) rather than the whole dataset.
sample_partitions = dataset.partitions[30:50]
df = dataset[sample_partitions].collect()
df

symbol,date,time,peg_symbol,open,high,low,close,volume_base,volume_quote,trade_count,last_trade_time,taker_buy_volume_quote,taker_sell_volume_quote,vwap_taker_buy,vwap_taker_sell,vwap_total_by_base
enum,date,datetime[μs],str,f64,f64,f64,f64,f64,f64,u32,datetime[μs],f64,f64,f64,f64,f64
"""ARDR""",2022-01-31,2022-01-31 00:00:00,"""USDT""",0.1749,0.1831,0.1749,0.179,515684.0,92477.4439,459,2022-01-31 00:09:59.319,57200.8171,35276.6268,0.179366,0.179305,0.17933
"""ARPA""",2022-01-31,2022-01-31 00:00:00,"""USDT""",0.06183,0.06192,0.06134,0.06134,151652.8,9333.058485,98,2022-01-31 00:09:32.215,5141.489062,4191.569423,0.061471,0.061631,0.061542
"""BOND""",2022-01-31,2022-01-31 00:00:00,"""USDT""",10.2,10.2,9.97,10.02,3427.05,34399.8332,168,2022-01-31 00:09:56.116,16174.6935,18225.1397,10.039816,10.036063,10.037739
"""BTG""",2022-01-31,2022-01-31 00:00:00,"""USDT""",29.58,29.62,29.46,29.5,37.36,1103.2826,15,2022-01-31 00:09:01.129,583.915,519.3676,29.565412,29.492793,29.531119
"""CAKE""",2022-01-31,2022-01-31 00:00:00,"""USDT""",7.39,7.41,7.32,7.34,19347.78,142541.8449,537,2022-01-31 00:09:49.236,71648.6994,70893.1455,7.369551,7.365282,7.367349
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""WING""",2022-02-19,2022-02-19 23:50:00,"""USDT""",9.37,9.37,9.36,9.36,92.48,865.9888,15,2022-02-19 23:59:47.704,301.714,564.2748,9.37,9.360897,9.364066
"""MBL""",2022-02-19,2022-02-19 23:50:00,"""USDT""",0.005826,0.005829,0.005814,0.005821,891217.0,5187.745615,45,2022-02-19 23:59:57.064,2474.894487,2712.851128,0.005824,0.005819,0.005821
"""PSG""",2022-02-19,2022-02-19 23:50:00,"""USDT""",15.08,15.14,15.07,15.11,241.07,3643.4404,26,2022-02-19 23:55:48.020,3340.8348,302.6056,15.117606,15.07,15.11362
"""HIGH""",2022-02-19,2022-02-19 23:50:00,"""USDT""",4.53,4.57,4.53,4.56,770.237,3501.90544,46,2022-02-19 23:57:40.687,1685.72248,1816.18296,4.55098,4.542443,4.54653


In [2]:
# Query frame: one row per (symbol, start_time), where start_time is the
# last trade time of each grid row in `df`. Only the columns that are kept
# are projected.
query_lf = df.lazy().select(
    'symbol',
    pl.col('last_trade_time').alias('start_time'),
)

# Expression-level parameters of the returns computation.
start_time = pl.col('last_trade_time')  # not referenced by the cells visible below
fair_expr = pl.col('vwap_total_by_base')  # Expression from gridded schema from which to compute fair
mark_expr = pl.lit('10m')  # offset added to start_time to obtain end_time
tick_lag_tolerance = pl.lit('30s')  # a backend tick older than this (relative to the query) yields a null fair

# Backend grid configuration.
# NOTE(review): this rebinds `peg_symbol`, which the first cell set to 'USDT',
# so the backend dataset is built with 'USDC' — confirm this is intended.
peg_symbol = 'USDC'
grid_backend_dstype = ms.DatasetType.BinanceSpotTrades
grid_backend_interval = '10m'

query_lf.collect()

symbol,start_time
enum,datetime[μs]
"""ARDR""",2022-01-31 00:09:59.319
"""ARPA""",2022-01-31 00:09:32.215
"""BOND""",2022-01-31 00:09:56.116
"""BTG""",2022-01-31 00:09:01.129
"""CAKE""",2022-01-31 00:09:49.236
…,…
"""WING""",2022-02-19 23:59:47.704
"""MBL""",2022-02-19 23:59:57.064
"""PSG""",2022-02-19 23:55:48.020
"""HIGH""",2022-02-19 23:57:40.687


In [74]:
# Backend grid in which fair values are looked up.
backend_dataset = ms.binance.BinanceLastTradesGrid(
    peg_symbol=peg_symbol,
    grid_interval=grid_backend_interval,
    dataset_type=grid_backend_dstype,
    parquet_names='*.parquet',
)
db = backend_dataset.lazyframe()

# dtype of the backend's `symbol` column; query symbols are cast to it later.
db_schema = db.collect_schema()
db_symbol_enum = db_schema['symbol']

In [75]:
# Per-query interval table: {row_id, symbol, start_time, end_time}
query_with_both = (
    query_lf
    .sort(['symbol', 'start_time'])
    # IMPORTANT: keep only symbols that exist in the backend's enum (rows with
    # other symbols are dropped by this filter), then convert from the query's
    # enum type to the backend's enum type so the join keys have one dtype.
    .filter(pl.col('symbol').cast(str).is_in(list(db_symbol_enum.categories)))
    .with_columns(pl.col('symbol').cast(str).cast(db_symbol_enum))
    .with_row_index('row_id')
    .with_columns(
        pl.col('start_time').dt.offset_by(mark_expr).alias('end_time')
    )
    # Deliberately no set_sorted() on start_time / end_time: after sorting by
    # ['symbol', 'start_time'] they are ordered only within each symbol, and a
    # false sorted flag lets sorted fast paths return wrong answers.
)

# Long format, one row per (query, endpoint): row_id, symbol, query_time, query_type
query_type_enum = pl.Enum(['start', 'end'])
long_format = pl.concat([
    query_with_both.select(
        'row_id', 'symbol',
        pl.col('start_time').alias('query_time'),
        pl.lit('start').alias('query_type').cast(query_type_enum),
    ),
    query_with_both.select(
        'row_id', 'symbol',
        pl.col('end_time').alias('query_time'),
        pl.lit('end').alias('query_type').cast(query_type_enum),
    ),
]).sort(['symbol', 'query_time'])

query_with_both.collect()

row_id,symbol,start_time,end_time
u32,enum,datetime[μs],datetime[μs]
0,"""AAVE""",2022-01-31 00:09:30.630,2022-01-31 00:19:30.630
1,"""AAVE""",2022-01-31 00:19:58.736,2022-01-31 00:29:58.736
2,"""AAVE""",2022-01-31 00:29:56.692,2022-01-31 00:39:56.692
3,"""AAVE""",2022-01-31 00:39:23.640,2022-01-31 00:49:23.640
4,"""AAVE""",2022-01-31 00:49:58.437,2022-01-31 00:59:58.437
…,…,…,…
290147,"""ZEN""",2022-02-19 23:19:44.638,2022-02-19 23:29:44.638
290148,"""ZEN""",2022-02-19 23:29:24.084,2022-02-19 23:39:24.084
290149,"""ZEN""",2022-02-19 23:39:46.778,2022-02-19 23:49:46.778
290150,"""ZEN""",2022-02-19 23:49:53.570,2022-02-19 23:59:53.570


In [76]:
# Date span covered by all start/end query times.
min_date, max_date = long_format.select(
    pl.col('query_time').dt.date().min().alias('min'),
    pl.col('query_time').dt.date().max().alias('max'),
).collect().row(0)

# Backend ticks restricted to that span, reduced to the columns the as-of join
# needs and ordered by (symbol, mark_time).
# NOTE(review): ticks dated before min_date are excluded, so a query shortly
# after midnight on min_date cannot match a tick from the previous day —
# confirm that is acceptable.
inrange_db = (
    db
    .filter(pl.col('date').is_between(min_date, max_date))
    .select(
        'symbol',
        fair_expr.alias('mark_fair'),
        pl.col('last_trade_time').alias('mark_time'),
    )
    # Deliberately no set_sorted() on mark_time: it is ordered only within each
    # symbol, so flagging the whole column as sorted would be false.
    .sort('symbol', 'mark_time')
)

In [63]:
# For every start/end query, attach the latest backend tick at or before it
# (per symbol).
asof_matched = long_format.join_asof(
    inrange_db,
    left_on='query_time',
    right_on='mark_time',
    by='symbol',
    strategy='backward',
)

# A tick counts as usable only if it is at most `tick_lag_tolerance` older
# than the query; otherwise its fair is replaced by null.
tick_is_fresh = pl.col('mark_time').dt.offset_by(tick_lag_tolerance) >= pl.col('query_time')
merged_results = asof_matched.with_columns(
    (pl.col('query_time') - pl.col('mark_time')).alias('tick_to_query_lag'),
    pl.when(tick_is_fresh).then(pl.col('mark_fair')).otherwise(None).alias('mark_fair'),
).select('row_id', 'symbol', 'query_time', 'query_type', 'tick_to_query_lag', 'mark_fair')

# Fair value at each endpoint of a query, evaluated inside the per-row_id group.
fair_at_start = pl.col('mark_fair').filter(pl.col('query_type') == 'start').first()
fair_at_end = pl.col('mark_fair').filter(pl.col('query_type') == 'end').first()

mark_cols = (
    merged_results
    .group_by(['row_id', 'symbol'])
    .agg([
        # Maximum lag over the start and end ticks of the query.
        pl.col('tick_to_query_lag').max().alias('max_tick_to_query_lag'),
        # Simple return between the two endpoints: (end - start) / start.
        ((fair_at_end - fair_at_start) / fair_at_start).alias('return'),
    ])
    .sort('row_id')
    .drop('symbol')
)

# Resulting columns: symbol, start_time, max_tick_to_query_lag, return
return_lf = query_with_both.join(mark_cols, on='row_id').drop('row_id', 'end_time')
return_lf.filter(pl.col('symbol') == 'BTC').collect()

symbol,start_time,max_tick_to_query_lag,return
enum,datetime[μs],duration[μs],f64
"""BTC""",2022-01-31 00:09:59.999,2s 163ms,-0.000698
"""BTC""",2022-01-31 00:19:59.999,26s 281ms,-0.00159
"""BTC""",2022-01-31 00:29:59.999,26s 281ms,-0.002775
"""BTC""",2022-01-31 00:39:59.999,2s 989ms,-0.009482
"""BTC""",2022-01-31 00:49:59.999,2s 844ms,-0.008116
…,…,…,…
"""BTC""",2022-02-19 23:19:59.999,21s 429ms,0.00037
"""BTC""",2022-02-19 23:29:59.999,23s 27ms,-0.000236
"""BTC""",2022-02-19 23:39:59.999,23s 27ms,0.000155
"""BTC""",2022-02-19 23:49:59.999,17s 78ms,0.001244


In [81]:
# Display the query frame again (same call as the earlier `query_lf.collect()` cell).
query_lf.collect()

symbol,start_time
enum,datetime[μs]
"""ARDR""",2022-01-31 00:09:59.319
"""ARPA""",2022-01-31 00:09:32.215
"""BOND""",2022-01-31 00:09:56.116
"""BTG""",2022-01-31 00:09:01.129
"""CAKE""",2022-01-31 00:09:49.236
…,…
"""WING""",2022-02-19 23:59:47.704
"""MBL""",2022-02-19 23:59:57.064
"""PSG""",2022-02-19 23:55:48.020
"""HIGH""",2022-02-19 23:57:40.687


# Example usage

In [3]:
# Peek at the first 10 rows of the 4s / USDT last-trade grid parquet files.
(
    pl.scan_parquet('/data/mnemosyne/binance/grids/spot/last_trade/4s/peg_symbol=USDT/date=*/*.parquet')
    .head(10)
    .collect()
)

symbol,date,time,peg_symbol,open,high,low,close,volume_base,volume_quote,trade_count,last_trade_time,taker_buy_volume_quote,taker_sell_volume_quote,vwap_taker_buy,vwap_taker_sell,vwap_total_by_base
enum,date,datetime[μs],str,f64,f64,f64,f64,f64,f64,u32,datetime[μs],f64,f64,f64,f64,f64
"""GXS""",2022-01-01,2022-01-01 00:00:00,"""USDT""",1.9461,1.9461,1.945,1.945,3222.0,6267.3141,7,2022-01-01 00:00:01.940,0.0,6267.3141,,1.945163,1.945163
"""LTO""",2022-01-01,2022-01-01 00:00:00,"""USDT""",0.368,0.3682,0.368,0.368,1659.0,610.5198,21,2022-01-01 00:00:03.418,610.5198,0.0,0.368005,,0.368005
"""TLM""",2022-01-01,2022-01-01 00:00:00,"""USDT""",0.2128,0.2128,0.2128,0.2128,8116.0,1727.0848,5,2022-01-01 00:00:03.404,1727.0848,0.0,0.2128,,0.2128
"""VET""",2022-01-01,2022-01-01 00:00:00,"""USDT""",0.08266,0.08271,0.08264,0.08264,34872.9,2882.627338,11,2022-01-01 00:00:03.327,138.387351,2744.239987,0.082708,0.082659,0.082661
"""YFI""",2022-01-01,2022-01-01 00:00:00,"""USDT""",33056.93,33056.93,33056.93,33056.93,0.0005,16.528465,1,2022-01-01 00:00:03.827,16.528465,0.0,33056.93,,33056.93
"""SHIB""",2022-01-01,2022-01-01 00:00:00,"""USDT""",3.3e-05,3.3e-05,3.3e-05,3.3e-05,27248390.0,909.155507,11,2022-01-01 00:00:03.929,497.93506,411.220447,3.3e-05,3.3e-05,3.3e-05
"""STX""",2022-01-01,2022-01-01 00:00:00,"""USDT""",2.167,2.169,2.164,2.164,2223.5,4818.6095,30,2022-01-01 00:00:03.475,2830.4749,1988.1346,2.168116,2.165724,2.167128
"""AVAX""",2022-01-01,2022-01-01 00:00:00,"""USDT""",109.43,109.47,109.42,109.47,55.28,6049.8288,17,2022-01-01 00:00:02.591,5612.1488,437.68,109.441282,109.42,109.43974
"""BNB""",2022-01-01,2022-01-01 00:00:00,"""USDT""",511.5,511.6,511.4,511.6,9.031,4619.4794,17,2022-01-01 00:00:03.545,3710.5537,908.9257,511.518296,511.494486,511.513609
"""GALA""",2022-01-01,2022-01-01 00:00:00,"""USDT""",0.45035,0.45044,0.45035,0.45044,610.0,274.75022,2,2022-01-01 00:00:02.857,0.0,274.75022,,0.45041,0.45041


In [1]:
import mnemosyne as ms
from mnemosyne.engines import ReturnsEngine 
from datetime import datetime as Datetime
import polars as pl

# Four query symbols, all queried at the same timestamp.
symbols = ['BTC', 'ETH', 'BTCDOWN', 'BNB']
symbol_enum = pl.Enum(symbols)

query_lf = (
    pl.DataFrame({
        'symbol': symbols,
        'times': [Datetime(2025, 8, 1, 9, 0, 0)] * len(symbols),
    })
    .with_columns(pl.col('symbol').cast(symbol_enum))
    .lazy()
)

# Backend grid the engine reads fair values from.
backend_db = ms.binance.BinanceLastTradesGrid(
    peg_symbol='USDT',
    grid_interval='10m',
    dataset_type=ms.DatasetType.BinanceSpotTrades,
).lazyframe()

# NOTE(review): `re` is also the name of the stdlib regex module; a later
# `import re` in this notebook would rebind it.
re = ReturnsEngine(
    backend_db,
    backend_fair_expr=pl.col('vwap_total_by_base'),
    backend_time_expr=pl.col('last_trade_time'),
)

# Query with a 10m mark duration and a 30s tick-lag tolerance.
value = re.query(
    query_lf,
    start_time_expr=pl.col('times'),
    mark_duration=pl.lit('10m'),
    tick_lag_tolerance=pl.lit('30s'),
)
value.collect()

Trying to scan: /data/mnemosyne/binance/grids/spot/last_trade/10m/peg_symbol=USDT/date=*/**/*.parquet




symbol,times,max_tick_to_query_lag,return
enum,datetime[μs],duration[μs],f64
"""BTC""",2025-08-01 09:00:00,154216µs,0.002026
"""ETH""",2025-08-01 09:00:00,677388µs,0.001934
"""BTCDOWN""",2025-08-01 09:00:00,,
"""BNB""",2025-08-01 09:00:00,1s 191853µs,0.001734


# Quick enum test

In [76]:
# Write the test frame to parquet for the round-trip check.
# NOTE(review): `dfA` is not defined in any cell shown in this notebook —
# presumably it came from a removed cell; it must be defined before this line.
dfA.write_parquet('/tmp/dfA.pq')

In [77]:
# Read the frame back from parquet; the displayed dtype row shows `col` is still an enum.
dfA_read = pl.read_parquet('/tmp/dfA.pq')
dfA_read

col,misc
enum,i64
"""b""",1
"""c""",2


In [79]:
# Cast the round-tripped column to `enumA`.
# NOTE(review): `enumA` is not defined in any cell shown here — presumably a
# pl.Enum from a removed cell; confirm its categories include 'b' and 'c'.
dfA_read.with_columns(pl.col('col').cast(enumA))

col,misc
enum,i64
"""b""",1
"""c""",2
