In [1]:
import polars as pl
from datetime import date, timedelta
from polars_finance.labels import raw_forward_returns, fixed_time_dynamic_threshold_label, fixed_time_label, triple_barrier_label
from polars_finance.bars import volume_bars
from tqdm.notebook import tqdm
# Turn on polars' global string cache for the whole session.
# NOTE(review): presumably needed so the Categorical `symbol` columns produced
# file-by-file later in the notebook share one encoding — confirm.
pl.enable_string_cache()

In [2]:
# Synthetic single-symbol fixture: 200 consecutive calendar days for "AAPL",
# with the price climbing by exactly 1.0 per day (1.0, 2.0, ..., 200.0).
test_df = pl.DataFrame(
    data={
        "symbol": ["AAPL" for _ in range(200)],
        "ts_event": pl.date_range(
            start=date(2021, 1, 1),
            end=date(2021, 1, 1) + timedelta(days=199),
            eager=True,
        ),
        "price": range(1, 201),
    },
    # The integer range is stored as floats so return calculations stay in f64.
    schema_overrides={"price": pl.Float64},
)

In [3]:
# Show the synthetic fixture (rich display of the frame's head and tail).
test_df

symbol,ts_event,price
str,date,f64
"""AAPL""",2021-01-01,1.0
"""AAPL""",2021-01-02,2.0
"""AAPL""",2021-01-03,3.0
"""AAPL""",2021-01-04,4.0
"""AAPL""",2021-01-05,5.0
…,…,…
"""AAPL""",2021-07-15,196.0
"""AAPL""",2021-07-16,197.0
"""AAPL""",2021-07-17,198.0
"""AAPL""",2021-07-18,199.0


In [5]:
# Attach the raw forward return plus two fixed-horizon labels to the fixture:
#   ret    - forward return of `price`
#   label1 - fixed-time label at horizon t=1 with default thresholds
#   label2 - fixed-time label at horizon t=2 with an upper threshold of 0.2
test_df.with_columns(
    ret=raw_forward_returns(pl.col("price")),
    label1=fixed_time_label(pl.col("price"), t=1),
    label2=fixed_time_label(pl.col("price"), upper_threshold=0.2, t=2),
)

symbol,ts_event,price,ret,label1,label2
str,date,f64,f64,i32,i32
"""AAPL""",2021-01-01,1.0,1.0,1,1
"""AAPL""",2021-01-02,2.0,0.5,1,1
"""AAPL""",2021-01-03,3.0,0.333333,1,1
"""AAPL""",2021-01-04,4.0,0.25,1,1
"""AAPL""",2021-01-05,5.0,0.2,1,1
…,…,…,…,…,…
"""AAPL""",2021-07-15,196.0,0.005102,0,0
"""AAPL""",2021-07-16,197.0,0.005076,0,0
"""AAPL""",2021-07-17,198.0,0.005051,0,0
"""AAPL""",2021-07-18,199.0,0.005025,0,0


In [55]:
import databento as db
from pathlib import Path

In [67]:
# Peek at one archive: the text before the first "." of its file name is what
# the conversion loop uses as the output file stem.
[*Path("../data/XNAS-20240403-QMLQV3MJHY/").glob("*.zst")][0].name.partition(".")[0]

'xnas-itch-20231018'

In [69]:
# Convert every compressed Databento archive in the download folder into one
# Parquet file each, keeping only the columns used downstream.
raw_dir = Path("../data/XNAS-20240403-QMLQV3MJHY/")
parquet_dir = Path("../data/xnas_2023_2024")

# Create the output folder up front so the first write does not fail on a
# fresh checkout where the directory does not exist yet.
parquet_dir.mkdir(parents=True, exist_ok=True)

# sorted() makes the processing order deterministic; glob order depends on
# the filesystem.
for path in tqdm(sorted(raw_dir.glob("*.zst"))):
    # Output stem is everything before the first "." of the archive name
    # (e.g. 'xnas-itch-20231018'); Path.stem would only strip the last suffix.
    name = path.name.split(".")[0]
    pl.from_pandas(
        db.DBNStore.from_file(path).to_df(),
        # Store the ticker as a Categorical rather than a plain string column.
        schema_overrides={"symbol": pl.Categorical}
    ).select(
        # Event timestamps are converted to US/Eastern before being written.
        pl.col("ts_event").dt.convert_time_zone("US/Eastern"),
        "price",
        "size",
        "symbol"
    ).write_parquet(parquet_dir / f"{name}.parquet")

  0%|          | 0/250 [00:00<?, ?it/s]

In [2]:
# Scan every converted Parquet file through a single glob pattern.
# NOTE(review): scan_parquet is expected to be lazy (nothing is read until a
# later .collect()) — confirm against the polars version in use.
df = pl.scan_parquet("../data/xnas_2023_2024/*.parquet")

In [None]:
# Build volume bars from the scanned trades and materialise the result.
# NOTE(review): volume_bars is called with its defaults; which columns and bar
# size it expects are not visible here — presumably the ts_event/price/size/
# symbol columns written by the conversion loop. Confirm before a full run,
# since .collect() evaluates the query over every scanned file.
volume_bars(df).collect()