<a href="https://colab.research.google.com/github/lydialydia-lydia/b2-tokenized-treasury/blob/main/01_data_build.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook constructs the market data foundation for the B2 risk engine. It obtains daily price series for a U.S. Treasury proxy and BTC, aligns them onto a consistent daily calendar, engineers baseline features (returns, log returns, rolling volatility), and saves standardized processed files for reuse across subsequent notebooks.


In [3]:
# Clone the project repository and make its root the working directory.
# NOTE(review): /content is hard-coded; this presumably targets the Colab workspace
# (the notebook is opened via the Colab badge) — adjust when running elsewhere.
%cd /content
# Remove any previous checkout so the clone below always starts from a clean directory.
!rm -rf b2-tokenized-treasury
!git clone https://github.com/lydialydia-lydia/b2-tokenized-treasury.git
# Later cells build their data paths from os.getcwd(), so the cwd must be the repo root.
%cd /content/b2-tokenized-treasury

# Sanity check: print the working directory and list its contents.
!pwd
!ls -la

/content
Cloning into 'b2-tokenized-treasury'...
remote: Enumerating objects: 30, done.[K
remote: Counting objects: 100% (30/30), done.[K
remote: Compressing objects: 100% (22/22), done.[K
remote: Total 30 (delta 7), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (30/30), 9.03 KiB | 1.81 MiB/s, done.
Resolving deltas: 100% (7/7), done.
/content/b2-tokenized-treasury
/content/b2-tokenized-treasury
total 32
drwxr-xr-x 4 root root 4096 Jan 20 20:48 .
drwxr-xr-x 1 root root 4096 Jan 20 20:47 ..
drwxr-xr-x 8 root root 4096 Jan 20 20:48 .git
-rw-r--r-- 1 root root 4688 Jan 20 20:48 .gitignore
-rw-r--r-- 1 root root 1073 Jan 20 20:48 LICENSE
-rw-r--r-- 1 root root   23 Jan 20 20:48 README.md
drwxr-xr-x 3 root root 4096 Jan 20 20:48 src


In [4]:
# Install minimal dependencies.
# %pip (instead of !pip) installs into the environment of the running kernel, so the
# packages are guaranteed to be importable by the cells below.
# yfinance is the price source; pyarrow is installed for the parquet files written later.
# TODO: pin exact versions (pkg==X.Y.Z) once the known-good versions are recorded.
%pip install -q yfinance pyarrow

In [5]:
# Imports
import os, json
import numpy as np
import pandas as pd
import yfinance as yf

# Data directories, resolved against the current working directory.
RAW_DIR, PROC_DIR = (
    os.path.join(os.getcwd(), "data", leaf) for leaf in ("raw", "processed")
)

# exist_ok=True keeps this cell safe to re-run when the folders already exist.
os.makedirs(RAW_DIR, exist_ok=True)
os.makedirs(PROC_DIR, exist_ok=True)

# Echo the resolved locations so the reader can see where files will land.
print("RAW_DIR:", RAW_DIR)
print("PROC_DIR:", PROC_DIR)

RAW_DIR: /content/b2-tokenized-treasury/data/raw
PROC_DIR: /content/b2-tokenized-treasury/data/processed


In [6]:
# Tickers requested from Yahoo Finance.
# UST proxy examples: BIL (1-3m), SHV (0-1y), SHY (1-3y), IEF (7-10y)
UST_TICKER = "BIL"
BTC_TICKER = "BTC-USD"

# Download window handed to the price fetch below.
START = "2020-01-01"
END = None  # None => latest
# Frequency of the gap-free calendar that prices are reindexed onto.
FREQ = "D"   # daily calendar
# Rolling window length (in rows of the aligned calendar) for the volatility feature.
VOL_WINDOW = 30
# Rolling volatility is multiplied by sqrt(ANN_FACTOR) to annualize it.
ANN_FACTOR = 365  # use 252 for TradFi convention

In [7]:
def fetch_adj_close(tickers, start="2020-01-01", end=None):
    """
    Fetch auto-adjusted Close prices from Yahoo Finance.

    Parameters
    ----------
    tickers : str or list of str
        One symbol or a list of symbols to download.
    start : str
        First date to request, forwarded to ``yf.download``.
    end : str or None
        Last date to request, forwarded to ``yf.download`` unchanged.

    Returns
    -------
    pd.DataFrame
        The "Close" prices with a DatetimeIndex sorted ascending. A result that
        comes back as a single Series is converted to a one-column frame named
        after the requested ticker.

    Raises
    ------
    ValueError
        If the download contains no usable prices (empty or entirely NaN), so a
        failed request is reported here instead of surfacing later as an
        unrelated error.
    """
    px = yf.download(tickers, start=start, end=end, auto_adjust=True, progress=False)["Close"]

    if isinstance(px, pd.Series):
        # A flat-column result yields a Series named "Close"; label the column
        # with the ticker so downstream column names stay meaningful.
        if isinstance(tickers, str):
            label = tickers
        else:
            symbols = list(tickers)
            label = symbols[0] if len(symbols) == 1 else px.name
        px = px.to_frame(name=label)

    # Fail fast when nothing usable came back.
    if px.dropna(how="all").empty:
        raise ValueError(
            f"No price data returned for {tickers!r} (start={start!r}, end={end!r})"
        )

    px.index = pd.to_datetime(px.index)
    return px.sort_index()

# Download both series with a single request, report the table size,
# and display the most recent rows.
prices_raw = fetch_adj_close(
    [UST_TICKER, BTC_TICKER],
    start=START,
    end=END,
)
print("Raw prices shape:", prices_raw.shape)
prices_raw.tail()

Raw prices shape: (2212, 2)


Ticker,BIL,BTC-USD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2026-01-16,91.540001,95525.117188
2026-01-17,,95099.921875
2026-01-18,,93634.429688
2026-01-19,,92553.59375
2026-01-20,91.550003,89581.21875


In [8]:
def clean_and_align(prices: pd.DataFrame, freq="D") -> pd.DataFrame:
    """
    Align all series to a daily calendar and forward-fill missing values.
    This is useful because BTC trades daily while ETFs do not.
    """
    prices = prices.copy()
    prices.columns = [c.replace("-USD", "").replace(" ", "_") for c in prices.columns]

    full_idx = pd.date_range(prices.index.min(), prices.index.max(), freq=freq)
    prices = prices.reindex(full_idx).ffill().dropna(how="all")
    return prices

# Put both series on the shared calendar, then report the size
# and display the most recent rows.
prices = clean_and_align(
    prices_raw,
    freq=FREQ,
)
print("Aligned prices shape:", prices.shape)
prices.tail()

Aligned prices shape: (2212, 2)


Unnamed: 0,BIL,BTC
2026-01-16,91.540001,95525.117188
2026-01-17,91.540001,95099.921875
2026-01-18,91.540001,93634.429688
2026-01-19,91.540001,92553.59375
2026-01-20,91.550003,89581.21875


In [9]:
def add_features(prices: pd.DataFrame, vol_window=30, ann_factor=365) -> pd.DataFrame:
    """
    Create baseline features used across later notebooks.

    For every column ``X`` of ``prices`` three columns are produced:
    - ``X_ret``: simple return, row over row
    - ``X_logret``: log return, row over row
    - ``X_vol_{vol_window}d``: rolling standard deviation of the log return
      over ``vol_window`` rows, multiplied by ``sqrt(ann_factor)``

    Parameters
    ----------
    prices : pd.DataFrame
        Price table; the result shares its index.
    vol_window : int
        Rolling window length in rows.
    ann_factor : int or float
        Annualization factor applied under the square root.

    Returns
    -------
    pd.DataFrame
        Feature table. Rows inside the rolling warm-up window are NaN in the
        volatility column; callers decide whether to drop them.
    """
    feats = pd.DataFrame(index=prices.index)
    for col in prices.columns:
        series = prices[col]
        # fill_method=None: never pad missing prices implicitly. The padded
        # default is deprecated in pandas and would report a 0 simple return
        # on a gap while the log return is NaN; now both are NaN together.
        feats[f"{col}_ret"] = series.pct_change(fill_method=None)
        feats[f"{col}_logret"] = np.log(series).diff()
        rolling_std = feats[f"{col}_logret"].rolling(vol_window).std()
        feats[f"{col}_vol_{vol_window}d"] = rolling_std * np.sqrt(ann_factor)
    return feats

# Build the feature table and drop every row that has any missing feature
# (this removes the rolling-window warm-up period).
features = (
    add_features(prices, vol_window=VOL_WINDOW, ann_factor=ANN_FACTOR)
    .dropna(how="any")
)
# Trim prices to exactly the surviving dates so both saved tables share one index.
prices_aligned = prices.loc[features.index].copy()

print("Features shape:", features.shape)
features.tail()

Features shape: (2181, 6)


Unnamed: 0,BIL_ret,BIL_logret,BIL_vol_30d,BTC_ret,BTC_logret,BTC_vol_30d
2026-01-16,0.000437,0.000437,0.002694,-0.000273,-0.000273,0.267503
2026-01-17,0.0,0.0,0.002709,-0.004451,-0.004461,0.26591
2026-01-18,0.0,0.0,0.002597,-0.01541,-0.01553,0.25558
2026-01-19,0.0,0.0,0.002597,-0.011543,-0.01161,0.259943
2026-01-20,0.000109,0.000109,0.002579,-0.032115,-0.032642,0.285862


In [10]:
# Output locations inside the processed-data folder (read by all later notebooks).
prices_path = os.path.join(PROC_DIR, "market_prices.parquet")
feats_path = os.path.join(PROC_DIR, "market_features.parquet")
dict_path = os.path.join(PROC_DIR, "data_dictionary.json")

# Write the two tables; both share the same date index.
prices_aligned.to_parquet(prices_path)
features.to_parquet(feats_path)

# Small data dictionary for reproducibility: what each file holds, which tickers
# and parameters produced it, and the date span that was actually saved.
data_dict = {
    "market_prices.parquet": {
        "index": "Daily date index (aligned with forward-fill)",
        "columns": dict.fromkeys(
            prices_aligned.columns,
            "Aligned daily close price (ffill on non-trading days)",
        ),
    },
    "market_features.parquet": {
        "index": "Daily date index (same as market_prices)",
        "columns": dict.fromkeys(
            features.columns,
            "Engineered feature (return/log return/rolling vol)",
        ),
    },
    "tickers": {"ust_proxy": UST_TICKER, "btc_proxy": BTC_TICKER},
    "date_range": {
        "start": str(prices_aligned.index.min().date()),
        "end": str(prices_aligned.index.max().date()),
    },
    "params": {"freq": FREQ, "vol_window": VOL_WINDOW, "ann_factor": ANN_FACTOR},
    "notes": [
        "ETF/UST proxy is forward-filled over non-trading days to align with BTC daily frequency.",
        "Rolling volatility annualization uses sqrt(365) by default. Use 252 for TradFi convention."
    ],
}

with open(dict_path, "w") as fh:
    json.dump(data_dict, fh, indent=2)

# Report what was written, with table shapes for a quick sanity check.
print("Saved:")
print(" -", prices_path, prices_aligned.shape)
print(" -", feats_path, features.shape)
print(" -", dict_path)

Saved:
 - /content/b2-tokenized-treasury/data/processed/market_prices.parquet (2181, 2)
 - /content/b2-tokenized-treasury/data/processed/market_features.parquet (2181, 6)
 - /content/b2-tokenized-treasury/data/processed/data_dictionary.json


In [11]:
# Verify outputs: list the processed-data folder to confirm that the two parquet
# files and the JSON data dictionary were written. The path is relative, so this
# relies on the working directory still being the repository root.
!ls -la data/processed

total 176
drwxr-xr-x 2 root root   4096 Jan 20 20:48 .
drwxr-xr-x 4 root root   4096 Jan 20 20:48 ..
-rw-r--r-- 1 root root   1250 Jan 20 20:48 data_dictionary.json
-rw-r--r-- 1 root root 118973 Jan 20 20:48 market_features.parquet
-rw-r--r-- 1 root root  44680 Jan 20 20:48 market_prices.parquet
