In [102]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. the project's utils package) are reloaded before each cell runs
# and edits to them are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Quant ETF Features 📈
This notebook shows how to engineer features (moving averages, RSI) from daily and minute-level ETF OHLCV data.

## 1. Setup

In [96]:
import os,sys
import duckdb
from pathlib import Path
import pandas as pd
import json

In [101]:
# Project root is taken to be the parent of the current working directory
# (assumes the notebook is launched from a folder directly under the root).
PROJECT_ROOT = Path.cwd().parents[0]

# sys.path must hold plain strings: the import system ignores entries of any
# other type, and a Path never compares equal to an existing str entry.
# Converting first makes the project importable and keeps the membership
# check meaningful on re-runs.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

print(f"Project Root: {PROJECT_ROOT}")

Project Root: C:\Users\luyanda\workspace\QuantTrade


In [106]:
from utils.charts import render_lightweight_chart

In [107]:
from utils.duck import to_bt_daily_duckdb, to_bt_minute_duckdb

In [None]:
from utils.features import add_mas_duckdb, add_rsi_duckdb

SyntaxError: invalid syntax (588680001.py, line 1)

In [45]:
# Location of the daily DuckDB file under data/processed/dolt.
DB_DAILY = PROJECT_ROOT.joinpath("data", "processed", "dolt", "stocks.duckdb")
print(f"DB_DAILY: {DB_DAILY}")

# Open the database and list its tables (first column of each SHOW TABLES row).
con_daily = duckdb.connect(str(DB_DAILY))
tables = [
    row[0]
    for row in con_daily.execute("SHOW TABLES").fetchall()
]
print("📋 Tables:", tables)

DB_DAILY: C:\Users\luyanda\workspace\QuantTrade\data\processed\dolt\stocks.duckdb
📋 Tables: ['dividend', 'ohlcv', 'split', 'symbol']


In [46]:
# Location of the minute-bar DuckDB file under data/processed/alpaca.
DB_MINUTE = PROJECT_ROOT.joinpath(
    "data", "processed", "alpaca", "price_minute_alpaca.duckdb"
)
print(f"DB_MINUTE: {DB_MINUTE}")

# Open the database and list its tables (first column of each SHOW TABLES row).
con_minute = duckdb.connect(str(DB_MINUTE))
tables = [
    row[0]
    for row in con_minute.execute("SHOW TABLES").fetchall()
]
print("📋 Tables:", tables)

DB_MINUTE: C:\Users\luyanda\workspace\QuantTrade\data\processed\alpaca\price_minute_alpaca.duckdb
📋 Tables: ['alpaca_minute']


In [47]:
# Tickers loaded and checked by the cells below.
ETFS = ["SPY", "QQQ"]

# Folder that the rendered HTML charts are written to.
CHARTS_DIR = PROJECT_ROOT.joinpath("charts")

## 2. Helpers

## 3. Data Ingestion

In [51]:
# Load the daily bars for each ETF from the `ohlcv` table, keyed by ticker.
daily_data = {}
for sym in ETFS:
    daily_data[sym] = to_bt_daily_duckdb(
        con_daily,
        sym,
        table="ohlcv",
        date_col="date",
        symbol_col="act_symbol",
    )

# Print the most recent row per ticker to see how fresh the data is.
for symbol in ETFS:
    latest_daily_bar = daily_data[symbol].tail(1)
    print(latest_daily_bar)


              open    high     low   close      volume
datetime                                              
2025-08-12  638.29  642.85  636.79  642.69  64821798.0
              open    high     low   close      volume
datetime                                              
2025-08-12  575.16  580.35  572.49  580.05  42271441.0


In [52]:
# Load the minute bars for each ETF from the `alpaca_minute` table, keyed by ticker.
minute_data = {}
for sym in ETFS:
    minute_data[sym] = to_bt_minute_duckdb(con_minute, "alpaca_minute", sym)

# Print the most recent row per ticker to see how fresh the data is.
for symbol in ETFS:
    latest_minute_bar = minute_data[symbol].tail(1)
    print(latest_minute_bar)

                       open    high     low   close  volume  trade_count  \
datetime                                                                   
2025-08-13 14:36:00  644.34  644.34  644.34  644.34   100.0          1.0   

                       vwap  
datetime                     
2025-08-13 14:36:00  644.34  
                       open    high     low   close  volume  trade_count  \
datetime                                                                   
2025-08-13 14:55:00  582.45  582.45  582.45  582.45   340.0          2.0   

                       vwap  
datetime                     
2025-08-13 14:55:00  582.45  


## 4. Data Quality Checks

**U.S. market (SPY, QQQ)** — assuming regular NYSE/Nasdaq trading hours:

| **Session**     | **Hours (ET)**   | **Duration** |
| --------------- | ---------------- | ------------ |
| Regular session | 09:30 – 16:00 ET | 6.5 hours    |
|                 |                  | 390 minutes  |

Expect about 390 one-minute rows per ETF for each full regular-session trading day.

In [53]:
# One-minute bars in a full regular session (09:30–16:00 ET = 6.5 h).
EXPECTED_BARS_PER_DAY = 390

# Per-symbol coverage report for the minute data: row count, date span,
# weekdays with no bars at all, and days with fewer bars than a full session.
for symbol in ETFS:
    df = minute_data[symbol]
    print(f"\n🔍 {symbol}")
    print(f"  • Rows: {len(df)}")

    if df.empty:
        # min()/max() of an empty index are NaT, which pd.date_range rejects;
        # there is nothing to check, so report and move on.
        print("  • No rows — skipping coverage checks")
        continue

    print(f"  • Date Range: {df.index.min().date()} → {df.index.max().date()}")
    print(f"  • Timezone-aware: {df.index.tz is not None}")

    # --- Work on the index only (no need to copy the whole frame) ---
    bar_times = df.index
    if bar_times.tz is not None:
        # Drop the timezone but keep the local wall-clock time.
        bar_times = bar_times.tz_localize(None)

    # --- Calendar date of every bar, and the distinct dates present ---
    bar_dates = bar_times.normalize()
    available_dates = bar_dates.unique()

    # --- Construct full expected range (business days) ---
    # NOTE: freq="B" is Monday–Friday only and knows nothing about exchange
    # holidays, so market holidays are included in the "missing" count.
    expected_dates = pd.date_range(
        start=bar_dates.min(),
        end=bar_dates.max(),
        freq="B",
    )

    # --- Missing trading days entirely ---
    # Index.difference compares like with like (both DatetimeIndex) and
    # returns a sorted result; Python sets over mixed Timestamp/datetime64
    # values depend on the pandas version.
    missing_dates = expected_dates.difference(available_dates)
    print(f"  • Missing Intraday Dates: {len(missing_dates)}")
    # if len(missing_dates):
    #     print("    Example:", missing_dates[:5])

    # --- Check for partial trading days (fewer than a full session) ---
    counts = bar_dates.value_counts().sort_index()
    partial_days = counts[counts < EXPECTED_BARS_PER_DAY]
    print(f"  • Partial Intraday Days (<{EXPECTED_BARS_PER_DAY} rows): {len(partial_days)}")
    # if not partial_days.empty:
    #     print("    Example:", partial_days.head())



🔍 SPY
  • Rows: 192320
  • Date Range: 2023-08-09 → 2025-08-13
  • Timezone-aware: False
  • Missing Intraday Dates: 22
  • Partial Intraday Days (<390 rows): 279

🔍 QQQ
  • Rows: 187087
  • Date Range: 2023-08-09 → 2025-08-13
  • Timezone-aware: False
  • Missing Intraday Dates: 22
  • Partial Intraday Days (<390 rows): 306


## 5. Feature Engineering

In [None]:
# Add 20/50/200-period moving averages of `close` to the minute bars,
# computed through the minute-data DuckDB connection.
minute_data_ma = add_mas_duckdb(
    minute_data,
    con_minute,
    windows=[20, 50, 200],
    price_col="close",
)

# Latest row per ticker, to eyeball the result.
for ticker in ("SPY", "QQQ"):
    print(minute_data_ma[ticker].tail(1))

In [62]:
# 14-period RSI on the minute bars, computed through the minute-data connection.
minute_data_rsi = add_rsi_duckdb(
    minute_data,
    con_minute,
    period=14,
)

# Latest SPY row, to eyeball the result.
print(minute_data_rsi["SPY"].tail(1))

                       open    high     low   close  volume  trade_count  \
datetime                                                                   
2025-08-13 14:36:00  644.34  644.34  644.34  644.34   100.0          1.0   

                       vwap      rsi14  
datetime                                
2025-08-13 14:36:00  644.34  82.116788  


In [86]:
# SPY chart with an RSI pane and 30/70 guide levels.
# No ma_windows are passed here; add ma_windows=[20, 50, 200] to the options
# to request moving-average lines as well.
rsi_chart_options = dict(
    symbol="SPY",
    out_html=CHARTS_DIR / "spy_rsi.html",
    rsi_period=14,
    rsi_bounds=(30, 70),
    timeframes=["1m", "5m", "15m", "1h", "1d"],
    default_tf="5m",
    watermark_text="SPY — {tf}",
)
render_lightweight_chart(minute_data["SPY"], **rsi_chart_options)

WindowsPath('C:/Users/luyanda/workspace/QuantTrade/charts/spy_rsi.html')

In [None]:
# SPY chart with 20/50/200 moving-average windows; RSI is switched off
# by passing None for both the period and the bounds.
ma_chart_options = dict(
    symbol="SPY",
    out_html=CHARTS_DIR / "spy_20ma50ma200ma.html",
    ma_windows=[20, 50, 200],
    rsi_period=None,
    rsi_bounds=None,
    timeframes=["1m", "5m", "15m", "1h", "1d"],
    default_tf="5m",
    watermark_text="SPY — {tf}",
    watermark_opacity=0.07,
)
render_lightweight_chart(minute_data["SPY"], **ma_chart_options)

WindowsPath('C:/Users/luyanda/workspace/QuantTrade/charts/spy_20ma50ma200ma.html')