# Stage 09 — Feature Engineering (ASML)
Create momentum / volatility / calendar features based on EDA. Save engineered dataset.


Imports & paths

In [1]:
from __future__ import annotations
import sys
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
from dotenv import load_dotenv

# Resolve project root when running from project/notebooks
project_root = Path.cwd().resolve().parents[0] if Path.cwd().name == "notebooks" else Path.cwd().resolve()
sys.path.append(str(project_root / "src"))

load_dotenv(project_root / ".env")

from storage import env_paths, read_df, write_df
from features import (
    add_basic_returns, add_momentum_features, add_volatility_features,
    add_calendar_features, add_interaction_features, add_next_day_targets
)

RAW_DIR, PROC_DIR = env_paths(project_root)
def stamp(): return datetime.now().strftime("%Y%m%d-%H%M")
RAW_DIR, PROC_DIR


(WindowsPath('C:/Users/melin/OneDrive/Desktop/nyu/python/bootcamp_panagiotis_housos/project/data/raw'),
 WindowsPath('C:/Users/melin/OneDrive/Desktop/nyu/python/bootcamp_panagiotis_housos/project/data/processed'))

Load base dataset (prefer CLEANED from Stage 06)

In [2]:
clean_candidates = sorted(PROC_DIR.glob("asml_cleaned_*.parquet")) + sorted(PROC_DIR.glob("asml_cleaned_*.csv"))
raw_candidates   = sorted(RAW_DIR.glob("api_*ASML_*.csv"))

if clean_candidates:
    path = clean_candidates[-1]
    print("Using CLEANED:", path.name)
    df = read_df(path, parse_dates=["date"])
elif raw_candidates:
    path = raw_candidates[-1]
    print("Using RAW:", path.name)
    df = read_df(path, parse_dates=["date"])
else:
    raise FileNotFoundError("No ASML files found. Run Stage 04/06 first.")

df = df.sort_values("date").reset_index(drop=True)
df.head()


Using CLEANED: asml_cleaned_20250818-2218.csv


Unnamed: 0,date,open,high,low,close,adjusted_close,volume
0,2020-08-17,376.660004,380.5,376.25,378.51001,364.91925,428200
1,2020-08-18,383.609985,383.980011,376.98999,377.220001,363.675568,466500
2,2020-08-19,380.390015,380.429993,373.959991,374.940002,361.477417,354400
3,2020-08-20,369.540009,373.380005,368.380005,372.170013,358.806976,498200
4,2020-08-21,366.149994,372.130005,365.540009,372.119995,358.758667,605400


Feature 1: Momentum

### Feature 1 — Momentum (ma_5, ma_21, mom_5, mom_21, rsi_14)
**Rationale:** EDA suggested trends/overextensions; distance from MAs and RSI quantify momentum/mean-reversion pressure.


In [3]:
df1 = add_basic_returns(df)
df1 = add_momentum_features(df1)
df1[["date","adjusted_close","ma_5","ma_21","mom_5","mom_21","rsi_14"]].tail()


Unnamed: 0,date,adjusted_close,ma_5,ma_21,mom_5,mom_21,rsi_14
1251,2025-08-11,721.309998,707.468005,725.39381,0.019566,-0.00563,55.233754
1252,2025-08-12,741.789978,717.9,722.301429,0.033278,0.026981,57.756141
1253,2025-08-13,755.570007,730.821997,719.089524,0.033863,0.050731,59.189821
1254,2025-08-14,755.210022,739.240002,719.125715,0.021603,0.050178,64.420689
1255,2025-08-15,742.159973,743.207996,718.994763,-0.00141,0.032219,54.147066


Feature 2: Volatility

### Feature 2 — Volatility (vol_21, range_21)
**Rationale:** Heavy tails & volatility clustering → rolling std and average intraday range capture regimes that can shift predictability.


In [4]:
df2 = add_volatility_features(df1)
df2[["date","ret","vol_21","range_21"]].tail()


Unnamed: 0,date,ret,vol_21,range_21
1251,2025-08-11,-0.001398,0.025293,0.017591
1252,2025-08-12,0.028393,0.026223,0.017515
1253,2025-08-13,0.018577,0.026152,0.017346
1254,2025-08-14,-0.000476,0.018746,0.016218
1255,2025-08-15,-0.01728,0.018931,0.015942


Feature 3: Calendar + Interaction

### Feature 3 — Calendar time + interactions (dow one-hots; ret × vol)
**Rationale:** Weekday effects and regime interactions can explain structure not visible to simple AR(1).


In [5]:
df3 = add_calendar_features(df2)
df3 = add_interaction_features(df3)
df3.filter(regex="^dow_|ret_x_vol21$").tail()


Unnamed: 0,dow_0,dow_1,dow_2,dow_3,dow_4,ret_x_vol21
1251,1,0,0,0,0,-3.5e-05
1252,0,1,0,0,0,0.000745
1253,0,0,1,0,0,0.000486
1254,0,0,0,1,0,-9e-06
1255,0,0,0,0,1,-0.000327


Targets (next-day)

In [6]:
df4 = add_next_day_targets(df3)
df4[["date","ret","y_next_ret","y_next_up"]].tail()


Unnamed: 0,date,ret,y_next_ret,y_next_up
1251,2025-08-11,-0.001398,0.028393,1
1252,2025-08-12,0.028393,0.018577,1
1253,2025-08-13,0.018577,-0.000476,0
1254,2025-08-14,-0.000476,-0.01728,0
1255,2025-08-15,-0.01728,,0


correlations with target

In [7]:
cols = [c for c in ["mom_5","mom_21","rsi_14","vol_21","range_21","ret","ret_lag1","ret_x_vol21","y_next_ret"] if c in df4.columns]
corr = df4[cols].corr(numeric_only=True)
corr


Unnamed: 0,mom_5,mom_21,rsi_14,vol_21,range_21,ret,ret_lag1,ret_x_vol21,y_next_ret
mom_5,1.0,0.642829,0.464864,0.005693,0.000697,0.72387,0.508605,0.690148,-0.049297
mom_21,0.642829,1.0,0.892655,-0.114111,-0.14966,0.366521,0.326735,0.343922,-0.032767
rsi_14,0.464864,0.892655,1.0,-0.190498,-0.227685,0.240605,0.24123,0.203762,-0.032731
vol_21,0.005693,-0.114111,-0.190498,1.0,0.839067,0.01365,0.012718,0.04567,0.014318
range_21,0.000697,-0.14966,-0.227685,0.839067,1.0,0.016198,0.006415,0.055226,0.022978
ret,0.72387,0.366521,0.240605,0.01365,0.016198,1.0,-0.055776,0.959534,-0.055776
ret_lag1,0.508605,0.326735,0.24123,0.012718,0.006415,-0.055776,1.0,-0.058457,0.025356
ret_x_vol21,0.690148,0.343922,0.203762,0.04567,0.055226,0.959534,-0.058457,1.0,-0.048662
y_next_ret,-0.049297,-0.032767,-0.032731,0.014318,0.022978,-0.055776,0.025356,-0.048662,1.0


Save engineered dataset to project/data/processed/

In [8]:
out_csv  = PROC_DIR / f"asml_features_{stamp()}.csv"
out_parq = PROC_DIR / f"asml_features_{stamp()}.parquet"
write_df(df4, out_csv)
write_df(df4, out_parq)
out_csv, out_parq


(WindowsPath('C:/Users/melin/OneDrive/Desktop/nyu/python/bootcamp_panagiotis_housos/project/data/processed/asml_features_20250821-1006.csv'),
 WindowsPath('C:/Users/melin/OneDrive/Desktop/nyu/python/bootcamp_panagiotis_housos/project/data/processed/asml_features_20250821-1006.parquet'))