# 03 — Feature engineering & target construction

Inputs: daily log returns from Notebook 02.  
Output: a single time-aligned dataset with (i) lag/rolling features and (ii) a 1-day-ahead prediction target, ready for modeling in Notebook 04+.

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

In [2]:
# Load the daily log-return table and put it on a sorted datetime index.
log_returns_path = Path("..") / "data" / "log_returns_2018_2024.parquet"
assert log_returns_path.exists(), f"Missing file: {log_returns_path}"

log_returns = pd.read_parquet(log_returns_path)

# Coerce the index to datetimes first, then order the rows chronologically;
# the shift/rolling features built later operate on row order.
log_returns.index = pd.to_datetime(log_returns.index)
log_returns = log_returns.sort_index()

# Sanity checks on the result of the two steps above.
assert (
    isinstance(log_returns.index, pd.DatetimeIndex)
    and log_returns.index.is_monotonic_increasing
)

In [3]:
# Expose the "CL=F" column under the readable name "WTI".
# rename() leaves the columns untouched when "CL=F" is absent (its default is
# errors="ignore"), so no explicit membership test is needed.
log_returns = log_returns.rename(columns={"CL=F": "WTI"})

# Either the rename happened or the column was already called "WTI".
assert "WTI" in log_returns.columns

In [4]:
# Lagged-return features: one column per (asset, lag) pair.
assets = ["WTI", "XLE", "ICLN"]
missing = [name for name in assets if name not in log_returns.columns]
assert not missing, f"Missing columns: {missing}"

lag_days = [1, 2, 3, 5, 10]

# Start from an empty frame that shares the returns' index so that every
# feature column is aligned to the same rows.
features = pd.DataFrame(index=log_returns.index)

# Asset-major, lag-minor order determines the column order of `features`.
for col, lag in [(a, k) for a in assets for k in lag_days]:
    # shift(lag) places the value from `lag` rows earlier on the current row.
    features[f"{col}_ret_lag{lag}"] = log_returns[col].shift(periods=lag)

assert features.index.equals(log_returns.index)

In [5]:
# Rolling mean and standard deviation of each asset's returns, appended to
# `features` as <asset>_rollmean_<w>d / <asset>_rollstd_<w>d.
rolling_windows = [5, 10, 20]

for col, window in [(a, w) for a in assets for w in rolling_windows]:
    # min_periods=window: a value is produced only once the window holds
    # `window` non-missing observations; earlier rows stay NaN.
    r = log_returns[col].rolling(window, min_periods=window)
    features[f"{col}_rollmean_{window}d"] = r.mean()
    features[f"{col}_rollstd_{window}d"] = r.std()

In [6]:
# Prediction targets: future returns of the target assets.
target_assets = ["XLE", "ICLN"]
missing = [name for name in target_assets if name not in log_returns.columns]
assert not missing, f"Missing target columns: {missing}"

target_horizon = 1

# shift(-h) pulls each value h rows earlier, so the row for date t carries the
# return observed h rows later; the last h rows become NaN.
# Each column is then labelled "<asset>_target".
targets = log_returns[target_assets].shift(-target_horizon).add_suffix("_target")

In [7]:
# Assemble the modeling table: features and targets side by side, keeping only
# rows where every column is populated.
assert features.index.equals(targets.index), (
    "features and targets must share the same index before joining"
)

model_df = features.join(targets)

# Rows containing NaN are removed here. NaNs come from the shifted/rolling
# feature columns and from the negatively shifted target columns.
n_before = len(model_df)
model_df = model_df.dropna()
n_after = len(model_df)

# Guard against silently losing most of the sample. The messages report the
# row counts so a failure can be diagnosed without re-running the cell.
assert n_after > 0, f"No complete rows left after dropna (started with {n_before})"
assert n_after / n_before > 0.9, (
    f"dropna kept only {n_after} of {n_before} rows "
    f"({n_after / n_before:.1%}); expected more than 90%"
)

In [8]:
# Persist the cleaned modeling table as a parquet file.
# NOTE(review): this file name says 2008_2024, while the input loaded earlier
# is log_returns_2018_2024.parquet. Confirm which year range is intended
# before renaming, since other notebooks may read this exact path.
model_features_path = Path("..") / "data" / "model_features_2008_2024.parquet"
model_features_path.parent.mkdir(exist_ok=True, parents=True)

# Refuse to write an empty table.
assert len(model_df) >= 1

model_df.to_parquet(model_features_path)