# 06 — Time-series validation and robustness checks

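This notebook runs a walk-forward (`TimeSeriesSplit`) validation on the engineered feature set, using the first 80% of observations; the final 20% is split off as a hold-out. For each target (XLE, ICLN) it compares a naive zero-return baseline against Linear Regression and Random Forest, and summarizes how stable the results are across folds. Outputs include fold-level metrics and summary tables saved to disk as CSV, plus one R² plot per target.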

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Render floats with six decimals in every DataFrame displayed below.
pd.set_option("display.float_format", "{:.6f}".format)

# Project layout, expressed relative to the notebook's working directory.
project_root = Path("..")
data_dir = project_root / "data"
outputs_dir = project_root / "outputs"
plots_dir = outputs_dir / "plots"
results_dir = outputs_dir / "results"
r2_timeseries_dir = plots_dir / "r2_timeseries"

# Create the output folders up front (data_dir is an input and is not created).
for output_folder in (plots_dir, results_dir, r2_timeseries_dir):
    output_folder.mkdir(parents=True, exist_ok=True)

In [None]:
# Parquet file holding the model features; stop early with a readable
# message if it is not where we expect it.
features_path = data_dir / "model_features_2018_2024.parquet"
assert features_path.exists(), f"Missing file: {features_path}"

data = pd.read_parquet(features_path)
print(f"Dataset shape: {data.shape}")

# Last expression of the cell: rich preview of the first rows.
data.head()

In [None]:
target_cols = ["XLE_target", "ICLN_target"]
date_candidates = ["Date", "date", "timestamp"]

# Validate the schema BEFORE indexing: selecting data[target_cols] with a
# missing target would raise a bare KeyError and this message would never show.
missing_targets = set(target_cols) - set(data.columns)
assert not missing_targets, f"Missing targets: {missing_targets}"

# First date-like column present in the frame, if any (None otherwise).
date_col = next((c for c in date_candidates if c in data.columns), None)

# Everything that is neither a target nor a date-like column is a feature.
exclude_cols = set(target_cols) | {c for c in date_candidates if c in data.columns}
feature_cols = [c for c in data.columns if c not in exclude_cols]
assert feature_cols, "No feature columns left after excluding targets and date columns"

X = data[feature_cols].copy()
y = data[target_cols].copy()

# Basic sanity check: column selection must not change the number of rows.
assert len(X) == len(y) == len(data)

# The split below is purely positional, so the rows must already be in
# chronological order. Check the date column when there is one; otherwise
# fall back to the index if it carries the dates.
if date_col is not None:
    assert data[date_col].is_monotonic_increasing, f"{date_col} is not sorted increasing"
elif isinstance(data.index, pd.DatetimeIndex):
    assert data.index.is_monotonic_increasing, "DatetimeIndex is not sorted increasing"

# Chronological hold-out: first 80% of rows for training / cross-validation,
# remaining rows kept aside as the test set.
train_frac = 0.80
train_size = int(len(data) * train_frac)

X_train, y_train = X.iloc[:train_size].copy(), y.iloc[:train_size].copy()
X_test,  y_test  = X.iloc[train_size:].copy(), y.iloc[train_size:].copy()

print("Total observations:", len(data))
print("Train size:", len(X_train))
print("Test size:", len(X_test))

In [None]:
def compute_metrics(y_true, y_pred) -> dict:
    """Score one set of predictions against the observed values.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, aligned with ``y_true``.

    Returns
    -------
    dict
        ``n`` (number of observations, int) plus ``RMSE``, ``MAE`` and ``R2``
        as plain Python floats.
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)

    n_obs = int(len(actual))
    # RMSE is the square root of sklearn's mean squared error.
    rmse = float(np.sqrt(mean_squared_error(actual, predicted)))
    mae = float(mean_absolute_error(actual, predicted))
    r2 = float(r2_score(actual, predicted))

    return {"n": n_obs, "RMSE": rmse, "MAE": mae, "R2": r2}

In [None]:
# Number of walk-forward folds carved out of the training window.
n_splits = 5

# Fixed validation-window length so every fold is scored on the same number
# of rows; integer division keeps it a whole number of observations.
test_size = len(X_train) // (n_splits + 1)

tscv = TimeSeriesSplit(test_size=test_size, n_splits=n_splits)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

results = []
targets = target_cols

for split_id, (train_idx, val_idx) in enumerate(tscv.split(X_train), start=1):
    print(f"Split {split_id}: train={len(train_idx)}, val={len(val_idx)}")

    # The feature matrices are shared by every target within a fold.
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]

    for target in targets:
        y_tr = y_train[target].iloc[train_idx]
        y_val = y_train[target].iloc[val_idx]

        # Scaling sits inside the pipeline, so it is fitted on this fold's
        # training rows only and merely applied to the validation rows.
        lr_pipeline = Pipeline(
            steps=[("scaler", StandardScaler()), ("model", LinearRegression())]
        )
        rf_model = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)

        # Dict entries are evaluated top to bottom: naive benchmark (always
        # predicts a zero return), then linear regression, then random forest.
        predictions = {
            "naive": np.zeros(len(y_val)),
            "linear_regression": lr_pipeline.fit(X_tr, y_tr).predict(X_val),
            "random_forest": rf_model.fit(X_tr, y_tr).predict(X_val),
        }

        # One record per (split, target, model).
        results.extend(
            {
                **compute_metrics(y_val, y_pred),
                "split": split_id,
                "target": target,
                "model": model_name,
            }
            for model_name, y_pred in predictions.items()
        )

# Put the identifying columns first, metrics after.
column_order = ["split", "target", "model", "n", "RMSE", "MAE", "R2"]
cv_results = pd.DataFrame(results)[column_order]

cv_results.head()

In [None]:
# Named-aggregation spec: average fold size, then mean and standard deviation
# across folds for each error metric.
agg_spec = {"mean_n": ("n", "mean")}
for metric in ("RMSE", "MAE", "R2"):
    agg_spec[f"mean_{metric}"] = (metric, "mean")
    agg_spec[f"std_{metric}"] = (metric, "std")

# One summary row per (target, model) pair.
per_target_model = cv_results.groupby(["target", "model"], as_index=False)
cv_summary = per_target_model.agg(**agg_spec)

cv_summary

In [None]:
# Destination files for the fold-level table and the per-model summary.
cv_results_path = results_dir / "cv_results_timeseries.csv"
cv_summary_path = results_dir / "cv_summary_timeseries.csv"

# Write both tables without the positional index column.
for frame, destination in ((cv_results, cv_results_path), (cv_summary, cv_summary_path)):
    frame.to_csv(destination, index=False)

# Display the two paths as the cell output.
(cv_results_path, cv_summary_path)

In [None]:
for target in target_cols:
    # Fold-level rows for this target only, ordered by model then split.
    subset = cv_results.loc[cv_results["target"] == target].sort_values(["model", "split"])

    fig, ax = plt.subplots(figsize=(10, 4))

    # One line per model; groupby yields the models in sorted order, the same
    # order in which they appear in the sorted subset.
    for model_name, model_data in subset.groupby("model"):
        model_data = model_data.sort_values("split")
        ax.plot(model_data["split"], model_data["R2"], marker="o", label=model_name)

    ax.set(
        title=f"{target} — R2 across TimeSeriesSplit folds",
        xlabel="Split",
        ylabel="R2",
    )
    # Show exactly one tick per fold.
    ax.set_xticks(sorted(subset["split"].unique()))
    ax.legend()

    fig.tight_layout()
    save_path = r2_timeseries_dir / f"r2_timeseries_{target}.png"
    fig.savefig(save_path, dpi=300)
    plt.show()
    # Release the figure so figures do not pile up across loop iterations.
    plt.close(fig)

## Interpretation

We compare fold-level and average out-of-sample metrics (RMSE, MAE, R²) for the naive baseline, Linear Regression, and Random Forest on XLE and ICLN. The main focus is (i) whether any model consistently improves over the baseline, and (ii) how stable performance is across time splits. Results should be interpreted as predictive stability checks on the chosen feature set rather than evidence of structural predictability.