# 04 — Modeling: train/test split, models, evaluation

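We build predictive models on the engineered dataset using a chronological train/test split: the earliest 80% of observations are used for training and the most recent 20% are held out for testing. We report out-of-sample performance (RMSE, MAE, R²) for a naive zero-return baseline, a linear regression, and a random forest.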

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Render every float in pandas output with exactly six digits after the decimal point.
pd.set_option("display.float_format", "{:.6f}".format)

In [None]:
# Load the engineered feature/target table used by every model in this notebook.
features_path = Path("../data/model_features_2018_2024.parquet")
if not features_path.exists():
    # Explicit raise instead of a bare `assert`: assertions are stripped under
    # `python -O`, and an empty AssertionError does not say which file is missing.
    raise FileNotFoundError(f"Feature file not found: {features_path.resolve()}")

model_df = pd.read_parquet(features_path)

# The train/test split further down is positional, so the rows must be in
# chronological order: coerce the index to datetimes and sort on it.
model_df.index = pd.to_datetime(model_df.index)
model_df = model_df.sort_index()

# Last expression of the cell: shows the first rows as a quick sanity check.
model_df.head()

In [None]:
# Columns the models are trained to predict.
target_cols = ["XLE_target", "ICLN_target"]

# Fail fast when the loaded table lacks any of the targets.
missing_targets = set(target_cols).difference(model_df.columns)
assert not missing_targets, f"Missing target columns: {missing_targets}"

# Every remaining column is treated as a predictor; the original column order is kept.
is_feature = ~model_df.columns.isin(target_cols)
feature_cols = model_df.columns[is_feature].tolist()

# Independent copies, so later edits to X / y never touch model_df.
X = model_df.loc[:, feature_cols].copy()
y = model_df.loc[:, target_cols].copy()

print(f"Number of features: {X.shape[1]}")
print(f"Target columns: {target_cols}")

In [None]:
# Positional hold-out split: the first 80% of rows train the models and the
# remaining rows are kept for out-of-sample evaluation. No shuffling is done.
split_idx = int(X.shape[0] * 0.8)

X_train = X.iloc[:split_idx]
X_test = X.iloc[split_idx:]
y_train = y.iloc[:split_idx]
y_test = y.iloc[split_idx:]

# Report the date range and size of each partition.
print(f"Train period: {X_train.index.min()} to {X_train.index.max()}")
print(f"Test period: {X_test.index.min()} to {X_test.index.max()}")
print(f"Train shape: {X_train.shape} | Test shape: {X_test.shape}")

In [None]:
def evaluate_predictions(y_true: pd.DataFrame, y_pred: pd.DataFrame, model_name: str) -> dict:
    """Score a set of predictions against the observed targets.

    RMSE, MAE and R² are computed separately for every column of ``y_true``,
    using the column of the same name in ``y_pred``, plus the unweighted mean
    of each metric across those columns.

    Args:
        y_true: Observed values, one column per target.
        y_pred: Predicted values; must contain every column of ``y_true``.
        model_name: Label stored under the ``"model"`` key of the result.

    Returns:
        A flat dict with ``"model"`` followed by, for each metric prefix in
        the order ``rmse``, ``mae``, ``r2``: one ``"<prefix>_<target>"`` entry
        per target and then ``"<prefix>_avg"``.
    """
    target_names = y_true.columns.tolist()

    # (key prefix, scoring function) pairs, in the order they appear in the output.
    scorers = (
        ("rmse", lambda actual, predicted: np.sqrt(mean_squared_error(actual, predicted))),
        ("mae", mean_absolute_error),
        ("r2", r2_score),
    )

    results = {"model": model_name}
    for prefix, scorer in scorers:
        per_target = {name: scorer(y_true[name], y_pred[name]) for name in target_names}
        for name, value in per_target.items():
            results[f"{prefix}_{name}"] = value
        results[f"{prefix}_avg"] = float(np.mean(list(per_target.values())))

    return results

In [None]:
# Naive benchmark: predict 0.0 for every target on every test row.
zero_forecast = np.zeros((len(y_test.index), len(target_cols)))
y_pred_naive = pd.DataFrame(zero_forecast, index=y_test.index, columns=target_cols)

results_naive = evaluate_predictions(y_test, y_pred_naive, "Naive: zero return")
display(pd.Series(results_naive))

In [None]:
# Standardise the features, then fit one least-squares model on both targets.
# Because the scaler lives inside the pipeline, it is fitted on the training
# rows only and merely applied to the test rows at prediction time.
linreg_steps = [
    ("scaler", StandardScaler()),
    ("regressor", LinearRegression()),
]
linreg_pipeline = Pipeline(steps=linreg_steps)
linreg_pipeline.fit(X_train, y_train)

# Wrap the raw prediction array so it lines up with y_test by index and column.
linreg_test_pred = linreg_pipeline.predict(X_test)
y_pred_linreg = pd.DataFrame(linreg_test_pred, index=y_test.index, columns=target_cols)

results_linreg = evaluate_predictions(y_test, y_pred_linreg, "Linear Regression")
display(pd.Series(results_linreg))

In [None]:
# Random forest fitted on the raw (unscaled) features for both targets at once.
rf_params = {
    "n_estimators": 200,  # number of trees in the ensemble
    "random_state": 42,   # fixed seed so reruns give the same forest
    "n_jobs": -1,         # train trees in parallel on all available cores
}
rf_model = RandomForestRegressor(**rf_params)
rf_model.fit(X_train, y_train)

# Wrap the raw prediction array so it lines up with y_test by index and column.
rf_test_pred = rf_model.predict(X_test)
y_pred_rf = pd.DataFrame(rf_test_pred, index=y_test.index, columns=target_cols)

results_rf = evaluate_predictions(y_test, y_pred_rf, "Random Forest")
display(pd.Series(results_rf))

In [None]:
# One row per model, built from the metric dictionaries computed above.
model_results = [results_naive, results_linreg, results_rf]
results_df = pd.DataFrame(model_results)

# Put the model label first, then the metric columns in alphabetical order.
metric_cols = sorted(col for col in results_df.columns if col != "model")
results_df = results_df.loc[:, ["model", *metric_cols]]

# Rank the models from lowest to highest average RMSE when that column exists.
if "rmse_avg" in results_df:
    results_df = results_df.sort_values(by="rmse_avg")

display(results_df)

In [None]:
from pathlib import Path

# Write the hold-out targets and each model's predictions to ../outputs/results,
# creating the directory (and any missing parents) on first run.
outputs_dir = Path("..").joinpath("outputs")
results_dir = outputs_dir.joinpath("results")
results_dir.mkdir(parents=True, exist_ok=True)

# File name -> frame to persist; written in this order.
frames_to_save = {
    "y_test_targets.parquet": y_test,
    "y_pred_naive.parquet": y_pred_naive,
    "y_pred_linreg.parquet": y_pred_linreg,
    "y_pred_rf.parquet": y_pred_rf,
}
for file_name, frame in frames_to_save.items():
    frame.to_parquet(results_dir / file_name)

print(f"Saved predictions to: {results_dir.resolve()}")