# 04 — Modeling: train/test split, models, evaluation

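We build predictive models for the two targets (`XLE_target`, `ICLN_target`) on the engineered dataset using a chronological 80/20 train/test split: the earliest 80% of rows are used for training and the remaining 20% for testing. We report out-of-sample performance (RMSE, MAE, R²) for a naive zero-return baseline, a linear regression, and a random forest.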

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Render every float in pandas output with six fixed decimals.
pd.set_option("display.float_format", "{:.6f}".format)

In [2]:
# Engineered feature table; the path is relative to the notebook's working directory.
features_path = Path("../data/model_features_2018_2024.parquet")
# Fail early with the resolved location so a wrong working directory is obvious.
assert features_path.exists(), f"Feature file not found: {features_path.resolve()}"

model_df = pd.read_parquet(features_path)
# Parse the index as datetimes and order the rows chronologically: the
# train/test split further down slices by row position, so row order matters.
model_df.index = pd.to_datetime(model_df.index)
model_df = model_df.sort_index()

model_df.head()

Unnamed: 0_level_0,WTI_ret_lag1,WTI_ret_lag2,WTI_ret_lag3,WTI_ret_lag5,WTI_ret_lag10,XLE_ret_lag1,XLE_ret_lag2,XLE_ret_lag3,XLE_ret_lag5,XLE_ret_lag10,...,XLE_rollmean_20d,XLE_rollstd_20d,ICLN_rollmean_5d,ICLN_rollstd_5d,ICLN_rollmean_10d,ICLN_rollstd_10d,ICLN_rollmean_20d,ICLN_rollstd_20d,XLE_target,ICLN_target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-31,-0.0163,-0.008808,0.009571,0.017528,0.003759,-0.020379,-0.01521,0.005433,-0.002699,0.008726,...,0.000944,0.011001,0.000209,0.002264,-0.001143,0.003194,0.000578,0.006702,0.010235,-0.005238
2018-02-01,0.00356,-0.0163,-0.008808,-0.001525,-0.000313,0.000668,-0.020379,-0.01521,-0.007882,-0.008203,...,0.000713,0.010739,-0.001465,0.002627,-0.000941,0.002805,0.000422,0.006803,-0.042686,-0.022306
2018-02-02,0.016395,0.00356,-0.0163,0.009571,-0.009111,0.010235,0.000668,-0.020379,0.005433,-0.001309,...,-0.001723,0.014378,-0.005926,0.009491,-0.003379,0.007139,-0.000321,0.008345,-0.04286,-0.031645
2018-02-05,-0.005333,0.016395,0.00356,-0.008808,0.001892,-0.042686,0.010235,0.000668,-0.01521,0.021373,...,-0.003846,0.017057,-0.011629,0.014589,-0.006544,0.011284,-0.002063,0.010837,0.007891,0.00883
2018-02-06,-0.020062,-0.005333,0.016395,-0.0163,0.015318,-0.04286,-0.042686,0.010235,-0.020379,-0.001539,...,-0.00375,0.01712,-0.009863,0.016726,-0.005349,0.012276,-0.002361,0.010423,-0.016863,-0.00551


In [3]:
# The two columns the models predict; every other column is treated as an input.
target_cols = ["XLE_target", "ICLN_target"]
missing_targets = set(target_cols).difference(model_df.columns)
assert not missing_targets, f"Missing target columns: {missing_targets}"

feature_cols = [name for name in model_df.columns if name not in target_cols]

# Work on copies so later steps never modify model_df itself.
X = model_df.loc[:, feature_cols].copy()
y = model_df.loc[:, target_cols].copy()

print("Number of features:", X.shape[1])
print("Target columns:", target_cols)

Number of features: 33
Target columns: ['XLE_target', 'ICLN_target']


In [4]:
# Share of the chronologically sorted rows used for training; the remaining
# rows (the most recent ones) form the hold-out test set.
TRAIN_FRACTION = 0.8

split_idx = int(len(X) * TRAIN_FRACTION)

# Positional slicing keeps the time order: everything before split_idx is
# training data, everything from split_idx onwards is test data.
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

# Guard the split: both sides must contain rows, and the test period must
# start strictly after the training period ends (no look-ahead overlap).
assert len(X_train) > 0 and len(X_test) > 0, "Train/test split produced an empty set"
assert X_train.index.max() < X_test.index.min(), "Train and test periods overlap"

print("Train period:", X_train.index.min(), "to", X_train.index.max())
print("Test period:", X_test.index.min(), "to", X_test.index.max())
print("Train shape:", X_train.shape, "| Test shape:", X_test.shape)

Train period: 2018-01-31 00:00:00 to 2023-08-11 00:00:00
Test period: 2023-08-14 00:00:00 to 2024-12-30 00:00:00
Train shape: (1390, 33) | Test shape: (348, 33)


In [5]:
def evaluate_predictions(y_true: pd.DataFrame, y_pred: pd.DataFrame, model_name: str) -> dict:
    """Score predictions per target column and averaged across targets.

    Each column of ``y_true`` is scored against the same-named column of
    ``y_pred`` with RMSE, MAE and R².

    Parameters
    ----------
    y_true : pd.DataFrame
        Observed values; its columns define the targets that are scored.
    y_pred : pd.DataFrame
        Predicted values; must contain a column for every column of ``y_true``.
    model_name : str
        Label stored under the ``"model"`` key of the result.

    Returns
    -------
    dict
        ``{"model": model_name}`` followed, for each of ``rmse``, ``mae`` and
        ``r2`` in that order, by one ``"<metric>_<target>"`` entry per target
        and a ``"<metric>_avg"`` entry holding the unweighted mean over targets.
    """
    target_names = list(y_true.columns)

    # (key prefix, scoring function) in the order the keys appear in the result.
    scorers = (
        ("rmse", lambda actual, fitted: np.sqrt(mean_squared_error(actual, fitted))),
        ("mae", mean_absolute_error),
        ("r2", r2_score),
    )

    summary = {"model": model_name}
    for prefix, scorer in scorers:
        per_target = {name: scorer(y_true[name], y_pred[name]) for name in target_names}
        for name, value in per_target.items():
            summary[f"{prefix}_{name}"] = value
        # Unweighted mean across targets, stored as a plain Python float.
        summary[f"{prefix}_avg"] = float(np.mean(list(per_target.values())))

    return summary

In [6]:
# Baseline forecast: a zero return for every test date and every target.
y_pred_naive = pd.DataFrame(
    np.zeros((len(y_test), len(target_cols))),
    index=y_test.index,
    columns=target_cols,
)

results_naive = evaluate_predictions(y_test, y_pred_naive, "Naive: zero return")
display(pd.Series(results_naive))

model               Naive: zero return
rmse_XLE_target               0.011603
rmse_ICLN_target              0.015938
rmse_avg                      0.013771
mae_XLE_target                0.008957
mae_ICLN_target               0.012119
mae_avg                       0.010538
r2_XLE_target                -0.000002
r2_ICLN_target               -0.004429
r2_avg                       -0.002215
dtype: object

In [7]:
# Standardise the inputs, then fit a linear regression on both targets at once.
# The scaler is a pipeline step, so it is fitted on the training rows only.
linreg_pipeline = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("regressor", LinearRegression()),
    ]
).fit(X_train, y_train)

# Wrap the raw prediction array so it shares the test index and target names.
y_pred_linreg = pd.DataFrame(
    linreg_pipeline.predict(X_test), index=y_test.index, columns=target_cols
)

results_linreg = evaluate_predictions(y_test, y_pred_linreg, "Linear Regression")
display(pd.Series(results_linreg))

model               Linear Regression
rmse_XLE_target              0.012148
rmse_ICLN_target             0.016351
rmse_avg                     0.014250
mae_XLE_target               0.009587
mae_ICLN_target              0.012443
mae_avg                      0.011015
r2_XLE_target               -0.096207
r2_ICLN_target              -0.057151
r2_avg                      -0.076679
dtype: object

In [8]:
# 200 trees with a fixed random_state so repeated runs build the same forest.
rf_model = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

# Wrap the raw prediction array so it shares the test index and target names.
y_pred_rf = pd.DataFrame(
    rf_model.predict(X_test), index=y_test.index, columns=target_cols
)

results_rf = evaluate_predictions(y_test, y_pred_rf, "Random Forest")
display(pd.Series(results_rf))

model               Random Forest
rmse_XLE_target          0.012710
rmse_ICLN_target         0.016195
rmse_avg                 0.014453
mae_XLE_target           0.009676
mae_ICLN_target          0.012171
mae_avg                  0.010924
r2_XLE_target           -0.199839
r2_ICLN_target          -0.037100
r2_avg                  -0.118469
dtype: object

In [9]:
# One row per model, built from the result dictionaries of the cells above.
results_df = pd.DataFrame([results_naive, results_linreg, results_rf])

# "model" first, then the metric columns in alphabetical order.
metric_cols = sorted(col for col in results_df.columns if col != "model")
results_df = results_df.loc[:, ["model", *metric_cols]]

# Put the model with the lowest average RMSE on top.
if "rmse_avg" in results_df:
    results_df = results_df.sort_values(by="rmse_avg")

display(results_df)

Unnamed: 0,model,mae_ICLN_target,mae_XLE_target,mae_avg,r2_ICLN_target,r2_XLE_target,r2_avg,rmse_ICLN_target,rmse_XLE_target,rmse_avg
0,Naive: zero return,0.012119,0.008957,0.010538,-0.004429,-2e-06,-0.002215,0.015938,0.011603,0.013771
1,Linear Regression,0.012443,0.009587,0.011015,-0.057151,-0.096207,-0.076679,0.016351,0.012148,0.01425
2,Random Forest,0.012171,0.009676,0.010924,-0.0371,-0.199839,-0.118469,0.016195,0.01271,0.014453


In [10]:
from pathlib import Path

outputs_dir = Path("..") / "outputs"
results_dir = outputs_dir / "results"
results_dir.mkdir(parents=True, exist_ok=True)

# Persist the test targets and each model's predictions, one parquet file per frame.
for file_name, frame in (
    ("y_test_targets.parquet", y_test),
    ("y_pred_naive.parquet", y_pred_naive),
    ("y_pred_linreg.parquet", y_pred_linreg),
    ("y_pred_rf.parquet", y_pred_rf),
):
    frame.to_parquet(results_dir / file_name)

print(f"Saved predictions to: {results_dir.resolve()}")

Saved predictions to: /files/oil-energy-project/outputs/results
