# XGBoost Multi-Output Regression with Nested CV
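This notebook trains a global XGBoost model to predict **HI, TMAX, RH** using multi-output regression (one regressor per target) with **nested cross-validation** (a time-series CV hyperparameter search run inside a chronological train/validation/test hold-out split) and early stopping on the validation split.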

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, TimeSeriesSplit, RandomizedSearchCV
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, r2_score

import xgboost as xgb

# Make sure the directory that receives saved models exists before anything is written to it
MODEL_DIR = "models"
os.makedirs(MODEL_DIR, exist_ok=True)


In [None]:
# Load all station CSVs from merged_datasets folder
folder = "merged_datasets"

# os.listdir() returns entries in arbitrary, platform-dependent order. Sort the
# file names so the row order of `data` is reproducible between runs and machines
# (later cells slice `data` by position).
csv_files = sorted(name for name in os.listdir(folder) if name.endswith(".csv"))
if not csv_files:
    # Fail early with a clear message instead of pd.concat's generic
    # "No objects to concatenate" error.
    raise FileNotFoundError(f"No .csv files found in folder {folder!r}")

all_data = []
for file in csv_files:
    # The station identifier is the file name without its ".csv" extension.
    station = os.path.splitext(file)[0]
    # assign() returns a new frame tagged with the station it came from.
    df = pd.read_csv(os.path.join(folder, file)).assign(Station=station)
    all_data.append(df)

# One global frame: the stations' rows stacked one file after another, with a fresh 0..n-1 index.
data = pd.concat(all_data, ignore_index=True)

print("Shape:", data.shape)
data.head()


In [None]:
# Split the columns of `data` into prediction targets, model inputs and the station label
target_cols = ["HI", "TMAX", "RH"]

# Everything that is neither a target nor an identifier/date column is used as a feature;
# the original column order of `data` is preserved.
non_feature_cols = set(target_cols) | {"Station", "YEAR", "MONTH", "DAY"}
feature_cols = [col for col in data.columns if col not in non_feature_cols]

X = data.loc[:, feature_cols]
y = data.loc[:, target_cols]
stations = data.loc[:, "Station"]


In [None]:
# Chronological 80-10-10 split
#
# `data` is a concatenation of per-station files, so its raw row order is
# "station by station", not chronological. Slicing the first 80% of rows would put
# whole stations in train and leave only the last station(s) for validation/test.
# Order the rows by calendar date first so each split covers a time period.
date_cols = ["YEAR", "MONTH", "DAY"]
if all(col in data.columns for col in date_cols):
    # Positional ordering (independent of the index labels of `data`).
    chrono_pos = (
        data[date_cols]
        .reset_index(drop=True)
        .sort_values(date_cols, kind="mergesort")
        .index.to_numpy()
    )
else:
    # No date columns available: fall back to the existing row order.
    # NOTE(review): in that case the split is only chronological if `data`
    # is already sorted by time — confirm for the dataset in use.
    chrono_pos = np.arange(len(data))

# New, explicitly named views; X / y / stations themselves are left untouched
# so this cell can be re-run safely.
X_chrono = X.iloc[chrono_pos]
y_chrono = y.iloc[chrono_pos]
stations_chrono = stations.iloc[chrono_pos]

n = len(data)
train_end = int(n * 0.8)
val_end = int(n * 0.9)

X_train, y_train = X_chrono.iloc[:train_end], y_chrono.iloc[:train_end]
X_val, y_val = X_chrono.iloc[train_end:val_end], y_chrono.iloc[train_end:val_end]
X_test, y_test = X_chrono.iloc[val_end:], y_chrono.iloc[val_end:]
stations_val = stations_chrono.iloc[train_end:val_end]
stations_test = stations_chrono.iloc[val_end:]

# Sanity check: the three splits partition the data exactly.
assert len(X_train) + len(X_val) + len(X_test) == n

print("Train:", X_train.shape, "Val:", X_val.shape, "Test:", X_test.shape)


In [None]:
# Base single-target XGBoost regressor, wrapped so that one copy is fitted per target column
xgb_est = xgb.XGBRegressor(
    random_state=42,
    tree_method="hist",
    eval_metric="rmse",
    objective="reg:squarederror",
)
multi_est = MultiOutputRegressor(estimator=xgb_est)

# Candidate values per XGBoost hyperparameter. RandomizedSearchCV addresses the
# wrapped regressor's parameters through the "estimator__" prefix.
xgb_search_space = {
    "n_estimators": [200, 500, 1000],
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [3, 5, 7],
    "subsample": [0.7, 0.8, 1.0],
    "colsample_bytree": [0.7, 0.8, 1.0],
}
param_grid = {
    f"estimator__{name}": candidates
    for name, candidates in xgb_search_space.items()
}

# Inner CV: expanding-window time-series folds over the training split
tscv = TimeSeriesSplit(n_splits=3)

# Randomly sample 10 parameter combinations and score each by negative MSE on the inner folds
search = RandomizedSearchCV(
    multi_est,
    param_grid,
    n_iter=10,
    cv=tscv,
    scoring="neg_mean_squared_error",
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
search.fit(X_train, y_train)

print("Best params:", search.best_params_)


In [None]:
# Refit the tuned configuration on the training split, one XGBoost model per target,
# using the validation split for early stopping, then save the fitted models.
best_params = search.best_params_

# The search reports keys as "estimator__<name>"; XGBRegressor expects the bare names.
xgb_best_params = {
    (key.split("__", 1)[1] if key.startswith("estimator__") else key): value
    for key, value in best_params.items()
}

final_xgb_kwargs = dict(
    objective="reg:squarederror",
    eval_metric="rmse",
    tree_method="hist",
    random_state=42,
    **xgb_best_params,
)

# Unfitted template carrying the tuned hyperparameters.
best_est = xgb.XGBRegressor(**final_xgb_kwargs)

# Use validation set for early stopping.
#
# MultiOutputRegressor.fit forwards extra keyword arguments unchanged to every
# sub-estimator, so "estimator__"-prefixed fit parameters are rejected by
# XGBRegressor.fit, and a shared eval_set would carry all target columns while
# each sub-model is trained on a single one. Fit each target's model explicitly
# so that every model is early-stopped against its own validation column.
EARLY_STOPPING_ROUNDS = 20
target_names = list(y_train.columns)
fitted_estimators = []
for target in target_names:
    # early_stopping_rounds is a constructor parameter in current XGBoost releases.
    target_model = xgb.XGBRegressor(
        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
        **final_xgb_kwargs,
    )
    target_model.fit(
        X_train,
        y_train[target],
        eval_set=[(X_val, y_val[target])],
        verbose=False,
    )
    fitted_estimators.append(target_model)

# Expose the per-target models through the MultiOutputRegressor interface so that
# later cells can keep calling final_model.predict(X) and final_model.estimators_.
# The fitted sub-models are attached directly in place of calling final_model.fit().
final_model = MultiOutputRegressor(best_est)
final_model.estimators_ = fitted_estimators

# Save model: one file per target (the models are independent boosters).
for target, target_model in zip(target_names, fitted_estimators):
    target_model.save_model(os.path.join("models", f"xgb_model_{target}.json"))

# Legacy path kept for backward compatibility; it contains ONLY the first target's model.
final_model.estimators_[0].save_model("models/xgb_model.json")


In [None]:
def evaluate_predictions(y_true, y_pred, stations_subset):
    """Compute RMSE and R² separately for every station.

    Rows are matched by position: row ``i`` of ``y_true``, ``y_pred`` and
    ``stations_subset`` must all describe the same sample. Inputs may be pandas
    objects or array-likes; pandas index labels are ignored.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,) or (n_samples, n_targets)
        Observed target values.
    y_pred : array-like of the same shape as ``y_true``
        Predicted target values.
    stations_subset : array-like of shape (n_samples,)
        Station label of each row.

    Returns
    -------
    pandas.DataFrame
        One row per station, in order of first appearance, with columns
        ``Station``, ``RMSE`` and ``R2``. The metrics are whatever
        ``mean_squared_error`` / ``r2_score`` return for that station's rows
        (a single number aggregated over all target columns). The frame has
        these columns even when there are no stations.

    Raises
    ------
    ValueError
        If the three inputs do not have the same number of rows.
    """
    # Work on plain arrays so the same positional mask selects rows from all inputs,
    # regardless of whether y_true carries a pandas index.
    y_true_arr = np.asarray(y_true)
    y_pred_arr = np.asarray(y_pred)
    station_arr = np.asarray(stations_subset)

    if not (len(y_true_arr) == len(y_pred_arr) == len(station_arr)):
        raise ValueError(
            "y_true, y_pred and stations_subset must have the same number of rows, "
            f"got {len(y_true_arr)}, {len(y_pred_arr)} and {len(station_arr)}"
        )

    results = []
    # pd.unique keeps the order in which stations first appear.
    for st in pd.unique(station_arr):
        mask = station_arr == st
        rmse = np.sqrt(mean_squared_error(y_true_arr[mask], y_pred_arr[mask]))
        r2 = r2_score(y_true_arr[mask], y_pred_arr[mask])
        results.append({"Station": st, "RMSE": rmse, "R2": r2})

    # Explicit columns keep the schema stable when `results` is empty.
    return pd.DataFrame(results, columns=["Station", "RMSE", "R2"])

# Predict on both held-out splits with the final model
val_pred = final_model.predict(X_val)
test_pred = final_model.predict(X_test)

# Per-station metrics for each split
df_val_metrics = evaluate_predictions(y_val, val_pred, stations_val)
df_test_metrics = evaluate_predictions(y_test, test_pred, stations_test)

# Show the validation table first, then the test table, each under its heading
for heading, metrics_table in (
    ("Validation Metrics:", df_val_metrics),
    ("Test Metrics:", df_test_metrics),
):
    print(heading)
    display(metrics_table)


In [None]:
# One figure per station in the test split: observed vs. predicted values of the
# first target column (HI), plotted in row order
y_test_values = y_test.to_numpy()
for station_name in stations_test.unique():
    # Positional mask selecting this station's rows in both y_test and test_pred
    station_mask = (stations_test == station_name).to_numpy()

    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(y_test_values[station_mask, 0], label="Actual HI")
    ax.plot(test_pred[station_mask, 0], label="Pred HI")
    ax.set_title(f"Station {station_name} - HI Prediction (Test)")
    ax.legend()
    plt.show()
