# Notebook 05 — Erweiterte Modellierung mit HistGBR (Train-Tuning → Hold-out Evaluation)

Dieses Notebook deckt folgende Anforderungen ab:
- **Anforderung 6:** Modellauswahl und Hyperparameter
- **Anforderung 7:** Training
- **Anforderung 8:** Evaluation und Ergebnisse (Hold-out)

Vorgehen:
1. Definition eines erweiterten Modellansatzes (HistGradientBoostingRegressor)
2. Mini-Tuning der Hyperparameter auf dem Trainingssplit (kleines Raster)
3. Training mit der gewählten Konfiguration
4. Evaluation auf dem Hold-out Datensatz und Vergleich zur Baseline aus Notebook 04

No-Leak Regel:
- `elapsed_time` wird nicht als Feature genutzt.

Outputs:
- `data_derived/05_histgbr_mini_tuning_results.csv`
- `data_derived/05_histgbr_selected_params.csv`
- `data_derived/05_holdout_metrics_histgbr.csv`
- `data_derived/05_holdout_predictions_histgbr.csv`
- `data_derived/05_feature_set_histgbr.csv`
- `data_derived/05_holdout_comparison_baseline_vs_histgbr.csv`

In [1]:
from __future__ import annotations

from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Single seed shared across the notebook: it is passed as `random_state` to the
# train/hold-out split, the K-fold splitter and the model so reruns are reproducible.
SEED = 42
# Additionally seed NumPy's global RNG for any code that draws from it.
np.random.seed(SEED)

def find_repo_root(start: Path) -> Path:
    """Locate the repository root by walking upward from *start*.

    The root is the first directory (the resolved *start* itself, then each of
    its parents from nearest to farthest) that contains a ``data/processed``
    sub-directory.

    Args:
        start: Directory from which the upward search begins.

    Returns:
        The first matching directory, or the resolved *start* when no
        directory on the way up contains ``data/processed``.
    """
    origin = start.resolve()
    candidates = (origin, *origin.parents)
    # next() with a default reproduces the "fall back to start" behaviour.
    return next(
        (folder for folder in candidates if (folder / "data" / "processed").exists()),
        origin,
    )

# Resolve the repository root starting from the current working directory.
REPO_ROOT = find_repo_root(Path.cwd())

# Input table for modelling and the directory that receives this notebook's CSV exports.
PATH_MODEL_READY = REPO_ROOT / "data" / "processed" / "model_ready.csv"
PATH_DERIVED = REPO_ROOT / "data_derived"
# Create the export directory up front; a pre-existing directory is not an error.
PATH_DERIVED.mkdir(parents=True, exist_ok=True)

# Name of the regression target column expected in model_ready.csv.
TARGET = "moving_time"

# Echo the resolved paths so a wrong working directory is visible immediately.
print("REPO_ROOT:", REPO_ROOT)
print("PATH_MODEL_READY:", PATH_MODEL_READY)

REPO_ROOT: /Users/justuspfeifer/Documents/AML/aml-justus-pfeifer
PATH_MODEL_READY: /Users/justuspfeifer/Documents/AML/aml-justus-pfeifer/data/processed/model_ready.csv


In [2]:
# Abort early with an actionable message when the prepared dataset is missing.
if not PATH_MODEL_READY.exists():
    not_found_message = (
        "[ERROR] model_ready.csv nicht gefunden.\n"
        f"Erwarteter Pfad: {PATH_MODEL_READY}\n"
        "Bitte Notebook 02 ausführen."
    )
    raise FileNotFoundError(not_found_message)

df = pd.read_csv(PATH_MODEL_READY)
print("model_ready geladen:", df.shape)

# Feature columns fed to the HistGBR model.
# `elapsed_time` is intentionally absent (no-leak rule of this notebook).
FEATURES_HGBR = [
    "distance",
    "total_elevation_gain",
    "highest_elevation",
    "lowest_elevation",
    "distance_km",
    "elev_gain_per_km",
    "elev_range",
]

# Verify that every feature and the target exist before slicing the frame.
required_columns = [*FEATURES_HGBR, TARGET]
missing = [column for column in required_columns if column not in df.columns]
if missing:
    raise ValueError(f"[ERROR] Erwartete Spalten fehlen in model_ready.csv: {missing}")

X = df.loc[:, FEATURES_HGBR].copy()
y = df[TARGET].astype(float).copy()

# Hold-out split: 20 % test share with the shared seed, so it can be
# reproduced identically (the original note says: same as notebook 04).
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.20, random_state=SEED
)

print("Train:", X_train.shape, "Hold-out:", X_holdout.shape)

# Export the feature set used by this model.
feat_path = PATH_DERIVED / "05_feature_set_histgbr.csv"
feature_table = pd.DataFrame({"feature": FEATURES_HGBR})
feature_table.to_csv(feat_path, index=False)
print("Saved:", feat_path)

model_ready geladen: (9236, 9)
Train: (7388, 7) Hold-out: (1848, 7)
Saved: /Users/justuspfeifer/Documents/AML/aml-justus-pfeifer/data_derived/05_feature_set_histgbr.csv


## Mini-Tuning auf dem Trainingssplit

Die Hyperparameter werden mit einem kleinen Raster (3 × 3 × 2 = 18 Kombinationen aus `max_depth`, `learning_rate` und `max_iter`) auf dem Trainingssplit optimiert.
Die Auswahl erfolgt über 5-Fold-Cross-Validation (gemischt, fester Seed) auf `X_train/y_train` anhand des mittleren MAE (Sekunden).

Der Hold-out-Datensatz bleibt bis zur finalen Evaluation unangetastet.

In [4]:
from itertools import product

# Shuffled 5-fold splitter with a fixed seed: every grid point is scored on
# exactly the same folds, so the CV means are directly comparable.
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

# Small search grid: 3 x 3 x 2 = 18 configurations.
grid = {
    "max_depth": [4, 6, 8],
    "learning_rate": [0.03, 0.05, 0.08],
    "max_iter": [300, 600],
}

# One result row per configuration (mean / sample std of each metric over the folds).
rows = []
for max_depth, learning_rate, max_iter in product(
    grid["max_depth"], grid["learning_rate"], grid["max_iter"]
):
    params = {
        "max_depth": max_depth,
        "learning_rate": learning_rate,
        "max_iter": max_iter,
        "random_state": SEED,
    }

    fold_mae = []
    fold_rmse = []
    fold_r2 = []

    # Only the training split is used here; the hold-out set stays untouched.
    for train_idx, val_idx in kf.split(X_train):
        X_tr = X_train.iloc[train_idx]
        y_tr = y_train.iloc[train_idx]
        X_va = X_train.iloc[val_idx]
        y_va = y_train.iloc[val_idx]

        m = HistGradientBoostingRegressor(**params)
        m.fit(X_tr, y_tr)
        pred = m.predict(X_va)

        mae = mean_absolute_error(y_va, pred)
        mse = mean_squared_error(y_va, pred)
        # RMSE is derived from MSE by hand rather than requested from scikit-learn.
        rmse = float(np.sqrt(mse))
        r2 = r2_score(y_va, pred)

        fold_mae.append(mae)
        fold_rmse.append(rmse)
        fold_r2.append(r2)

    rows.append({
        "max_depth": max_depth,
        "learning_rate": learning_rate,
        "max_iter": max_iter,
        "mae_cv_mean_s": float(np.mean(fold_mae)),
        "mae_cv_std_s": float(np.std(fold_mae, ddof=1)),
        "rmse_cv_mean_s": float(np.mean(fold_rmse)),
        "rmse_cv_std_s": float(np.std(fold_rmse, ddof=1)),
        "r2_cv_mean": float(np.mean(fold_r2)),
        "r2_cv_std": float(np.std(fold_r2, ddof=1)),
    })

# Best configuration (lowest mean CV MAE) ends up in the first row.
tuning_results = pd.DataFrame(rows).sort_values("mae_cv_mean_s", ascending=True).reset_index(drop=True)
display(tuning_results)

tuning_path = PATH_DERIVED / "05_histgbr_mini_tuning_results.csv"
tuning_results.to_csv(tuning_path, index=False)
print("Saved:", tuning_path)

# Selecting a single row yields a Series with one common dtype, so the integer
# hyperparameters would come back as floats (8.0, 300.0). Cast them back to
# their real types so `best_params` can be passed to the estimator as-is.
best_row = tuning_results.iloc[0]
best_params = {
    "max_depth": int(best_row["max_depth"]),
    "learning_rate": float(best_row["learning_rate"]),
    "max_iter": int(best_row["max_iter"]),
}
print("Best params (by CV MAE on train):", best_params)


Unnamed: 0,max_depth,learning_rate,max_iter,mae_cv_mean_s,mae_cv_std_s,rmse_cv_mean_s,rmse_cv_std_s,r2_cv_mean,r2_cv_std
0,8,0.05,300,639.558926,46.989724,1419.010806,307.407048,0.921658,0.025168
1,8,0.03,600,640.251409,48.638288,1429.242077,316.54567,0.920463,0.026168
2,6,0.05,300,640.937995,51.111273,1420.877677,322.56818,0.92131,0.027034
3,6,0.03,600,640.9656,50.079153,1430.939191,317.025136,0.920259,0.026281
4,8,0.03,300,641.066552,44.247719,1397.481556,308.575771,0.923933,0.025754
5,6,0.03,300,643.423255,48.328687,1403.859508,313.292027,0.923194,0.026374
6,6,0.08,300,643.952718,54.097628,1460.093514,334.493283,0.916808,0.028319
7,8,0.08,300,644.772263,54.451021,1465.46398,331.244305,0.916256,0.028041
8,6,0.05,600,646.350362,57.080555,1487.290823,352.19935,0.913497,0.030775
9,8,0.05,600,647.122884,56.296266,1490.191613,343.0617,0.913244,0.029904


Saved: /Users/justuspfeifer/Documents/AML/aml-justus-pfeifer/data_derived/05_histgbr_mini_tuning_results.csv
Best params (by CV MAE on train): {'max_depth': 8.0, 'learning_rate': 0.05, 'max_iter': 300.0}


In [5]:
# Record which configuration was chosen and how, with the hyperparameters
# cast to their proper Python types.
selected_params = dict(
    model="HistGradientBoostingRegressor",
    selection_method="5-fold CV on train, minimize MAE (seconds)",
    max_depth=int(best_params["max_depth"]),
    learning_rate=float(best_params["learning_rate"]),
    max_iter=int(best_params["max_iter"]),
    random_state=SEED,
)

# Build the one-row table once and reuse it for both export and display.
selected_table = pd.DataFrame([selected_params])

selected_path = PATH_DERIVED / "05_histgbr_selected_params.csv"
selected_table.to_csv(selected_path, index=False)
print("Saved:", selected_path)

display(selected_table)

Saved: /Users/justuspfeifer/Documents/AML/aml-justus-pfeifer/data_derived/05_histgbr_selected_params.csv


Unnamed: 0,model,selection_method,max_depth,learning_rate,max_iter,random_state
0,HistGradientBoostingRegressor,"5-fold CV on train, minimize MAE (seconds)",8,0.05,300,42


## Training mit gewählter Konfiguration und Hold-out Evaluation

Das Modell wird mit der gewählten Konfiguration auf dem gesamten Trainingssplit trainiert.
Anschließend erfolgt die Evaluation auf dem Hold-out Datensatz und der Vergleich zur Baseline aus Notebook 04.

In [6]:
# Final estimator configuration: the tuned hyperparameters, each cast to the
# type the estimator expects, plus the shared seed.
HGBR_PARAMS = {
    name: caster(best_params[name])
    for name, caster in (
        ("max_depth", int),
        ("learning_rate", float),
        ("max_iter", int),
    )
}
HGBR_PARAMS["random_state"] = SEED

# Fit on the full training split (fit() returns the estimator itself),
# then predict the hold-out set.
model = HistGradientBoostingRegressor(**HGBR_PARAMS).fit(X_train, y_train)
y_pred = model.predict(X_holdout)

print("HistGBR trainiert und Hold-out Predictions erzeugt:", y_pred.shape)
print("Params:", HGBR_PARAMS)

HistGBR trainiert und Hold-out Predictions erzeugt: (1848,)
Params: {'max_depth': 8, 'learning_rate': 0.05, 'max_iter': 300, 'random_state': 42}


In [9]:
# Hold-out metrics (kept version-robust: RMSE is taken as sqrt(MSE) by hand).
mae = mean_absolute_error(y_holdout, y_pred)
mse = mean_squared_error(y_holdout, y_pred)
rmse = float(np.sqrt(mse))
r2 = r2_score(y_holdout, y_pred)

# Human-readable model label built from the final hyperparameters.
model_label = (
    f"HistGBR depth={HGBR_PARAMS['max_depth']} "
    f"lr={HGBR_PARAMS['learning_rate']} "
    f"it={HGBR_PARAMS['max_iter']}"
)

# Single-row metrics table; seconds and minutes are both reported.
metrics_row = {
    "block": "Hold-out – HistGBR (FE, train-tuned)",
    "model": model_label,
    "mae_s": float(mae),
    "rmse_s": float(rmse),
    "r2": float(r2),
    "mae_min": float(mae / 60.0),
    "rmse_min": float(rmse / 60.0),
}
metrics_histgbr = pd.DataFrame([metrics_row])

display(metrics_histgbr)

# Export metrics.
metrics_path = PATH_DERIVED / "05_holdout_metrics_histgbr.csv"
metrics_histgbr.to_csv(metrics_path, index=False)
print("Saved:", metrics_path)

# Export predictions: hold-out features plus truth, prediction and absolute error.
# `.values` drops the index of y_holdout so the values are attached by position.
pred_df = X_holdout.assign(
    y_true=y_holdout.values,
    y_pred=y_pred,
    abs_error=lambda frame: (frame["y_true"] - frame["y_pred"]).abs(),
)

pred_path = PATH_DERIVED / "05_holdout_predictions_histgbr.csv"
pred_df.to_csv(pred_path, index=False)
print("Saved:", pred_path)
print("pred_df shape:", pred_df.shape)

# Comparison against the baseline metrics file written under the 04_ prefix.
# A missing file only produces a warning; the rest of the cell still runs.
baseline_path = PATH_DERIVED / "04_holdout_metrics_no_leak.csv"
if baseline_path.exists():
    base = pd.read_csv(baseline_path)
    display(base)

    # The first row of the baseline file is taken as the reference model.
    mae_base = float(base.loc[0, "mae_s"])
    # Positive delta means HistGBR has the lower (better) MAE.
    delta = mae_base - float(mae)

    comparison_row = {
        "baseline_model": str(base.loc[0, "model"]),
        "baseline_mae_s": mae_base,
        "histgbr_model": str(metrics_histgbr.loc[0, "model"]),
        "histgbr_mae_s": float(mae),
        "mae_improvement_s": float(delta),
        "mae_improvement_pct": float(delta / mae_base * 100.0),
    }
    comparison = pd.DataFrame([comparison_row])

    display(comparison)

    comp_path = PATH_DERIVED / "05_holdout_comparison_baseline_vs_histgbr.csv"
    comparison.to_csv(comp_path, index=False)
    print("Saved:", comp_path)
else:
    print("[WARN] Baseline-Metriken nicht gefunden:", baseline_path)

# One-line summary of the hold-out result.
print(f"Hold-out MAE: {mae:.2f}s ({mae/60:.2f} min) | RMSE: {rmse:.2f}s | R2: {r2:.4f}")

Unnamed: 0,block,model,mae_s,rmse_s,r2,mae_min,rmse_min
0,"Hold-out – HistGBR (FE, train-tuned)",HistGBR depth=8 lr=0.05 it=300,583.1639,1412.42698,0.928202,9.719398,23.54045


Saved: /Users/justuspfeifer/Documents/AML/aml-justus-pfeifer/data_derived/05_holdout_metrics_histgbr.csv
Saved: /Users/justuspfeifer/Documents/AML/aml-justus-pfeifer/data_derived/05_holdout_predictions_histgbr.csv
pred_df shape: (1848, 10)


Unnamed: 0,block,model,mae_s,rmse_s,r2,mae_min,rmse_min
0,Hold-out – Baseline (no-leak),"Ridge (degree=1, alpha=0.01)",746.736998,1603.895606,0.907417,12.445617,26.731593


Unnamed: 0,baseline_model,baseline_mae_s,histgbr_model,histgbr_mae_s,mae_improvement_s,mae_improvement_pct
0,"Ridge (degree=1, alpha=0.01)",746.736998,HistGBR depth=8 lr=0.05 it=300,583.1639,163.573098,21.905048


Saved: /Users/justuspfeifer/Documents/AML/aml-justus-pfeifer/data_derived/05_holdout_comparison_baseline_vs_histgbr.csv
Hold-out MAE: 583.16s (9.72 min) | RMSE: 1412.43s | R2: 0.9282
