# XGBoost-Regression: Rennzeit

Hinweis: Das Paket `xgboost` muss installiert sein.

In [None]:

from pathlib import Path
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
from math import sqrt
from xgboost import XGBRegressor

In [None]:

# Input file, feature column groups, and the regression target column.
DATA_PATH = Path("data/regression/grandprix_features_all.csv")
CAT_COLS = ["driver_id", "constructor_id", "circuit_id"]
NUM_COLS = [
    "year",
    "round_number",
    "grid_position",
    "quali_delta",
    "quali_tm_delta",
    "season_pts_driver",
    "season_pts_team",
    "last_3_avg",
    "is_street_circuit",
    "is_wet",
]
TARGET = "race_time"

# Load the table and discard rows that have no target value.
df = pd.read_csv(DATA_PATH)
df = df.dropna(subset=[TARGET]).copy()

# Split by the "year" column: up to 2022 for training, 2023 for validation,
# 2024 for testing.
train_df = df.loc[df["year"] <= 2022]
val_df = df.loc[df["year"] == 2023]
test_df = df.loc[df["year"] == 2024]

# Separate the model inputs (categorical + numeric columns) from the target.
X_train = train_df[CAT_COLS + NUM_COLS]
y_train = train_df[TARGET]
X_val = val_df[CAT_COLS + NUM_COLS]
y_val = val_df[TARGET]
X_test = test_df[CAT_COLS + NUM_COLS]
y_test = test_df[TARGET]

In [None]:

# Numeric columns: fill missing values with the median, then standardize.
numeric_steps = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: one-hot encode; categories not seen during fit are
# ignored at transform time instead of raising.
categorical_encoder = OneHotEncoder(handle_unknown="ignore")

preprocess = ColumnTransformer(
    transformers=[
        ("cat", categorical_encoder, CAT_COLS),
        ("num", numeric_steps, NUM_COLS),
    ]
)

# Hyper-parameters of the gradient-boosted regressor (squared-error loss).
xgb_params = {
    "objective": "reg:squarederror",
    "learning_rate": 0.05,
    "n_estimators": 1200,
    "max_depth": 8,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "min_child_weight": 1,
    "random_state": 42,
    "n_jobs": -1,
}
xgb = XGBRegressor(**xgb_params)

# End-to-end estimator: preprocessing followed by the regressor.
pipe = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("model", xgb),
    ]
)

# Fit the preprocessor on the training split before encoding the validation
# split. The arguments of ``pipe.fit`` are evaluated before the pipeline fits
# any step, so calling ``preprocess.transform`` inline on the still-unfitted
# transformer raises NotFittedError.
preprocess.fit(X_train)
X_val_encoded = preprocess.transform(X_val)

# ``pipe.fit`` refits the same preprocessor on the same training data, so the
# encoded validation matrix matches the features the model is trained on.
pipe.fit(
    X_train,
    y_train,
    model__eval_set=[(X_val_encoded, y_val)],
    model__verbose=False,
)

In [None]:

# Report error metrics of the fitted pipeline on the validation and test splits.
EVAL_SPLITS = (
    ("val", X_val, y_val),
    ("test", X_test, y_test),
)

for split, X, y in EVAL_SPLITS:
    preds = pipe.predict(X)

    mae = mean_absolute_error(y, preds)
    rmse = sqrt(mean_squared_error(y, preds))

    # Absolute relative error per row; infinite ratios (zero target) are
    # turned into NaN so the median ignores them. Note that the value printed
    # as "MAPE~" is the median, not the mean, of these percentages.
    rel_err = np.abs((y - preds) / y)
    rel_err = rel_err.replace([np.inf, -np.inf], np.nan)
    mape = rel_err.median() * 100

    print(f"{split}: MAE={mae:.2f}s RMSE={rmse:.2f}s MAPE~{mape:.2f}%")