# LightGBM-Regression: Rennzeit

Baseline mit One-Hot + Imputer + StandardScaler. Ziel: `race_time` (Sekunden).

In [6]:

from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

from lightgbm import LGBMRegressor

In [None]:

DATA_PATH = Path("../../data/regression/grandprix_features_all.csv")
CAT_COLS = ['driver_id', 'constructor_id', 'circuit_id']
NUM_COLS = ['year', 'round_number', 'grid_position', 'quali_delta', 'quali_tm_delta', 'season_pts_driver', 'season_pts_team', 'last_3_avg', 'is_street_circuit', 'is_wet']
TARGET = "race_time"

df = pd.read_csv(DATA_PATH)
df = df[df["race_time"].notna() & (df["race_time"] > 0)].copy()
print(df.shape)

LOG_TARGET = "race_time_log"
df[LOG_TARGET] = np.log1p(df[TARGET])


(2829, 15)


In [None]:
has_laps = "laps" in df.columns
if has_laps:
    df["race_time_per_lap"] = df["race_time"] / df["laps"].replace(0, pd.NA)
    med_per_year = df.groupby("year")["race_time_per_lap"].transform("median")
    df["race_time_norm"] = df["race_time_per_lap"] / med_per_year
else:
    med_per_year = df.groupby("year")["race_time"].transform("median")
    df["race_time_norm"] = df["race_time"] / med_per_year

LOG_TARGET = "race_time_norm_log"
df = df[df["race_time_norm"] > 0].copy()
df[LOG_TARGET] = np.log(df["race_time_norm"])


In [None]:
train_df = df[df["year"] <= 2022]
val_df = df[df["year"] == 2023]
test_df = df[df["year"] == 2024]

X_train = train_df[CAT_COLS + NUM_COLS]
y_train = train_df[LOG_TARGET]

X_val = val_df[CAT_COLS + NUM_COLS]
y_val = val_df[LOG_TARGET]
y_val_true = val_df["race_time"]

X_test = test_df[CAT_COLS + NUM_COLS]
y_test = test_df[LOG_TARGET]
y_test_true = test_df["race_time"]


In [10]:

preprocess = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), CAT_COLS),
    (
        "num",
        Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ]),
        NUM_COLS,
    ),
])

model = LGBMRegressor(
    objective="regression",
    learning_rate=0.05,
    n_estimators=2000,
    num_leaves=63,
    max_depth=-1,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
    n_jobs=-1,
)

pipe = Pipeline([
    ("preprocess", preprocess),
    ("model", model),
])

pipe.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.052431 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1292
[LightGBM] [Info] Number of data points in the train set: 1612, number of used features: 79
[LightGBM] [Info] Start training from score -0.008648


0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,boosting_type,'gbdt'
,num_leaves,63
,max_depth,-1
,learning_rate,0.05
,n_estimators,2000
,subsample_for_bin,200000
,objective,'regression'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [None]:
from math import sqrt

def metrics(y_true, y_pred):
    y_true = pd.Series(y_true).reset_index(drop=True)
    y_pred = pd.Series(y_pred).reset_index(drop=True)
    mask = y_true != 0
    y_true = y_true[mask].reset_index(drop=True)
    y_pred = y_pred[mask].reset_index(drop=True)
    mae = mean_absolute_error(y_true, y_pred)
    rmse = sqrt(mean_squared_error(y_true, y_pred))
    mape = (np.abs((y_true - y_pred) / y_true)).median() * 100
    return mae, rmse, mape

for split, X, base_df, y_true in [
    ("val", X_val, val_df, y_val_true),
    ("test", X_test, test_df, y_test_true),
]:
    pred_log = pipe.predict(X)
    pred_ratio = np.exp(pred_log)
    if "laps" in base_df.columns:
        med_per_year = base_df.groupby("year")["race_time_per_lap"].transform("median")
        per_lap_pred = pred_ratio * med_per_year
        preds = per_lap_pred * base_df["laps"].reset_index(drop=True)
    else:
        med_per_year = base_df.groupby("year")["race_time"].transform("median")
        preds = pred_ratio * med_per_year
    mae, rmse, mape = metrics(y_true, preds)
    print(f"{split}: MAE={mae:.2f}s RMSE={rmse:.2f}s MAPE~{mape:.2f}%")


val: MAE=850.11s RMSE=1498.58s MAPE~7.88%
test: MAE=634.58s RMSE=939.44s MAPE~6.96%


