# Gradient Boosting Model

This notebook trains a `HistGradientBoostingClassifier` baseline on the Grand Prix dataset and evaluates it with 5-fold cross-validation on the training split and a stratified 20% holdout set.


In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    average_precision_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import (
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import make_pipeline


In [None]:
# --- Notebook configuration ---

# Location of the feature table that the loading cell reads with pandas.
DATA_PATH = Path('data/grandprix_features.csv')

# Columns selected from the CSV as model inputs. Kept as a list so that
# ``df[FEATURES]`` performs multi-column selection.
FEATURES = [
    'year',
    'round',
    'avg_race_lap_time_s',
    'quali_position',
    'prev_points_total',
]

# Label column; it is cast to int before being used as the target.
TARGET = 'scored_points'

# Seed shared by the train/test split and the classifier.
RANDOM_STATE = 42


In [None]:
# Read the feature table, then report its row count and first rows as a quick
# sanity check.
df = pd.read_csv(DATA_PATH)
print(f'Rows: {df.shape[0]}')
print(df.head())

# Label (cast to int) and the model input columns.
y = df[TARGET].astype(int)
X = df[FEATURES]

# Hold out 20% of the rows, stratified on the label so both splits keep the
# same class balance; the seed makes the split reproducible.
split_kwargs = dict(test_size=0.2, stratify=y, random_state=RANDOM_STATE)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)
print('Train/Test shapes:', X_train.shape, X_test.shape)


In [None]:
# Model definition: median imputation followed by a histogram-based
# gradient-boosting classifier.
# NOTE(review): per the scikit-learn docs HistGradientBoostingClassifier
# accepts NaN inputs natively, so the imputer may be optional — confirm the
# effect on the scores before removing it.
booster_params = {
    'loss': 'log_loss',
    'learning_rate': 0.05,
    'max_depth': 5,
    'max_iter': 400,
    'random_state': RANDOM_STATE,
}
hgb = make_pipeline(
    SimpleImputer(strategy='median'),
    HistGradientBoostingClassifier(**booster_params),
)
print('Pipeline:', hgb)

# Cross-validation
# Score both metrics in a single 5-fold pass: cross_validate fits the pipeline
# once per fold and applies every requested scorer to that fit, rather than
# refitting the whole pipeline separately for each metric.
cv_results = cross_validate(
    hgb,
    X_train,
    y_train,
    cv=5,
    scoring=('roc_auc', 'average_precision'),
)
# Per-fold score arrays, one entry per fold, for each metric.
cv_auc = cv_results['test_roc_auc']
cv_pr = cv_results['test_average_precision']
print(f'CV ROC-AUC: {cv_auc.mean():.3f} ± {cv_auc.std():.3f}')
print(f'CV PR-AUC (avg precision): {cv_pr.mean():.3f} ± {cv_pr.std():.3f}')

# Fit on training data
hgb.fit(X_train, y_train)

# Hard class predictions feed the thresholded metrics (report, precision,
# recall, F1); column 1 of predict_proba feeds the ranking metrics (ROC-AUC,
# average precision).
y_pred = hgb.predict(X_test)
y_proba = hgb.predict_proba(X_test)[:, 1]

# Holdout evaluation
# The leading '\n' escapes print a blank separator line before each heading;
# they must stay escapes, since a single-quoted literal cannot span lines.
print('\nHoldout classification report:')
print(classification_report(y_test, y_pred))
print('Holdout ROC-AUC:', roc_auc_score(y_test, y_proba))
print('Holdout PR-AUC (avg precision):', average_precision_score(y_test, y_proba))
print('\nAdditional metrics:')
print('Precision:', precision_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('F1 (binary):', f1_score(y_test, y_pred))
print('F1 (macro):', f1_score(y_test, y_pred, average='macro'))

# Confusion matrix
# Rows are actual classes and columns are predicted classes, computed on the
# holdout predictions.
cm = confusion_matrix(y_test, y_pred)
# '\n' is written as an escape: a single-quoted literal cannot span lines.
print('Confusion matrix (raw):\n', cm)

# Wrap the raw array in a labelled DataFrame; labels are the positional
# row/column indices of the matrix.
cm_df = pd.DataFrame(
    cm,
    index=[f'Actual {i}' for i in range(cm.shape[0])],
    columns=[f'Pred {i}' for i in range(cm.shape[1])],
)
print('\nConfusion matrix as table:')
display(cm_df)

# Heatmap of the same table; fmt='d' renders the cell counts as integers.
plt.figure(figsize=(6, 4))
sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.tight_layout()
plt.show()
