# FastF1 Classification Baseline

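This notebook loads the Grand Prix feature table (`data/grandprix_features.csv`) and trains a simple logistic regression baseline that predicts `scored_points` from `avg_race_lap_time_s`, `quali_position`, and `prev_points_total`.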

In [1]:
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

In [2]:
# Model inputs and the binary label column read from the feature table.
# NOTE(review): `avg_race_lap_time_s` looks like it is measured during the same
# race whose outcome is being predicted — confirm it is available at prediction
# time, otherwise it leaks the target.
FEATURES = ['avg_race_lap_time_s', 'quali_position', 'prev_points_total']
TARGET = 'scored_points'
data_path = Path('data/grandprix_features.csv')

df = pd.read_csv(data_path)

# Fail early with a readable message if the CSV schema has drifted.
missing_cols = [col for col in FEATURES + [TARGET] if col not in df.columns]
assert not missing_cols, f"Missing expected columns: {missing_cols}"

# Rows without a label cannot be used for supervised training or evaluation.
df = df.dropna(subset=[TARGET]).copy()

# Missing feature values are deliberately left in place. Filling them here with
# medians computed over the whole dataset would let holdout / validation rows
# influence the imputation statistics (preprocessing leakage). The modelling
# pipeline's SimpleImputer(strategy='median') handles them instead, fitted on
# training data only.
df_model = df.copy()

print(f"Rows after cleaning: {len(df_model)}")
df_model.head()

Rows after cleaning: 1359


Unnamed: 0,year,round,event,driver,team,quali_position,avg_race_lap_time_s,finish_position,points_awarded,prev_points_total,scored_points
0,2022,1,Bahrain Grand Prix,LEC,Ferrari,1,97.604208,1.0,26.0,0.0,1
1,2022,1,Bahrain Grand Prix,SAI,Ferrari,3,98.079957,2.0,18.0,0.0,1
2,2022,1,Bahrain Grand Prix,HAM,Mercedes,5,98.266244,3.0,15.0,0.0,1
3,2022,1,Bahrain Grand Prix,RUS,Mercedes,9,98.639022,4.0,12.0,0.0,1
4,2022,1,Bahrain Grand Prix,MAG,Haas F1 Team,7,98.852833,5.0,10.0,0.0,1


In [3]:
# Separate the model inputs from the label column.
X, y = df_model[FEATURES], df_model[TARGET]

# Hold out 20% of the rows for final evaluation; stratifying on the label keeps
# the class proportions the same in both splits, and the fixed seed makes the
# split reproducible.
split_kwargs = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

X_train.shape, X_test.shape

((1087, 3), (272, 3))

In [4]:
# Baseline model: median imputation -> standardisation -> class-balanced
# logistic regression, chained so every step is fitted on training data only.
classifier = LogisticRegression(max_iter=200, class_weight='balanced')
log_reg = make_pipeline(SimpleImputer(strategy='median'), StandardScaler(), classifier)
print("Pipeline:", log_reg)

# 5-fold cross-validated ROC-AUC, computed on the training split only.
cv_auc = cross_val_score(log_reg, X_train, y_train, scoring='roc_auc', cv=5)
print(f'CV ROC-AUC: {cv_auc.mean():.3f} ± {cv_auc.std():.3f}')

# Refit on the whole training split, then score the holdout rows.
log_reg.fit(X_train, y_train)
y_proba = log_reg.predict_proba(X_test)[:, 1]  # second column of the probability matrix
y_pred = log_reg.predict(X_test)

holdout_report = classification_report(y_test, y_pred)
holdout_auc = roc_auc_score(y_test, y_proba)

print('\nHoldout classification report:')
print(holdout_report)
print('Holdout ROC-AUC:', holdout_auc)

Pipeline: Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),
                ('standardscaler', StandardScaler()),
                ('logisticregression',
                 LogisticRegression(class_weight='balanced', max_iter=200))])
CV ROC-AUC: 0.863 ± 0.031

Holdout classification report:
              precision    recall  f1-score   support

           0       0.78      0.83      0.80       136
           1       0.82      0.76      0.79       136

    accuracy                           0.80       272
   macro avg       0.80      0.80      0.80       272
weighted avg       0.80      0.80      0.80       272

Holdout ROC-AUC: 0.8607807093425606
