# FastF1 Classification Baseline

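This notebook loads the Grand Prix feature table (`data/grandprix_features.csv`), trains a simple logistic regression baseline to predict `points_scored`, and evaluates it on a stratified 20% hold-out split (precision, recall, F1, balanced accuracy, and a confusion matrix).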

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
)
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [None]:
# Column schema for the baseline: categorical identifiers are one-hot encoded
# downstream, numeric features are imputed and scaled downstream.
CAT_COLS = [
    'driver_id',
    'constructor_id',
    'circuit_id',
]
NUM_COLS = [
    'grid_position',
    'quali_delta',
    'quali_tm_delta',
    'season_pts_driver',
    'season_pts_team',
    'last_3_avg',
    'is_street_circuit',
    'is_wet',
]
TARGET = 'points_scored'
data_path = Path('data/grandprix_features.csv')

df_model = pd.read_csv(data_path)

# Categorical gaps become an explicit 'unknown' level. The cast to str keeps
# every categorical column uniformly typed: OneHotEncoder rejects columns that
# mix strings and numbers, which is what fillna('unknown') would produce on a
# numeric-typed ID column.
df_model[CAT_COLS] = df_model[CAT_COLS].fillna('unknown').astype(str)

# Numeric gaps are deliberately left as NaN. The modelling pipeline already
# contains SimpleImputer(strategy='median'); pre-filling with 0 here would turn
# that step into a no-op and inject 0 as if it were a real measurement.
# The imputer is fitted on the training split only, so no test-set statistics
# leak into the fill values.

# NOTE(review): rows with a missing target are labelled 0 rather than dropped --
# confirm that "missing" really means "did not score".
# NOTE(review): the metrics further down use binary averaging, so TARGET is
# presumably a 0/1 flag -- confirm against the feature-building code.
df_model[TARGET] = df_model[TARGET].fillna(0).astype(int)

print(f"Rows after cleaning: {len(df_model)}")
df_model.head()


In [None]:
# Assemble the design matrix (categoricals first, then numerics) and the label.
feature_cols = CAT_COLS + NUM_COLS
X = df_model[feature_cols]
y = df_model[TARGET]

# Categorical branch: one-hot encode; levels unseen at fit time encode as all zeros.
categorical_encoder = OneHotEncoder(handle_unknown='ignore')
# Numeric branch: median-impute any remaining gaps, then standardise.
numeric_steps = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())

preprocess = ColumnTransformer(
    transformers=[
        ('cat', categorical_encoder, CAT_COLS),
        ('num', numeric_steps, NUM_COLS),
    ],
)

# Stratified 80/20 hold-out with a fixed seed so the split is reproducible.
# NOTE(review): this is a random split; if rows are ordered by season/race,
# a time-based split may be more appropriate -- confirm.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

X_train.shape, X_test.shape


In [None]:
# Logistic regression baseline (one-hot + scaler)
# class_weight='balanced' reweights the classes inversely to their frequency in
# the training labels. n_jobs is intentionally not set: for LogisticRegression
# it only ever parallelised one-vs-rest fits across classes, so it does nothing
# for a binary target and is deprecated in recent scikit-learn releases.
log_reg = make_pipeline(
    preprocess,
    LogisticRegression(max_iter=300, class_weight='balanced'),
)

# Fit on training data; the preprocessing steps are fitted on X_train only.
log_reg.fit(X_train, y_train)

# Hard labels drive every metric printed below.
y_pred = log_reg.predict(X_test)
# Probability of the second class in log_reg.classes_ (the positive class for a
# 0/1 target). Not used by the metrics in this cell.
y_proba = log_reg.predict_proba(X_test)[:, 1]

print('Precision:', precision_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('F1 (binary):', f1_score(y_test, y_pred))
print('F1 (macro):', f1_score(y_test, y_pred, average="macro"))
print('Balanced accuracy:', balanced_accuracy_score(y_test, y_pred))


In [None]:
# Confusion matrix for the held-out test predictions.
# Rows are actual classes and columns are predicted classes, in sorted label order.
# `sns` and `plt` come from the notebook's top import cell so that all
# dependencies are declared in one place.
cm = confusion_matrix(y_test, y_pred)

# Explicit figure/axes instead of the implicit pyplot state, so the heatmap and
# its labels are guaranteed to land on the same axes.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_title("Confusion Matrix (Test)")
ax.set_ylabel("Actual")
ax.set_xlabel("Predicted")
plt.show()
