# F1 Modern-Era Analysis — Season-Level Feature Engineering

This notebook creates structured, race-by-race driver and team features across the modern F1 era.  
These engineered features are designed to support clustering, predictive modeling, and explainability (SHAP).

### Feature engineering objectives
- Construct season-progress features (points, wins, finishes, podiums)
- Build recent-form metrics (last 3–5 races)
- Derive team-level strength indicators
- Detect DNF patterns and consistency metrics
- Organize a unified dataset suitable for machine learning models

The resulting dataset is stored in the `processed/` directory and becomes the input for the win-probability modeling notebook.

In [6]:
from pathlib import Path

# This notebook is inside: F1_analysis/notebooks/
# So project root is the parent folder of the current directory.
PROJECT_ROOT = Path().resolve().parent

DATA_DIR = PROJECT_ROOT / "data"
RAW_DIR = DATA_DIR / "raw"
PROCESSED_DIR = DATA_DIR / "processed"

print("PROJECT_ROOT:", PROJECT_ROOT)
print("DATA_DIR:", DATA_DIR)
print("PROCESSED_DIR:", PROCESSED_DIR)

PROJECT_ROOT: /Users/minseobeom/Desktop/f1-modern-era-prediction
DATA_DIR: /Users/minseobeom/Desktop/f1-modern-era-prediction/data
PROCESSED_DIR: /Users/minseobeom/Desktop/f1-modern-era-prediction/data/processed


## STEP 1 — Load Cleaned Dataset and Filter Modern Era

In [7]:
from pathlib import Path
import pandas as pd
import numpy as np

# STEP 1 — Resolve project root (Notebook is in <repo>/notebooks)
PROJECT_ROOT = Path.cwd().resolve().parent

DATA_DIR = PROJECT_ROOT / "data"
PROCESSED_DIR = DATA_DIR / "processed"
clean_path = PROCESSED_DIR / "f1_race_results_clean.csv"

print("CWD:", Path.cwd().resolve())
print("PROJECT_ROOT:", PROJECT_ROOT)
print("CSV path:", clean_path)
print("CSV exists:", clean_path.exists())

if not clean_path.exists():
    raise FileNotFoundError(f"Missing file: {clean_path}")

df = pd.read_csv(clean_path, parse_dates=["race_date"])
print("Loaded dataset shape:", df.shape)

df.head()

CWD: /Users/minseobeom/Desktop/f1-modern-era-prediction/notebooks
PROJECT_ROOT: /Users/minseobeom/Desktop/f1-modern-era-prediction
CSV path: /Users/minseobeom/Desktop/f1-modern-era-prediction/data/processed/f1_race_results_clean.csv
CSV exists: True
Loaded dataset shape: (27218, 13)


Unnamed: 0,season,round,grand_prix,race_date,driver,team,grid,finish_position_raw,finish_position_num,is_dnf,points,laps,status
0,1950,1,British Grand Prix,1950-05-13,\N,Alfa Romeo,1,1,1.0,False,9.0,70,Finished
1,1950,1,British Grand Prix,1950-05-13,\N,Alfa Romeo,2,2,2.0,False,6.0,70,Finished
2,1950,1,British Grand Prix,1950-05-13,\N,Alfa Romeo,4,3,3.0,False,4.0,70,Finished
3,1950,1,British Grand Prix,1950-05-13,\N,Talbot-Lago,6,4,,True,3.0,68,+2 Laps
4,1950,1,British Grand Prix,1950-05-13,\N,Talbot-Lago,9,5,,True,2.0,68,+2 Laps


In [8]:
# Define modern era as last 10 seasons in the dataset
latest_season = df["season"].max()
last_n_seasons = 10
start_season = latest_season - last_n_seasons + 1

print("Latest season:", latest_season)
print(f"Modern era: {start_season}–{latest_season}")

modern_df = df[df["season"].between(start_season, latest_season)].copy()
modern_df = modern_df.sort_values(["season", "race_date", "round", "driver"]).reset_index(drop=True)

print("Modern-era subset shape:", modern_df.shape)
modern_df.head()

Latest season: 2025
Modern era: 2016–2025
Modern-era subset shape: (4300, 13)


Unnamed: 0,season,round,grand_prix,race_date,driver,team,grid,finish_position_raw,finish_position_num,is_dnf,points,laps,status
0,2016,1,Australian Grand Prix,2016-03-20,ALO,McLaren,11,\N,,True,0.0,16,Collision
1,2016,1,Australian Grand Prix,2016-03-20,BOT,Williams,16,8,8.0,False,4.0,57,Finished
2,2016,1,Australian Grand Prix,2016-03-20,BUT,McLaren,12,14,,True,0.0,56,+1 Lap
3,2016,1,Australian Grand Prix,2016-03-20,ERI,Sauber,15,\N,,True,0.0,38,Engine
4,2016,1,Australian Grand Prix,2016-03-20,GRO,Haas F1 Team,19,6,6.0,False,8.0,57,Finished


## STEP 2 — Create Race Order per Season and Driver

In [9]:
# Ensure there is a consistent race ordering within each season
modern_df["race_order"] = (
    modern_df
    .sort_values(["season", "race_date", "round"])
    .groupby("season")
    .cumcount() + 1
)

# Race order per driver (relative index of races within a season for each driver)
modern_df["driver_race_index"] = (
    modern_df
    .sort_values(["season", "race_date", "round"])
    .groupby(["season", "driver"])
    .cumcount() + 1
)

modern_df[["season", "round", "grand_prix", "driver", "race_order", "driver_race_index"]].head(20)

Unnamed: 0,season,round,grand_prix,driver,race_order,driver_race_index
0,2016,1,Australian Grand Prix,ALO,1,1
1,2016,1,Australian Grand Prix,BOT,2,1
2,2016,1,Australian Grand Prix,BUT,3,1
3,2016,1,Australian Grand Prix,ERI,4,1
4,2016,1,Australian Grand Prix,GRO,5,1
5,2016,1,Australian Grand Prix,GUT,6,1
6,2016,1,Australian Grand Prix,HAM,7,1
7,2016,1,Australian Grand Prix,HAR,8,1
8,2016,1,Australian Grand Prix,HUL,9,1
9,2016,1,Australian Grand Prix,KVY,10,1


## STEP 3 — Driver Season-to-Date Features (Before Each Race)

In [10]:
def add_driver_season_features(df_in: pd.DataFrame) -> pd.DataFrame:
    df = df_in.sort_values(["season", "race_date", "round", "driver"]).copy()

    # 1) win / podium flag
    df["is_win"] = (df["finish_position_num"] == 1).astype(int)
    df["is_podium"] = df["finish_position_num"].between(1, 3, inclusive="both").astype(int)

    # 2) group by season + driver
    grp = df.groupby(["season", "driver"], sort=False)

    # 3) season cumulative stats BEFORE current race (no leakage)
    df["season_points_before"] = grp["points"].transform(
        lambda s: s.cumsum().shift(1).fillna(0)
    )
    df["season_wins_before"] = grp["is_win"].transform(
        lambda s: s.cumsum().shift(1).fillna(0)
    )
    df["season_podiums_before"] = grp["is_podium"].transform(
        lambda s: s.cumsum().shift(1).fillna(0)
    )

    # 4) average finish BEFORE current race
    def avg_before(s: pd.Series) -> pd.Series:
        csum = s.cumsum()
        cnt = (~s.isna()).cumsum()
        return (csum / cnt).shift(1)

    df["avg_finish_before"] = grp["finish_position_num"].transform(avg_before)

    # 5) DNF rate BEFORE current race
    df["dnf_cum_before"] = grp["is_dnf"].transform(
        lambda s: s.cumsum().shift(1)
    )
    df["races_before"] = grp.cumcount()  # 0,1,2,... 

    df["dnf_rate_before"] = df["dnf_cum_before"] / df["races_before"].replace(0, np.nan)

    return df


# 다시 실행
driver_features_df = add_driver_season_features(modern_df)

driver_features_df[
    [
        "season", "round", "grand_prix", "driver",
        "season_points_before", "season_wins_before",
        "season_podiums_before", "avg_finish_before",
        "dnf_rate_before"
    ]
].head(20)

Unnamed: 0,season,round,grand_prix,driver,season_points_before,season_wins_before,season_podiums_before,avg_finish_before,dnf_rate_before
0,2016,1,Australian Grand Prix,ALO,0.0,0.0,0.0,,
1,2016,1,Australian Grand Prix,BOT,0.0,0.0,0.0,,
2,2016,1,Australian Grand Prix,BUT,0.0,0.0,0.0,,
3,2016,1,Australian Grand Prix,ERI,0.0,0.0,0.0,,
4,2016,1,Australian Grand Prix,GRO,0.0,0.0,0.0,,
5,2016,1,Australian Grand Prix,GUT,0.0,0.0,0.0,,
6,2016,1,Australian Grand Prix,HAM,0.0,0.0,0.0,,
7,2016,1,Australian Grand Prix,HAR,0.0,0.0,0.0,,
8,2016,1,Australian Grand Prix,HUL,0.0,0.0,0.0,,
9,2016,1,Australian Grand Prix,KVY,0.0,0.0,0.0,,


## STEP 4 — Driver Recent-Form Rolling Features


In [11]:
def add_driver_recent_form_features(df_in: pd.DataFrame, windows=(3, 5)) -> pd.DataFrame:
    df = df_in.sort_values(["season", "race_date", "round", "driver"]).copy()

    grp = df.groupby(["season", "driver"], sort=False, group_keys=False)

    for w in windows:
        # Rolling sum of points over last w races BEFORE current race
        df[f"points_last_{w}"] = grp["points"].apply(
            lambda s: s.shift(1).rolling(window=w, min_periods=1).sum()
        )

        # Rolling mean of finish position over last w races BEFORE current race
        df[f"avg_finish_last_{w}"] = grp["finish_position_num"].apply(
            lambda s: s.shift(1).rolling(window=w, min_periods=1).mean()
        )

    # Simple recent-form score (example): higher points, lower avg finish is better
    df["recent_form_score"] = (
        df["points_last_3"].fillna(0)
        - df["avg_finish_last_3"].fillna(df["avg_finish_last_3"].max())
    )

    return df


driver_features_df = add_driver_recent_form_features(driver_features_df, windows=(3, 5))

driver_features_df[
    [
        "season", "round", "grand_prix", "driver",
        "points_last_3", "avg_finish_last_3",
        "points_last_5", "avg_finish_last_5",
        "recent_form_score",
    ]
].head(20)

Unnamed: 0,season,round,grand_prix,driver,points_last_3,avg_finish_last_3,points_last_5,avg_finish_last_5,recent_form_score
0,2016,1,Australian Grand Prix,ALO,,,,,-20.0
1,2016,1,Australian Grand Prix,BOT,,,,,-20.0
2,2016,1,Australian Grand Prix,BUT,,,,,-20.0
3,2016,1,Australian Grand Prix,ERI,,,,,-20.0
4,2016,1,Australian Grand Prix,GRO,,,,,-20.0
5,2016,1,Australian Grand Prix,GUT,,,,,-20.0
6,2016,1,Australian Grand Prix,HAM,,,,,-20.0
7,2016,1,Australian Grand Prix,HAR,,,,,-20.0
8,2016,1,Australian Grand Prix,HUL,,,,,-20.0
9,2016,1,Australian Grand Prix,KVY,,,,,-20.0


## STEP 5 — Team-Level Season and Recent Features

In [12]:
def add_team_features(df_in: pd.DataFrame, windows=(3, 5)) -> pd.DataFrame:
    df = df_in.sort_values(["season", "race_date", "round", "team", "driver"]).copy()

    grp_team = df.groupby(["season", "team"], sort=False, group_keys=False)

    # Team cumulative points before current race
    df["team_points_before"] = grp_team["points"].transform(
        lambda s: s.cumsum().shift(1).fillna(0)
    )

    # Team average finish before current race (classified finishes only)
    def team_avg_before(s: pd.Series) -> pd.Series:
        csum = s.cumsum()
        cnt = (~s.isna()).cumsum()
        return (csum / cnt).shift(1)

    df["team_avg_finish_before"] = grp_team["finish_position_num"].transform(team_avg_before)

    # Rolling team performance over last w races (using all team entries)
    for w in windows:
        df[f"team_points_last_{w}"] = grp_team["points"].apply(
            lambda s: s.shift(1).rolling(window=w, min_periods=1).sum()
        )

        df[f"team_avg_finish_last_{w}"] = grp_team["finish_position_num"].apply(
            lambda s: s.shift(1).rolling(window=w, min_periods=1).mean()
        )

    # Simple team strength index based on recent 5 races
    df["team_strength_index"] = (
        df["team_points_last_5"].fillna(0)
        - df["team_avg_finish_last_5"].fillna(df["team_avg_finish_last_5"].max())
    )

    return df


full_features_df = add_team_features(driver_features_df, windows=(3, 5))

full_features_df[
    [
        "season", "round", "grand_prix", "driver", "team",
        "team_points_before", "team_avg_finish_before",
        "team_points_last_3", "team_avg_finish_last_3",
        "team_points_last_5", "team_avg_finish_last_5",
        "team_strength_index",
    ]
].head(20)

Unnamed: 0,season,round,grand_prix,driver,team,team_points_before,team_avg_finish_before,team_points_last_3,team_avg_finish_last_3,team_points_last_5,team_avg_finish_last_5,team_strength_index
15,2016,1,Australian Grand Prix,RAI,Ferrari,0.0,,,,,,-19.0
20,2016,1,Australian Grand Prix,VET,Ferrari,0.0,,0.0,,0.0,,-19.0
8,2016,1,Australian Grand Prix,HUL,Force India,0.0,,,,,,-19.0
14,2016,1,Australian Grand Prix,PER,Force India,6.0,7.0,6.0,7.0,6.0,7.0,-1.0
4,2016,1,Australian Grand Prix,GRO,Haas F1 Team,0.0,,,,,,-19.0
5,2016,1,Australian Grand Prix,GUT,Haas F1 Team,8.0,6.0,8.0,6.0,8.0,6.0,2.0
7,2016,1,Australian Grand Prix,HAR,Manor Marussia,0.0,,,,,,-19.0
21,2016,1,Australian Grand Prix,WEH,Manor Marussia,0.0,,0.0,,0.0,,-19.0
0,2016,1,Australian Grand Prix,ALO,McLaren,0.0,,,,,,-19.0
2,2016,1,Australian Grand Prix,BUT,McLaren,0.0,,0.0,,0.0,,-19.0


## STEP 6 — Build Modeling Dataset and Save

In [13]:
# Select essential columns for modeling
model_cols = [
    # race meta
    "season", "round", "race_date", "grand_prix",
    "driver", "team", "grid",
    "finish_position_num", "is_dnf", "points",

    # driver season-to-date
    "season_points_before", "season_wins_before",
    "season_podiums_before", "avg_finish_before",
    "dnf_rate_before",

    # driver recent form
    "points_last_3", "avg_finish_last_3",
    "points_last_5", "avg_finish_last_5",
    "recent_form_score",

    # team features
    "team_points_before", "team_avg_finish_before",
    "team_points_last_3", "team_avg_finish_last_3",
    "team_points_last_5", "team_avg_finish_last_5",
    "team_strength_index",
]

model_df = full_features_df[model_cols].sort_values(
    ["season", "race_date", "round", "driver"]
).reset_index(drop=True)

print("Modeling dataset shape:", model_df.shape)
model_df.head()

Modeling dataset shape: (4300, 27)


Unnamed: 0,season,round,race_date,grand_prix,driver,team,grid,finish_position_num,is_dnf,points,...,points_last_5,avg_finish_last_5,recent_form_score,team_points_before,team_avg_finish_before,team_points_last_3,team_avg_finish_last_3,team_points_last_5,team_avg_finish_last_5,team_strength_index
0,2016,1,2016-03-20,Australian Grand Prix,ALO,McLaren,11,,True,0.0,...,,,-20.0,0.0,,,,,,-19.0
1,2016,1,2016-03-20,Australian Grand Prix,BOT,Williams,16,8.0,False,4.0,...,,,-20.0,0.0,,,,,,-19.0
2,2016,1,2016-03-20,Australian Grand Prix,BUT,McLaren,12,,True,0.0,...,,,-20.0,0.0,,0.0,,0.0,,-19.0
3,2016,1,2016-03-20,Australian Grand Prix,ERI,Sauber,15,,True,0.0,...,,,-20.0,0.0,,,,,,-19.0
4,2016,1,2016-03-20,Australian Grand Prix,GRO,Haas F1 Team,19,6.0,False,8.0,...,,,-20.0,0.0,,,,,,-19.0


In [14]:
# Save modeling dataset for Notebook 03
features_path = PROCESSED_DIR / "f1_features_modern_10seasons.csv"
model_df.to_csv(features_path, index=False)

print("Saved modeling dataset to:", features_path)

Saved modeling dataset to: /Users/minseobeom/Desktop/f1-modern-era-prediction/data/processed/f1_features_modern_10seasons.csv
