# F1 Race Summaries (2022–2024)

Self-contained notebook that loads raw Ergast CSV exports, engineers race-level features, and generates narrative summaries plus highlight links for recent seasons.


## Setup


In [1]:
from pathlib import Path
from functools import reduce
from urllib.parse import quote_plus

import pandas as pd

# Allow up to 60 columns when frames are rendered in cell output.
pd.options.display.max_columns = 60

# The project root is taken to be the parent of the current working directory;
# raw CSV exports are read from its `data` sub-folder.
PROJECT_ROOT = Path.cwd().resolve().parent
DATA_DIR = PROJECT_ROOT.joinpath('data')
PROJECT_ROOT, DATA_DIR


(PosixPath('/Users/maccarlton/FunProjects/f1'),
 PosixPath('/Users/maccarlton/FunProjects/f1/data'))

## Helpers


In [3]:
def load_csv(name: str, *, parse_dates=None, **kwargs) -> pd.DataFrame:
    """Read one CSV from DATA_DIR and report its size.

    Args:
        name: File name relative to DATA_DIR.
        parse_dates: Forwarded to `pd.read_csv` as `parse_dates`.
        **kwargs: Any further keyword arguments for `pd.read_csv`.

    Returns:
        The loaded DataFrame. A one-line "Loaded ..." message with the row and
        column counts is printed as a side effect.
    """
    frame = pd.read_csv(DATA_DIR / name, parse_dates=parse_dates, **kwargs)
    n_rows, n_cols = frame.shape
    print(f'Loaded {name}: {n_rows:,} rows x {n_cols} cols')
    return frame


## Load Core Tables


In [4]:
# Races get their date column parsed and two columns renamed up front;
# every other table is loaded exactly as exported.
races = load_csv('races.csv', parse_dates=['date'])
races = races.rename(columns={'year': 'season', 'name': 'race_name'})

results, pit_stops, lap_times, drivers, constructors, status = (
    load_csv(filename)
    for filename in (
        'results.csv',
        'pit_stops.csv',
        'lap_times.csv',
        'drivers.csv',
        'constructors.csv',
        'status.csv',
    )
)

races.loc[:, ['season', 'round', 'race_name']].head()


Loaded races.csv: 1,125 rows x 18 cols
Loaded results.csv: 26,759 rows x 18 cols
Loaded pit_stops.csv: 11,371 rows x 7 cols
Loaded lap_times.csv: 589,081 rows x 6 cols
Loaded drivers.csv: 861 rows x 9 cols
Loaded constructors.csv: 212 rows x 5 cols
Loaded status.csv: 139 rows x 2 cols


Unnamed: 0,season,round,race_name
0,2009,1,Australian Grand Prix
1,2009,2,Malaysian Grand Prix
2,2009,3,Chinese Grand Prix
3,2009,4,Bahrain Grand Prix
4,2009,5,Spanish Grand Prix


## Scope: 2022–2024 Seasons


In [5]:
TARGET_SEASONS = [2022, 2023, 2024]
races_scope = races.loc[races['season'].isin(TARGET_SEASONS)].copy()
race_ids = set(races_scope['raceId'])


def _restrict_to_scope(frame):
    """Return a copy of `frame` limited to rows whose raceId is in `race_ids`."""
    return frame.loc[frame['raceId'].isin(race_ids)].copy()


results_scope = _restrict_to_scope(results)
pit_scope = _restrict_to_scope(pit_stops)
lap_scope = _restrict_to_scope(lap_times)

print(f'Seasons: {TARGET_SEASONS}')
print(f'Races: {len(race_ids)} | Results rows: {len(results_scope):,} | Pit rows: {len(pit_scope):,}')
races_scope[['season', 'race_name']].drop_duplicates().tail()


Seasons: [2022, 2023, 2024]
Races: 68 | Results rows: 1,359 | Pit rows: 2,542


Unnamed: 0,season,race_name
1120,2024,Mexico City Grand Prix
1121,2024,São Paulo Grand Prix
1122,2024,Las Vegas Grand Prix
1123,2024,Qatar Grand Prix
1124,2024,Abu Dhabi Grand Prix


## Enrich Results with Labels


In [6]:
# Add readable label columns to the lookup tables before joining them in.
drivers = drivers.assign(driver_name=lambda frame: frame['forename'] + ' ' + frame['surname'])
constructors = constructors.rename(
    columns={'name': 'constructor_name', 'nationality': 'constructor_nationality'}
)
status = status.rename(columns={'status': 'status_label'})

race_meta = races_scope.loc[:, ['raceId', 'season', 'round', 'race_name', 'date']].copy()

# Each (lookup, key) pair is left-joined onto the scoped results in order,
# so every results row is kept even when a lookup has no match.
label_lookups = [
    (race_meta, 'raceId'),
    (drivers[['driverId', 'code', 'driver_name', 'nationality']], 'driverId'),
    (constructors[['constructorId', 'constructor_name', 'constructor_nationality']], 'constructorId'),
    (status[['statusId', 'status_label']], 'statusId'),
]
race_results = results_scope
for lookup, key in label_lookups:
    race_results = race_results.merge(lookup, on=key, how='left')

race_results[['season', 'race_name', 'positionText', 'driver_name', 'constructor_name', 'status_label']].head()


Unnamed: 0,season,race_name,positionText,driver_name,constructor_name,status_label
0,2022,Bahrain Grand Prix,1,Charles Leclerc,Ferrari,Finished
1,2022,Bahrain Grand Prix,2,Carlos Sainz,Ferrari,Finished
2,2022,Bahrain Grand Prix,3,Lewis Hamilton,Mercedes,Finished
3,2022,Bahrain Grand Prix,4,George Russell,Mercedes,Finished
4,2022,Bahrain Grand Prix,5,Kevin Magnussen,Haas F1 Team,Finished


## Pit Strategy Signatures


In [7]:
def _label_strategies(frame: pd.DataFrame) -> pd.DataFrame:
    """Return `frame` with `stop_laps_repr` and `strategy_key` appended.

    Expects `stop_count` (integer) and `stop_laps_tuple` (tuple of lap numbers)
    columns. `stop_laps_repr` is the laps joined with ', ' (or 'no stops' for an
    empty tuple); `strategy_key` is '<stop_count>|<stop_laps_repr>'.
    """
    stop_laps_repr = frame['stop_laps_tuple'].apply(
        lambda laps: ', '.join(map(str, laps)) if laps else 'no stops'
    )
    strategy_key = frame['stop_count'].astype(str) + '|' + stop_laps_repr
    return frame.assign(stop_laps_repr=stop_laps_repr, strategy_key=strategy_key)


# Drop any `season` column left by a previous run of this cell before merging,
# so re-running the cell does not produce `season_x` / `season_y` duplicates.
pit_scope = (
    pit_scope
    .drop(columns='season', errors='ignore')
    .merge(races_scope[['raceId', 'season']], on='raceId', how='left')
)

# One row per (race, driver): number of stops and the laps they happened on,
# ordered by stop number.
strategy_features = (
    pit_scope
    .sort_values(['raceId', 'driverId', 'stop'])
    .groupby(['raceId', 'driverId'])
    .agg(
        stop_count=('stop', 'count'),
        stop_laps_tuple=('lap', lambda laps: tuple(int(x) for x in laps)),
    )
    .reset_index()
)
strategy_features = _label_strategies(strategy_features)

# Left-join onto results so drivers with no recorded pit stop are kept; their
# missing values become 0 stops and an empty lap tuple before labelling.
strategy_all = (
    race_results[['raceId', 'driverId', 'driver_name', 'constructor_name', 'season', 'race_name', 'positionOrder', 'positionText']]
    .merge(
        strategy_features[['raceId', 'driverId', 'stop_count', 'stop_laps_tuple']],
        on=['raceId', 'driverId'],
        how='left',
    )
)
strategy_all['stop_count'] = strategy_all['stop_count'].fillna(0).astype(int)
strategy_all['stop_laps_tuple'] = strategy_all['stop_laps_tuple'].apply(
    lambda laps: laps if isinstance(laps, tuple) else tuple()
)
strategy_all = _label_strategies(strategy_all)
strategy_all['positionOrder'] = pd.to_numeric(strategy_all['positionOrder'], errors='coerce')

# A strategy is "unique" when no other driver in the same race shares its key.
strategy_counts = strategy_all.groupby(['raceId', 'strategy_key']).size().reset_index(name='strategy_occurrences')
strategy_all = strategy_all.merge(strategy_counts, on=['raceId', 'strategy_key'], how='left')
strategy_all['strategy_unique'] = strategy_all['strategy_occurrences'] == 1

strategy_all.head()


Unnamed: 0,raceId,driverId,driver_name,constructor_name,season,race_name,positionOrder,positionText,stop_count,stop_laps_tuple,stop_laps_repr,strategy_key,strategy_occurrences,strategy_unique
0,1074,844,Charles Leclerc,Ferrari,2022,Bahrain Grand Prix,1,1,3,"(15, 31, 46)","15, 31, 46","3|15, 31, 46",1,True
1,1074,832,Carlos Sainz,Ferrari,2022,Bahrain Grand Prix,2,2,3,"(14, 33, 44)","14, 33, 44","3|14, 33, 44",1,True
2,1074,1,Lewis Hamilton,Mercedes,2022,Bahrain Grand Prix,3,3,3,"(11, 27, 44)","11, 27, 44","3|11, 27, 44",1,True
3,1074,847,George Russell,Mercedes,2022,Bahrain Grand Prix,4,4,3,"(15, 33, 45)","15, 33, 45","3|15, 33, 45",1,True
4,1074,825,Kevin Magnussen,Haas F1 Team,2022,Bahrain Grand Prix,5,5,3,"(14, 34, 46)","14, 34, 46","3|14, 34, 46",1,True


## Strategy Regret Proxy (Finishers Only)


In [8]:
regret_base = race_results.copy()
for numeric_col in ('milliseconds', 'laps'):
    # Non-numeric entries become NaN and are dropped from the finisher set below.
    regret_base[numeric_col] = pd.to_numeric(regret_base[numeric_col], errors='coerce')

# A row counts as a finish when its status mentions "Finished" or starts with '+'
# (presumably lapped-finisher labels such as "+1 Lap" - confirm against status.csv).
status_text = regret_base['status_label']
finish_mask = (
    status_text.str.contains('Finished', case=False, na=False)
    | status_text.str.startswith('+', na=False)
)
regret_finishers = regret_base.loc[finish_mask].dropna(subset=['milliseconds', 'laps'])

# Reference values: best time in the race, best time among drivers who completed
# the same number of laps, and the largest lap count recorded in the race.
finish_ms = regret_finishers['milliseconds']
race_best_ms = regret_finishers.groupby('raceId')['milliseconds'].transform('min')
best_same_lap_ms = regret_finishers.groupby(['raceId', 'laps'])['milliseconds'].transform('min')
max_laps_in_race = regret_finishers.groupby('raceId')['laps'].transform('max')

regret_finishers = regret_finishers.assign(
    race_best_ms=race_best_ms,
    best_same_lap_ms=best_same_lap_ms,
    regret_vs_winner_ms=finish_ms - race_best_ms,
    regret_same_lap_ms=finish_ms - best_same_lap_ms,
    max_laps_in_race=max_laps_in_race,
    on_lead_lap=regret_finishers['laps'] == max_laps_in_race,
)

regret_finishers[['season', 'race_name', 'driver_name', 'regret_vs_winner_ms']].head()


Unnamed: 0,season,race_name,driver_name,regret_vs_winner_ms
0,2022,Bahrain Grand Prix,Charles Leclerc,0.0
1,2022,Bahrain Grand Prix,Carlos Sainz,5598.0
2,2022,Bahrain Grand Prix,Lewis Hamilton,9675.0
3,2022,Bahrain Grand Prix,George Russell,11211.0
4,2022,Bahrain Grand Prix,Kevin Magnussen,14754.0


## Race-Level Feature Engineering


In [9]:
# Top-three rows per race, ordered by finishing position, so positional picks
# below map to winner / runner-up / third place.
podium_rows = race_results.loc[race_results['positionOrder'] <= 3]
podium_rows = podium_rows.sort_values(['raceId', 'positionOrder'])

podium_df = podium_rows.groupby('raceId').agg(
    winner=('driver_name', lambda names: names.iloc[0] if len(names) > 0 else None),
    runner_up=('driver_name', lambda names: names.iloc[1] if len(names) > 1 else None),
    third_place=('driver_name', lambda names: names.iloc[2] if len(names) > 2 else None),
    podium_list=('driver_name', lambda names: ', '.join(names)),
)
podium_df = podium_df.reset_index()

# Total recorded stops per race.
pit_totals = pit_scope.groupby('raceId').agg(total_pit_stops=('stop', 'count')).reset_index()

# Distinct drivers per race, counted over every results row (the column is named
# `classified_drivers`, but no classification filter is applied here).
driver_counts = (
    race_results.groupby('raceId')['driverId']
    .nunique()
    .reset_index(name='classified_drivers')
)

# Right-join keeps races with no pit-stop rows; those get 0 stops.
pit_summary = (
    pit_totals.merge(driver_counts, on='raceId', how='right')
    .fillna({'total_pit_stops': 0})
    .astype({'total_pit_stops': int})
    .assign(
        avg_stops_per_driver=lambda frame: (
            frame['total_pit_stops'] / frame['classified_drivers']
        ).round(2)
    )
)

def _unique_share(row):
    """Share of a race's driver strategies that are unique, rounded to 3 decimals (0 if none)."""
    total = row['total_driver_strategies']
    if not total:
        return 0
    return round(row['unique_strategy_count'] / total, 3)


# Per race: how many driver rows there are and how many carry a one-off strategy.
strategy_summary = strategy_all.groupby('raceId').agg(
    total_driver_strategies=('driverId', 'size'),
    unique_strategy_count=('strategy_unique', 'sum'),
)
strategy_summary = strategy_summary.reset_index().astype({'unique_strategy_count': int})
strategy_summary['unique_strategy_share'] = strategy_summary.apply(_unique_share, axis=1)

pos_df = race_results.copy()

# Grid values <= 0 are treated as "no usable grid slot" and become NaN.
# `.where` builds a new (float) column instead of writing NaN into an integer
# column in place, an upcast that pandas >= 2.1 deprecates.
grid_num = pd.to_numeric(pos_df['grid'], errors='coerce')
pos_df['grid_num'] = grid_num.where(grid_num > 0)
pos_df['positionOrder'] = pd.to_numeric(pos_df['positionOrder'], errors='coerce')
# Positive gain = finished ahead of the starting slot.
pos_df['position_gain'] = pos_df['grid_num'] - pos_df['positionOrder']

# One record per race: mean gain among the top-10 finishers, plus the driver
# with the single largest gain across the whole field.
position_records = []
for race_id, group in pos_df.groupby('raceId'):
    top10 = group.sort_values('positionOrder').head(10)
    avg_gain = top10['position_gain'].dropna().mean()
    movers = group.dropna(subset=['position_gain'])
    if not movers.empty:
        idx = movers['position_gain'].idxmax()
        top_mover = movers.loc[idx, 'driver_name']
        top_gain = movers.loc[idx, 'position_gain']
    else:
        # No driver in this race has both a usable grid slot and a finishing order.
        top_mover = None
        top_gain = None
    position_records.append({
        'raceId': race_id,
        'avg_position_gain_top10': round(avg_gain, 2) if pd.notna(avg_gain) else None,
        'top_mover': top_mover,
        'top_mover_gain': float(top_gain) if pd.notna(top_gain) else None,
    })
position_summary = pd.DataFrame(position_records)

# Largest and median gap to the race-best time per race, in ms and in seconds.
regret_summary = (
    regret_finishers.groupby('raceId')['regret_vs_winner_ms']
    .agg(['max', 'median'])
    .rename(columns={'max': 'max_regret_ms', 'median': 'median_regret_ms'})
    .reset_index()
)
regret_summary = regret_summary.assign(
    max_regret_s=regret_summary['max_regret_ms'].div(1000).round(3),
    median_regret_s=regret_summary['median_regret_ms'].div(1000).round(3),
)

# `finish_mask` comes from the regret cell and shares race_results' index.
race_results['is_classified_finish'] = finish_mask

# dnf_count = results rows in the race minus rows flagged as finishes.
dnf_summary = (
    race_results.groupby('raceId')
    .agg(
        total_classified=('driverId', 'size'),
        classified_finishers=('is_classified_finish', 'sum'),
    )
    .reset_index()
)
dnf_summary['dnf_count'] = (
    dnf_summary['total_classified'].sub(dnf_summary['classified_finishers']).astype(int)
)

# Left-join every per-race summary onto the scoped race list, one table at a time.
feature_tables = [
    podium_df,
    pit_summary,
    strategy_summary,
    position_summary,
    regret_summary[['raceId', 'max_regret_s', 'median_regret_s']],
    dnf_summary[['raceId', 'dnf_count', 'total_classified']],
]
race_features = races_scope[['raceId', 'season', 'round', 'race_name', 'date']]
for table in feature_tables:
    race_features = race_features.merge(table, on='raceId', how='left')

# Races missing from a summary table get zeros for these count/share columns.
zero_filled_cols = [
    'unique_strategy_count',
    'total_driver_strategies',
    'unique_strategy_share',
    'total_pit_stops',
    'avg_stops_per_driver',
    'dnf_count',
    'top_mover_gain',
]
race_features = race_features.fillna({col: 0 for col in zero_filled_cols})

# The left joins may have turned these counts into floats; restore integers.
integer_cols = ['unique_strategy_count', 'total_driver_strategies', 'total_pit_stops', 'dnf_count']
race_features = race_features.astype({col: int for col in integer_cols})

race_features['top_mover_gain'] = race_features['top_mover_gain'].round(1)
race_features = race_features.sort_values(['season', 'round']).reset_index(drop=True)

race_features[['season', 'race_name', 'unique_strategy_share', 'avg_stops_per_driver']].head()


Unnamed: 0,season,race_name,unique_strategy_share,avg_stops_per_driver
0,2022,Bahrain Grand Prix,0.9,2.9
1,2022,Saudi Arabian Grand Prix,0.2,0.95
2,2022,Australian Grand Prix,0.4,1.1
3,2022,Emilia Romagna Grand Prix,0.3,1.25
4,2022,Miami Grand Prix,0.7,1.25


In [10]:
# Spot-check the attrition and regret columns on the most recent races.
race_features.loc[:, ['season', 'race_name', 'dnf_count', 'max_regret_s']].tail()


Unnamed: 0,season,race_name,dnf_count,max_regret_s
63,2024,Mexico City Grand Prix,3,64.928
64,2024,São Paulo Grand Prix,5,79.649
65,2024,Las Vegas Grand Prix,2,91.005
66,2024,Qatar Grand Prix,5,62.656
67,2024,Abu Dhabi Grand Prix,4,83.821


## Generate Summaries


In [11]:
def build_summary(row: pd.Series) -> str:
    """Compose a short narrative for one race from its engineered features.

    Args:
        row: One row of race features. Reads `race_name`, `season`, `winner`,
            `runner_up`, `third_place`, `unique_strategy_share`,
            `avg_stops_per_driver`, `top_mover`, `top_mover_gain`, `dnf_count`
            and `max_regret_s`; all but `race_name` and `season` may be absent
            or missing.

    Returns:
        The sentences that apply to the race joined by single spaces, or a
        generic fallback sentence when no fragment applies.
    """
    fragments = []

    # Only string names count, so missing podium slots (None/NaN) are skipped.
    podium = [p for p in (row.get('winner'), row.get('runner_up'), row.get('third_place')) if isinstance(p, str)]
    if podium:
        opening = f"{row['race_name']} {row['season']} saw {podium[0]} take the win"
        if len(podium) > 1:
            opening += f", ahead of {podium[1]}"
        if len(podium) > 2:
            opening += f" and {podium[2]}"
        fragments.append(opening + '.')

    share = row.get('unique_strategy_share', 0)
    if share and share >= 0.2:
        # Round rather than truncate: 0.29 * 100 is 28.999999999999996 in binary
        # floating point, so int() alone would report 28% instead of 29%.
        percent = int(round(float(share) * 100))
        qualifier = 'Strategy roulette' if percent >= 35 else 'Creative pit timing'
        fragments.append(f"{qualifier} defined the race, with {percent}% of drivers on unique pit plans.")

    stops = row.get('avg_stops_per_driver')
    if stops and stops >= 2.5:
        fragments.append(f"High degradation forced an average of {stops:.1f} stops per driver.")

    mover = row.get('top_mover')
    gain = row.get('top_mover_gain')
    if isinstance(mover, str) and gain and gain >= 5:
        fragments.append(f"{mover} was the top mover, gaining {int(gain)} grid spots.")

    dnfs = row.get('dnf_count')
    if dnfs and dnfs >= 5:
        fragments.append(f"Attrition was heavy with {dnfs} retirements.")

    worst_regret = row.get('max_regret_s')
    if worst_regret and worst_regret >= 30:
        fragments.append(f"The biggest strategy miss cost about {int(worst_regret)} seconds.")

    if not fragments:
        fragments.append('A steady race delivered a straightforward finish for the front-runners.')
    return ' '.join(fragments)

def _highlights_search_url(row):
    """Build a YouTube search URL for the race's season and name."""
    query = str(row['season']) + ' ' + row['race_name'] + ' F1 highlights'
    return 'https://www.youtube.com/results?search_query=' + quote_plus(query)


race_features['summary_text'] = race_features.apply(build_summary, axis=1)
race_features['youtube_highlights_search'] = race_features.apply(_highlights_search_url, axis=1)

# Narrative columns first, followed by the features the text was derived from.
summary_columns = [
    'season', 'round', 'race_name', 'summary_text', 'youtube_highlights_search',
    'unique_strategy_share', 'avg_stops_per_driver', 'dnf_count', 'max_regret_s',
    'top_mover', 'top_mover_gain',
]
race_summaries = race_features.loc[:, summary_columns].copy()
race_summaries.head()


Unnamed: 0,season,round,race_name,summary_text,youtube_highlights_search,unique_strategy_share,avg_stops_per_driver,dnf_count,max_regret_s,top_mover,top_mover_gain
0,2022,1,Bahrain Grand Prix,Bahrain Grand Prix 2022 saw Charles Leclerc ta...,https://www.youtube.com/results?search_query=2...,0.9,2.9,3,63.829,Yuki Tsunoda,8.0
1,2022,2,Saudi Arabian Grand Prix,Saudi Arabian Grand Prix 2022 saw Max Verstapp...,https://www.youtube.com/results?search_query=2...,0.2,0.95,7,91.742,Lewis Hamilton,5.0
2,2022,3,Australian Grand Prix,Australian Grand Prix 2022 saw Charles Leclerc...,https://www.youtube.com/results?search_query=2...,0.4,1.1,3,88.598,Alexander Albon,10.0
3,2022,4,Emilia Romagna Grand Prix,Emilia Romagna Grand Prix 2022 saw Max Verstap...,https://www.youtube.com/results?search_query=2...,0.3,1.25,2,75.26,George Russell,7.0
4,2022,5,Miami Grand Prix,Miami Grand Prix 2022 saw Max Verstappen take ...,https://www.youtube.com/results?search_query=2...,0.7,1.25,5,73.305,Esteban Ocon,12.0


In [12]:
# Fixed seed so the same three 2024 races are shown on every run.
race_summaries.loc[race_summaries['season'] == 2024].sample(n=3, random_state=42)


Unnamed: 0,season,round,race_name,summary_text,youtube_highlights_search,unique_strategy_share,avg_stops_per_driver,dnf_count,max_regret_s,top_mover,top_mover_gain
52,2024,9,Canadian Grand Prix,Canadian Grand Prix 2024 saw Max Verstappen ta...,https://www.youtube.com/results?search_query=2...,0.6,2.1,5,52.694,Esteban Ocon,8.0
60,2024,17,Azerbaijan Grand Prix,Azerbaijan Grand Prix 2024 saw Oscar Piastri t...,https://www.youtube.com/results?search_query=2...,0.55,1.1,4,148.841,Lando Norris,11.0
44,2024,1,Bahrain Grand Prix,Bahrain Grand Prix 2024 saw Max Verstappen tak...,https://www.youtube.com/results?search_query=2...,1.0,2.15,0,93.216,Guanyu Zhou,6.0


## Persist Outputs

Save race summaries and race-level features for downstream services (duel seeds are written in the next section).

In [14]:
OUTPUT_DIR = PROJECT_ROOT / 'data' / 'processed'
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

race_summaries_path = OUTPUT_DIR / 'race_summaries.parquet'
race_summaries_csv_path = OUTPUT_DIR / 'race_summaries.csv'
race_features_path = OUTPUT_DIR / 'race_features.parquet'

# Parquet export is best-effort: only a missing engine (ImportError) is tolerated,
# any other failure still surfaces.
parquet_written = True
try:
    race_summaries.to_parquet(race_summaries_path, index=False)
    race_features.to_parquet(race_features_path, index=False)
except ImportError:
    parquet_written = False
    print('pyarrow/fastparquet not available; skipping parquet export. Install `pyarrow` to enable.')

# The CSV copy of the summaries is always written.
race_summaries.to_csv(race_summaries_csv_path, index=False)

# Jupyter only renders a trailing top-level expression, so collect the written
# paths here and end the cell with them instead of naming them inside if/else.
if parquet_written:
    written_paths = (race_summaries_path, race_summaries_csv_path, race_features_path)
else:
    written_paths = (race_summaries_csv_path,)
written_paths


pyarrow/fastparquet not available; skipping parquet export. Install `pyarrow` to enable.


## Duel Model Seed

Derive a prior excitement score per race using engineered features to warm-start the pairwise duel model.

In [15]:
def min_max(series: pd.Series) -> pd.Series:
    """Scale a series onto [0, 1] by its own minimum and maximum.

    Values are coerced to numbers first; anything non-numeric or missing is
    treated as 0. When every value is identical the result is a constant 0.5
    series on the input's index.
    """
    values = pd.to_numeric(series, errors='coerce').fillna(0)
    lowest = values.min()
    highest = values.max()
    if highest != lowest:
        return (values - lowest) / (highest - lowest)
    # No spread to normalise against: fall back to the midpoint.
    return pd.Series(0.5, index=series.index)

# Each engineered feature is min-max scaled across all races in scope.
feature_matrix = pd.DataFrame({
    'unique_strategy': min_max(race_features['unique_strategy_share']),
    'pit_intensity': min_max(race_features['avg_stops_per_driver']),
    'attrition': min_max(race_features['dnf_count']),
    'regret_peak': min_max(race_features['max_regret_s']),
    'position_chaos': min_max(race_features['top_mover_gain']),
}, index=race_features.index)

# Blend weights for the prior score; they sum to 1.0.
weights = {
    'unique_strategy': 0.25,
    'pit_intensity': 0.2,
    'attrition': 0.15,
    'regret_peak': 0.25,
    'position_chaos': 0.15,
}
weighted_score = sum(feature_matrix[col] * w for col, w in weights.items())

# rescale to centered rating (the small epsilon guards against zero spread)
score_z = (weighted_score - weighted_score.mean()) / (weighted_score.std(ddof=0) + 1e-6)
elo_baseline = 1500 + 120 * score_z
# Maps a score of 0.5 to 0 and the [0, 1] range onto [-2, 2].
logit_prior = (weighted_score - 0.5) * 4

duel_seed = race_features[['raceId', 'season', 'round', 'race_name']].copy()
duel_seed['prior_score'] = weighted_score.round(4)
duel_seed['elo_seed'] = elo_baseline.round(1)
duel_seed['logit_seed'] = logit_prior.round(3)

# Per-race scaled feature values, stored as plain dicts. Values are cast to
# built-in floats so the dict text written to CSV stays `1.0` rather than
# `np.float64(1.0)` when running under NumPy >= 2.
duel_seed['feature_weights'] = pd.Series(
    [
        {col: float(value) for col, value in zip(feature_matrix.columns, values)}
        for values in feature_matrix.itertuples(index=False, name=None)
    ],
    index=duel_seed.index,
    dtype=object,
)

duel_seed_path = OUTPUT_DIR / 'race_duel_seed.csv'
duel_seed.to_csv(duel_seed_path, index=False)

duel_seed.sort_values('elo_seed', ascending=False).head()


Unnamed: 0,raceId,season,round,race_name,prior_score,elo_seed,logit_seed,feature_weights
34,1111,2023,13,Dutch Grand Prix,0.6619,1755.6,0.648,"{'unique_strategy': 1.0, 'pit_intensity': 1.0,..."
5,1079,2022,6,Spanish Grand Prix,0.5974,1675.8,0.39,"{'unique_strategy': 1.0, 'pit_intensity': 0.41..."
18,1093,2022,19,United States Grand Prix,0.5874,1663.4,0.35,"{'unique_strategy': 1.0, 'pit_intensity': 0.20..."
41,1118,2023,20,São Paulo Grand Prix,0.5827,1657.5,0.331,"{'unique_strategy': 0.7368421052631579, 'pit_i..."
40,1117,2023,19,Mexico City Grand Prix,0.5802,1654.4,0.321,"{'unique_strategy': 0.7894736842105263, 'pit_i..."


## Next Steps

- Replace YouTube search URLs with curated highlight links.
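- Persist `race_summaries` to Parquet as well as CSV for the web experience (the CSV is already written above; the Parquet export was skipped in this run because `pyarrow` was not installed).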
- Feed these features into the duel model to initialize excitement scores.
