<h1><b>VERSION: 1</b></h1>
<h3>F1 Data Fetching Lap-Wise</h3>
<i><b>PROJECT: PITSTOP STRATEGY</b></i>

In [2]:
import pandas as pd
import numpy as np

In [None]:
import os

import fastf1

# FastF1 requires the cache directory to exist before the cache is enabled,
# so create it up front; this keeps the notebook runnable on a fresh checkout.
CACHE_DIR = "f1_cache"
os.makedirs(CACHE_DIR, exist_ok=True)
fastf1.Cache.enable_cache(CACHE_DIR)

# Assumed usable life of each tyre compound, in laps. The dataset builder
# subtracts the current tyre age from this value to get
# "tire_life_remaining_est"; compounds missing from the table are treated as 0.
# NOTE(review): these look like hand-picked estimates -- confirm their source.
TIRE_LIFE_ESTIMATE = {
    "SOFT": 15,
    "MEDIUM": 25,
    "HARD": 35,
    "INTERMEDIATE": 20,
    "WET": 20
}


def _avg_pit_lane_time_by_team(laps):
    """Return ``{team: mean seconds from pit entry to pit exit}`` for one race."""
    ordered = laps.sort_values(["Driver", "LapNumber"])
    per_driver = ordered.groupby("Driver")

    # The pit entry time is recorded on the in-lap and the pit exit time on
    # the following out-lap, so each in-lap is paired with the same driver's
    # next lap instead of looking for both values on a single lap.
    next_pit_out = per_driver["PitOutTime"].shift(-1)
    next_lap_number = per_driver["LapNumber"].shift(-1)

    pit_seconds = (next_pit_out - ordered["PitInTime"]).dt.total_seconds()

    # Keep only real stops: a positive duration (NaN compares False) whose
    # exit was recorded on the immediately following lap.
    is_stop = pit_seconds.gt(0) & next_lap_number.eq(ordered["LapNumber"] + 1)

    stops = pd.DataFrame(
        {"Team": ordered["Team"], "pit_duration": pit_seconds}
    )[is_stop]
    return stops.groupby("Team")["pit_duration"].mean().to_dict()


def _track_status_contains(track_status, codes=("4", "6")):
    """Return 1 if any of ``codes`` occurs in the lap's track status, else 0."""
    if pd.isna(track_status):
        return 0
    # A lap's status can hold several concatenated codes (e.g. "124"), so a
    # substring test is used rather than comparing the whole value.
    status = str(track_status)
    return int(any(code in status for code in codes))


def build_f1_dataset_for_year(season: int):
    """Build a lap-by-lap dataset for every race of ``season`` and save it.

    For each event in the season schedule the race session ("R") is loaded
    and one row is produced per driver lap that has a lap time, combining
    timing, tyre, traffic, pit-stop and weather columns.

    Parameters
    ----------
    season : int
        Championship year passed to ``fastf1.get_event_schedule``.

    Returns
    -------
    pandas.DataFrame
        One row per processed lap. The same frame is written to
        ``f1_dataset_year_{season}.csv`` in the working directory.

    Notes
    -----
    * Races whose data cannot be loaded are skipped; the reason is printed.
    * Gaps and the "cars within 2s" counts compare the session time at which
      each car completed the *same lap number*.
    * ``avg_lap_time_on_stint`` averages every timed lap of the stint,
      including laps after the current one.
    """
    all_rows = []
    schedule = fastf1.get_event_schedule(season, include_testing=False)

    zero_gap = pd.Timedelta(0)
    two_seconds = pd.Timedelta(seconds=2)

    for _, event in schedule.iterrows():
        round_number = event["RoundNumber"]
        race_name = event["EventName"]

        print(f"Processing {season} | Round {round_number} | {race_name}")

        try:
            session = fastf1.get_session(season, round_number, "R")
            session.load()
            # The accessors are inside the try block as well, so a session
            # without loaded data is skipped instead of aborting the season.
            laps = session.laps.copy()
            weather = session.weather_data.copy()
            results = session.results.copy()
        except Exception as exc:
            # Best effort: keep going, but say why the race was skipped.
            print(
                f"Skipping {season} | Round {round_number} | {race_name}: {exc!r}"
            )
            continue

        # Build safe lookup tables
        results_by_driver = (
            results
            .reset_index()
            .set_index("Abbreviation", drop=False)
        )

        laps = laps.dropna(subset=["LapNumber", "Driver", "Time"])
        if laps.empty:
            print(
                f"Skipping {season} | Round {round_number} | {race_name}: no lap data"
            )
            continue

        avg_pit_time_team = _avg_pit_lane_time_by_team(laps)

        # Lap-end session times of all cars, grouped by lap number; built
        # once per race instead of re-filtering the full frame for every lap.
        lap_end_times_by_lap = {
            number: times
            for number, times in laps.groupby("LapNumber")["Time"]
        }

        for driver in laps["Driver"].unique():

            if driver not in results_by_driver.index:
                continue

            res = results_by_driver.loc[driver]

            # Plain boolean filter; equivalent to the deprecated pick_driver().
            driver_laps = laps[laps["Driver"] == driver].sort_values("LapNumber")

            # Mean lap time in seconds per stint (NaN lap times are ignored).
            stint_avg_lap_time = (
                driver_laps["LapTime"].dt.total_seconds()
                .groupby(driver_laps["Stint"])
                .mean()
                .to_dict()
            )

            last_lap_time = None
            last_position = None
            last_pit_lap = None
            pit_count = 0

            for _, lap in driver_laps.iterrows():

                if pd.isna(lap["LapTime"]) or pd.isna(lap["Time"]):
                    continue

                lap_number = int(lap["LapNumber"])
                lap_time = lap["LapTime"].total_seconds()

                pit_this_lap = int(pd.notna(lap["PitInTime"]))
                if pit_this_lap:
                    pit_count += 1
                    last_pit_lap = lap_number

                # Compound fitted at the stop = compound of the next lap.
                next_compound = None
                if pit_this_lap:
                    nxt = driver_laps[driver_laps["LapNumber"] == lap_number + 1]
                    if not nxt.empty:
                        next_compound = nxt.iloc[0]["Compound"]

                lap_delta = (
                    lap_time - last_lap_time
                    if last_lap_time is not None
                    else None
                )

                # Positive value = positions gained since the previous row.
                position_change = (
                    last_position - lap["Position"]
                    if last_position is not None
                    else 0
                )

                compound = lap["Compound"]
                tire_age = lap["TyreLife"]

                tire_life_remaining = (
                    TIRE_LIFE_ESTIMATE.get(compound, 0) - tire_age
                    if pd.notna(tire_age)
                    else None
                )

                stint_number = lap["Stint"]
                is_new_tire = int(tire_age == 1)
                laps_since_last_pit = (
                    lap_number - last_pit_lap
                    if last_pit_lap is not None
                    else lap_number
                )

                # None when the stint is unknown (NaN never matches a key).
                avg_lap_time_on_stint = stint_avg_lap_time.get(stint_number)

                # Positive delta: the other car finished this lap number
                # earlier (is ahead); negative delta: it finished later.
                lap_end = lap["Time"]
                deltas = lap_end - lap_end_times_by_lap[lap["LapNumber"]]

                ahead = deltas[deltas > zero_gap]
                cars_within_2s_ahead = int((ahead < two_seconds).sum())
                cars_within_2s_behind = int(
                    ((deltas < zero_gap) & (-deltas < two_seconds)).sum()
                )

                # Gap to the first car to complete this lap number (0 for it).
                gap_to_leader = deltas.max()
                # Gap to the closest car that completed this lap number
                # earlier; 0 when nobody did.
                gap_to_car_ahead = ahead.min() if not ahead.empty else zero_gap

                is_being_attacked = int(cars_within_2s_behind > 0)
                is_stuck_in_train = int(cars_within_2s_ahead >= 2)

                # Latest weather sample at or before the end of the lap; fall
                # back to the first sample, or to no values when the session
                # has no weather data at all.
                weather_before_lap = weather[weather["Time"] <= lap_end]
                if not weather_before_lap.empty:
                    w = weather_before_lap.iloc[-1]
                elif not weather.empty:
                    w = weather.iloc[0]
                else:
                    w = {}

                # A missing flag counts as "not a personal best" rather than
                # raising inside int().
                is_personal_best = lap["IsPersonalBest"]
                personal_best = (
                    int(bool(is_personal_best))
                    if pd.notna(is_personal_best)
                    else 0
                )

                row = {
                    "season": season,
                    "round_number": round_number,
                    "race_name": race_name,
                    "driver": driver,
                    "team": lap["Team"],
                    "grid_position": res["GridPosition"],
                    # NOTE(review): this is "Position" from the *race*
                    # session results, which is presumably the finishing
                    # position rather than a qualifying result -- confirm
                    # before using it as a pre-race feature.
                    "qualifying_position": res["Position"],

                    "lap_number": lap_number,
                    "lap_time": lap_time,
                    "lap_time_delta_prev": lap_delta,
                    "sector1_time": lap["Sector1Time"],
                    "sector2_time": lap["Sector2Time"],
                    "sector3_time": lap["Sector3Time"],
                    "personal_best": personal_best,

                    "position": lap["Position"],
                    "position_change": position_change,
                    "gap_to_leader": gap_to_leader,
                    "gap_to_car_ahead": gap_to_car_ahead,

                    "current_compound": compound,
                    "tire_age_laps": tire_age,
                    "tire_life_remaining_est": tire_life_remaining,
                    "stint_number": stint_number,
                    "is_new_tire": is_new_tire,
                    "laps_since_last_pit": laps_since_last_pit,
                    "avg_lap_time_on_stint": avg_lap_time_on_stint,

                    "cars_within_2s_ahead": cars_within_2s_ahead,
                    "cars_within_2s_behind": cars_within_2s_behind,
                    "is_being_attacked": is_being_attacked,
                    "is_stuck_in_train": is_stuck_in_train,

                    "total_pit_stops_so_far": pit_count,
                    "last_pit_lap": last_pit_lap,
                    "avg_pit_time_team": avg_pit_time_team.get(lap["Team"], None),
                    "safety_car_pit": _track_status_contains(lap["TrackStatus"]),

                    "track_temperature": w.get("TrackTemp"),
                    "air_temperature": w.get("AirTemp"),
                    "humidity": w.get("Humidity"),
                    "track_status": lap["TrackStatus"],

                    "pit_this_lap": pit_this_lap,
                    "next_tire_compound": next_compound
                }

                all_rows.append(row)

                last_lap_time = lap_time
                last_position = lap["Position"]

    df = pd.DataFrame(all_rows)
    filename = f"f1_dataset_year_{season}.csv"
    df.to_csv(filename, index=False)
    print(f"Saved dataset: {filename}")
    return df


# Prompt for a season and build its dataset. In a notebook __name__ is
# "__main__", so running this cell asks for input interactively.
if __name__ == "__main__":
    # The range separator was a mis-encoded en dash that displayed as
    # garbage; a plain ASCII hyphen renders correctly in any encoding.
    year = int(input("Enter F1 season year (2018-2024): "))
    build_f1_dataset_for_year(year)


In [2]:
# Spot-check one generated season file. The file name matches the pattern
# written by build_f1_dataset_for_year, so the 2021 dataset must already
# exist in the working directory.
df=pd.read_csv("f1_dataset_year_2021.csv")
# Not the last expression of the cell, so this preview is not rendered;
# only the column index below appears in the cell output.
df.head()
df.columns

Index(['season', 'round_number', 'race_name', 'driver', 'team',
       'grid_position', 'qualifying_position', 'lap_number', 'lap_time',
       'lap_time_delta_prev', 'sector1_time', 'sector2_time', 'sector3_time',
       'personal_best', 'position', 'position_change', 'gap_to_leader',
       'gap_to_car_ahead', 'current_compound', 'tire_age_laps',
       'tire_life_remaining_est', 'stint_number', 'is_new_tire',
       'laps_since_last_pit', 'avg_lap_time_on_stint', 'cars_within_2s_ahead',
       'cars_within_2s_behind', 'is_being_attacked', 'is_stuck_in_train',
       'total_pit_stops_so_far', 'last_pit_lap', 'avg_pit_time_team',
       'safety_car_pit', 'track_temperature', 'air_temperature', 'humidity',
       'track_status', 'pit_this_lap', 'next_tire_compound'],
      dtype='object')

**DATA SET MERGING**

In [1]:
import pandas as pd
import glob

# Seasons to merge; one CSV per season is expected in the working directory.
years = range(2020, 2025)
files = [f'f1_dataset_year_{year}.csv' for year in years]

df_list = []

for file in files:
    try:
        df = pd.read_csv(file)
    except FileNotFoundError:
        # Missing seasons are skipped so the remaining ones can still merge.
        print(f"File {file} not found, skipping...")
        continue
    df_list.append(df)
    print(f"Successfully loaded {file}")

if df_list:
    # Concatenate all dataframes
    full_df = pd.concat(df_list, ignore_index=True)

    # Save the master dataset; the name is derived from the first and last
    # requested season so it stays in sync with `years`.
    output_file = f'f1_complete_dataset_{years[0]}_{years[-1]}.csv'
    full_df.to_csv(output_file, index=False)
    print(f"Merged dataset saved as '{output_file}'")
else:
    # pd.concat raises an unhelpful ValueError on an empty list; report the
    # real problem instead and do not write an output file.
    full_df = pd.DataFrame()
    print("No season files were found; nothing to merge.")

Successfully loaded f1_dataset_year_2020.csv
Successfully loaded f1_dataset_year_2021.csv
Successfully loaded f1_dataset_year_2022.csv
Successfully loaded f1_dataset_year_2023.csv
Successfully loaded f1_dataset_year_2024.csv
Merged dataset saved as 'f1_complete_dataset_2020_2024.csv'
