In [1]:
import fastf1 as ff1
from fastf1 import plotting
from fastf1.core import Laps

import pandas as pd
import numpy as np

In [2]:
rf_22 = pd.read_csv('features_race_2022.csv')
rf_23 = pd.read_csv('features_race_2023.csv')
rf_24 = pd.read_csv('features_race_2024.csv')
rf_25 = pd.read_csv('features_race_2025.csv')

rf = [rf_22, rf_23, rf_24, rf_25]

rf = pd.concat(rf, ignore_index=True)
rf.to_csv('features_race_p3.csv', index=False)

In [2]:
df_race = pd.read_csv('features_race_p3.csv', index_col=False)

In [3]:
df_race['Team'] = df_race['Team'].replace(['AlphaTauri', 'Racing Bulls'], 'RB')
df_race['Team'] = df_race['Team'].replace(['Alfa Romeo'], 'Kick Sauber')

In [5]:
df_race.value_counts('Team')

Team
McLaren            7727
Mercedes           7643
Red Bull Racing    7572
Ferrari            7483
Aston Martin       7468
RB                 7373
Haas F1 Team       7342
Kick Sauber        7273
Alpine             7122
Williams           7023
Name: count, dtype: int64

### Race mean lap time difference

In [4]:
race_mean_laptime_series = df_race.groupby(['Location', 'Year'])['LapTime'].transform('mean')

df_race['LapTime_Relative'] = df_race['LapTime'] - race_mean_laptime_series

### Exponential moving average TEAM lap time difference

In [5]:
df_race['EventDate'] = pd.to_datetime(df_race['EventDate'])

In [6]:
team_race_pace = df_race.groupby(['Team', 'Location', 'Year']).agg(
    # Get the team's average pace for that race
    Team_Avg_Relative_Pace=('LapTime_Relative', 'mean'),
    # Get the date of that race for sorting
    RaceDate=('EventDate', 'min') 
).reset_index()

team_race_pace = team_race_pace.sort_values(by='RaceDate')

# apply the .ewm() function to the sorted pace column.
# 'span=4' means the "current form" is roughly based on the last 4 races.
# You can tune this span. A smaller span reacts faster to changes.
# We use .transform() to apply the EMA calculation back to the team_race_pace DataFrame
team_race_pace['Team_Pace_EMA'] = team_race_pace.groupby('Team')['Team_Avg_Relative_Pace'] \
                                              .transform(lambda x: x.ewm(span=4, adjust=False).mean())

df_race_final = pd.merge(
    df_race,
    team_race_pace[['Team', 'Location', 'Year', 'Team_Pace_EMA']],
    on=['Team', 'Location', 'Year'],
    how='left'
)

### Teammate gap

In [7]:
# Calculate Average Driver Pace Per Race
driver_race_pace = df_race_final.groupby(['Driver', 'Team', 'Location', 'Year']).agg(
    # Get the team's average pace for that race
    Driver_Avg_Relative_Pace=('LapTime_Relative', 'mean'),
    # Get the date of that race for sorting
    RaceDate=('EventDate', 'min')
).reset_index()

teammate_comparison = pd.merge(
    driver_race_pace,
    driver_race_pace,
    on=['Team', 'Location', 'Year'],
    suffixes=('_driver', '_teammate')
)

# Filter out rows where a driver is being compared to themselves
gaps = teammate_comparison[teammate_comparison['Driver_driver'] != teammate_comparison['Driver_teammate']].copy()
# Gap Per Race
gaps['Teammate_Gap'] = gaps['Driver_Avg_Relative_Pace_driver'] - gaps['Driver_Avg_Relative_Pace_teammate']


driver_gaps = gaps[['Driver_driver', 'Team', 'Location', 'Year', 'RaceDate_driver', 'Teammate_Gap']]
driver_gaps = driver_gaps.rename(columns={'Driver_driver': 'Driver', 'RaceDate_driver': 'RaceDate'})
driver_gaps = driver_gaps.sort_values(by='RaceDate')

# Calculate the EMA of the Teammate Gap
driver_gaps['Driver_Teammate_Gap_EMA'] = driver_gaps.groupby('Driver')['Teammate_Gap'] \
                                                   .transform(lambda x: x.ewm(span=4, adjust=False).mean())

# Merge the Score Back to the Main DataFrame ---
df_race_final = pd.merge(
    df_race_final,
    driver_gaps[['Driver', 'Team', 'Location', 'Year', 'Driver_Teammate_Gap_EMA']],
    on=['Driver', 'Team', 'Location', 'Year'],
    how='left'
)

# Fill NaNs with 0 (a neutral starting point for drivers without teammates or first race)
df_race_final['Driver_Teammate_Gap_EMA'] = df_race_final['Driver_Teammate_Gap_EMA'].fillna(0)

### Driver consistency EMA

In [8]:
driver_consistency = df_race_final.groupby(['Driver', 'Location', 'Year']).agg(
    # Calculate the standard deviation of their relative lap times
    Driver_Consistency_Std=('LapTime_Relative', 'std'),
    # Get the date of that race for sorting
    RaceDate=('EventDate', 'min')
).reset_index()

# Handle NaNs in Std Dev
# If a driver had only 1 valid lap in a race, std() returns NaN. 
mean_consistency = driver_consistency['Driver_Consistency_Std'].mean()
driver_consistency['Driver_Consistency_Std'] = driver_consistency['Driver_Consistency_Std'].fillna(mean_consistency)

driver_consistency = driver_consistency.sort_values(by='RaceDate')

driver_consistency['Driver_Consistency_EMA'] = driver_consistency.groupby('Driver')['Driver_Consistency_Std'] \
                                                               .transform(lambda x: x.ewm(span=4, adjust=False).mean())

df_race_final = pd.merge(
    df_race_final,
    driver_consistency[['Driver', 'Location', 'Year', 'Driver_Consistency_EMA']],
    on=['Driver', 'Location', 'Year'],
    how='left'
)


### Tyre degradation slope

In [9]:
def calculate_degradation_slope(stint_laps_df, min_laps=5):
    """
    Calculates the slope of LapTime_Relative (Y) vs. TyreLife (X).
    Returns NaN if the stint is too short.
    """
    # Filter out any potential NaNs in the columns we need
    stint_laps_df = stint_laps_df.dropna(subset=['TyreLife', 'LapTime_Relative'])
    
    # Check if we have enough valid laps to calculate a meaningful slope
    if len(stint_laps_df) < min_laps:
        return np.nan
    
    # np.polyfit(X, Y, 1) returns [slope, intercept]
    # We only want the slope, which is the first element [0]
    try:
        slope, intercept = np.polyfit(
            stint_laps_df['TyreLife'], 
            stint_laps_df['LapTime_Relative'], 
            1
        )
        return slope
    except (np.linalg.LinAlgError, ValueError):
        # Handle cases where polyfit fails (e.g., all X values are the same)
        return np.nan

In [10]:
stint_slopes = df_race_final.groupby(['Driver', 'Team', 'Location', 'Year', 'Stint']).apply(calculate_degradation_slope)
stint_slopes = stint_slopes.reset_index(name='Degradation_Slope')

# Aggregate to Driver-Race Level
race_degradation = stint_slopes.groupby(['Driver', 'Location', 'Year'])['Degradation_Slope'].mean().reset_index()

# Handle NaNs (Pre-EMA)
# Fill NaNs (for drivers with no valid stints in a race) with the average
# slope of all other drivers. This assumes "average" performance when data is missing.
mean_slope = race_degradation['Degradation_Slope'].mean()
race_degradation['Degradation_Slope'] = race_degradation['Degradation_Slope'].fillna(mean_slope)


race_dates = df_race_final[['Driver', 'Location', 'Year', 'EventDate']].drop_duplicates()
race_degradation = pd.merge(
    race_degradation,
    race_dates,
    on=['Driver', 'Location', 'Year'],
    how='left'
)


race_degradation = race_degradation.sort_values(by='EventDate')
race_degradation['Degradation_EMA'] = race_degradation.groupby('Driver')['Degradation_Slope'] \
                                                    .transform(lambda x: x.ewm(span=4, adjust=False).mean())

df_race_final = pd.merge(
    df_race_final,
    race_degradation[['Driver', 'Location', 'Year', 'Degradation_EMA']],
    on=['Driver', 'Location', 'Year'],
    how='left'
)

df_race_final['Degradation_EMA'] = df_race_final['Degradation_EMA'].fillna(mean_slope)

  stint_slopes = df_race_final.groupby(['Driver', 'Team', 'Location', 'Year', 'Stint']).apply(calculate_degradation_slope)


In [11]:
df_race_final.to_csv('features_race_p3_stats.csv', index=False)

In [12]:
df_race_final = pd.read_csv('features_race_p3_stats.csv', index_col=False)

In [13]:
df_race_final

Unnamed: 0,Time,Driver,DriverNumber,Team,LapTime,LapNumber,Stint,Sector1Time,Sector2Time,Sector3Time,...,Hard,Location,Country,Year,EventDate,LapTime_Relative,Team_Pace_EMA,Driver_Teammate_Gap_EMA,Driver_Consistency_EMA,Degradation_EMA
0,0 days 01:06:03.288000,ALB,23,Williams,100.548,2.0,1.0,32.027,43.725,24.796,...,C1,Sakhir,Bahrain,2022,2022-03-20,1.137768,1.351977,-0.385488,1.410489,0.157463
1,0 days 01:07:43.952000,ALB,23,Williams,100.664,3.0,1.0,32.056,43.928,24.680,...,C1,Sakhir,Bahrain,2022,2022-03-20,1.253768,1.351977,-0.385488,1.410489,0.157463
2,0 days 01:09:25.078000,ALB,23,Williams,101.126,4.0,1.0,32.050,44.161,24.915,...,C1,Sakhir,Bahrain,2022,2022-03-20,1.715768,1.351977,-0.385488,1.410489,0.157463
3,0 days 01:11:07.381000,ALB,23,Williams,102.303,5.0,1.0,32.792,44.560,24.951,...,C1,Sakhir,Bahrain,2022,2022-03-20,2.892768,1.351977,-0.385488,1.410489,0.157463
4,0 days 01:12:49.089000,ALB,23,Williams,101.708,6.0,1.0,32.220,44.522,24.966,...,C1,Sakhir,Bahrain,2022,2022-03-20,2.297768,1.351977,-0.385488,1.410489,0.157463
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74021,0 days 02:24:10.481000,VER,1,Red Bull Racing,98.192,52.0,2.0,26.253,39.435,32.504,...,C1,Austin,United States,2025,2025-10-19,-1.967763,-0.431059,-1.114680,0.980675,-0.006360
74022,0 days 02:25:48.588000,VER,1,Red Bull Racing,98.107,53.0,2.0,26.033,39.396,32.678,...,C1,Austin,United States,2025,2025-10-19,-2.052763,-0.431059,-1.114680,0.980675,-0.006360
74023,0 days 02:27:27.487000,VER,1,Red Bull Racing,98.899,54.0,2.0,26.165,39.575,33.159,...,C1,Austin,United States,2025,2025-10-19,-1.260763,-0.431059,-1.114680,0.980675,-0.006360
74024,0 days 02:29:06.445000,VER,1,Red Bull Racing,98.958,55.0,2.0,26.760,39.409,32.789,...,C1,Austin,United States,2025,2025-10-19,-1.201763,-0.431059,-1.114680,0.980675,-0.006360


# RACE FEATURE BUILDER

In [14]:
# Enable the cache by providing the name of the cache folder, speed up
ff1.Cache.enable_cache('cache')

# Setup plotting, setup the plot (bg: black, ...)
plotting.setup_mpl()

ff1.Cache.offline_mode(True)



In [15]:
session = ff1.get_session(2025, 'MOnza', 'Q')
session.load()

core           INFO 	Loading data for Italian Grand Prix - Qualifying [v3.5.3]
req            INFO 	Updating cache for session_info...
_api           INFO 	Fetching session info data...
req            INFO 	Cache updated!
req            INFO 	Updating cache for driver_info...
_api           INFO 	Fetching driver list...
req            INFO 	Cache updated!
req            INFO 	Updating cache for session_status_data...
_api           INFO 	Fetching session status data...
req            INFO 	Cache updated!
req            INFO 	Updating cache for track_status_data...
_api           INFO 	Fetching track status data...
req            INFO 	Cache updated!
req            INFO 	Updating cache for _extended_timing_data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INFO 	Cache updated!
req            INFO 	Updating cache for timing_app_data...
_api           INFO 	Fetching timing app data...
req            INFO 	Cache updated!
core    

In [None]:
team_race_pace = team_race_pace.sort_values(by='RaceDate')

# apply the .ewm() function to the sorted pace column.
# 'span=4' means the "current form" is roughly based on the last 4 races.
# You can tune this span. A smaller span reacts faster to changes.
# We use .transform() to apply the EMA calculation back to the team_race_pace DataFrame
team_race_pace['Team_Pace_EMA'] = team_race_pace.groupby('Team')['Team_Avg_Relative_Pace'] \
                                              .transform(lambda x: x.ewm(span=4, adjust=False).mean())

df_race_final = pd.merge(
    df_race,
    team_race_pace[['Team', 'Location', 'Year', 'Team_Pace_EMA']],
    on=['Team', 'Location', 'Year'],
    how='left'
)

In [62]:
team_race_pace.groupby(['Team', 'Location']).size()

Team      Location  
Alpine    Austin        4
          Baku          4
          Barcelona     4
          Budapest      3
          Imola         3
                       ..
Williams  Spielberg     4
          Suzuka        4
          São Paulo     3
          Yas Island    3
          Zandvoort     4
Length: 240, dtype: int64

In [41]:
import pandas as pd
import numpy as np

# --- ASSUMPTION: Load your fully consolidated Master Race Pace Data ---
# master_race_pace_df = pd.read_csv('your_master_race_pace_data.csv')

# --- 1. Calculate LapTime_Relative (Required for a meaningful pace score) ---
# First, calculate the mean lap time for *each race event* to normalize
location_year_mean = master_race_pace_df.groupby(['Location', 'Year'])['LapTime'].mean().rename('LapTime_Race_Mean')
master_race_pace_df = master_race_pace_df.merge(location_year_mean, on=['Location', 'Year'])

# Calculate the Relative Lap Time (A negative value is faster)
master_race_pace_df['LapTime_Relative'] = master_race_pace_df['LapTime'] - master_race_pace_df['LapTime_Race_Mean']


# --- 2. Define Aggregation Functions for Driver Score ---
driver_scores = master_race_pace_df.groupby(['Driver', 'Year']).agg(
    # The primary performance score: Mean of relative lap times.
    # A negative value means the driver was, on average, faster than the field.
    Driver_Relative_Pace=('LapTime_Relative', 'mean'), 
    
    # Consistency Index (Idea: Standard deviation of relative pace)
    # Lower value means the driver is more consistent
    Driver_Consistency_Index=('LapTime_Relative', 'std'),
    
    # Count of valid laps driven (a measure of data quality/completeness)
    Total_Valid_Laps=('LapTime', 'count'), 
    
    # Average environmental conditions experienced (useful for context, not a score)
    Avg_TrackTemp=('TrackTemp', 'mean'),
    
    # Keep the latest Team for the merge key (Crucial for Teammate Gap calculation later)
    Current_Team=('Team', lambda x: x.iloc[-1]) 
    
).reset_index()


# --- 3. Preview the Result ---
# This dataframe now provides one row per Driver per Year, 
# summarizing their average performance and consistency across the season.

print("--- Driver Seasonal Performance Scores Head ---")
print(driver_scores.head())

# Save this for later merging into your qualifying data
# driver_scores.to_csv('driver_seasonal_scores.csv', index=False)

NameError: name 'master_race_pace_df' is not defined

In [58]:
rf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74026 entries, 0 to 74025
Data columns (total 38 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Time             74026 non-null  object 
 1   Driver           74026 non-null  object 
 2   DriverNumber     74026 non-null  int64  
 3   Team             74026 non-null  object 
 4   LapTime          74026 non-null  float64
 5   LapNumber        74026 non-null  float64
 6   Stint            74026 non-null  float64
 7   Sector1Time      74026 non-null  float64
 8   Sector2Time      74026 non-null  float64
 9   Sector3Time      74026 non-null  float64
 10  Compound         74026 non-null  object 
 11  TyreLife         74026 non-null  float64
 12  FreshTyre        74026 non-null  bool   
 13  Position         74026 non-null  float64
 14  AirTemp          74026 non-null  float64
 15  TrackTemp        74026 non-null  float64
 16  Rainfall         74026 non-null  bool   
 17  Humidity    

In [3]:
FILE_CIRCUITS = 'f1_unique_circuits_complete.csv'

circuit_info = pd.read_csv(FILE_CIRCUITS, delimiter=",")

In [7]:
session = ff1.get_session(2022, 'Sakhir', 'R')
session.load()

core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['16', '55', '44', '63', '20', '77', '31', '22', '14', '24', '47', '18', '23', '3', '4', '6', '27', '11', '1', '10']


In [8]:
session.event

RoundNumber                                                   1
Country                                                 Bahrain
Location                                                 Sakhir
OfficialEventName    FORMULA 1 GULF AIR BAHRAIN GRAND PRIX 2022
EventDate                                   2022-03-20 00:00:00
EventName                                    Bahrain Grand Prix
EventFormat                                        conventional
Session1                                             Practice 1
Session1Date                          2022-03-18 15:00:00+03:00
Session1DateUtc                             2022-03-18 12:00:00
Session2                                             Practice 2
Session2Date                          2022-03-18 18:00:00+03:00
Session2DateUtc                             2022-03-18 15:00:00
Session3                                             Practice 3
Session3Date                          2022-03-19 15:00:00+03:00
Session3DateUtc                         

In [5]:
session.event['EventDate']

Timestamp('2022-09-11 00:00:00')

In [None]:
"""year = 2022

ff1.set_log_level('WARNING')

for location in circuit_info['Location'][23]:
    try:
        print(location + "    " + str(year))
        session = ff1.get_session(year, location, 'R')
        session.load()
    except Exception as e:
        print(f"Skipped {location} {year}: {e}")"""

Y    2022




a    2022




s    2022




     2022




I    2022




s    2022




l    2022




a    2022




n    2022




d    2022


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x000001BB03D0B5B0>>
Traceback (most recent call last):
  File "c:\Users\lucam\anaconda3\envs\myenv_f1\lib\site-packages\ipykernel\ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


In [20]:
run_config = [
    (2022, 2, None),  # Per [2:]
    (2023, 1, None),  # Per [1:]
    (2024, 1, None),  # Per [1:]
    (2024, 1, 20)    # Per [1:20]
]

for YEAR, START, END in run_config:
    locations = ff1.get_event_schedule(YEAR)['Location'][START:END]

    all_races = []

    for LOCATION in locations:
        try:
            if LOCATION == 'Miami Gardens':
                LOCATION = 'Miami'

            session = ff1.get_session(YEAR, LOCATION, 'R')
            session.load()
            event_date = session.event['EventDate']
            laps = session.laps

            laps_filtered = laps.copy()
            # 1. Remove laps marked as inaccurate or where LapTime is missing
            laps_filtered = laps_filtered.loc[
                (laps_filtered['IsAccurate'] == True) &
                (laps_filtered['LapTime'].notnull())
            ]

            # 2. Remove in-laps and out-laps (around a pit stop)
            laps_filtered = laps_filtered.loc[
                (laps_filtered['PitInTime'].isnull()) &
                (laps_filtered['PitOutTime'].isnull())
            ]

            # 3. Remove laps run under Safety Car, VSC, or Red Flag
            # '1' means Green Flag / Clear Track
            laps_filtered = laps_filtered.loc[laps_filtered['TrackStatus'] == '1']

            laps_filtered = laps_filtered.copy()
            laps_filtered['Location'] = LOCATION
            laps_filtered['Year'] = YEAR
            laps_filtered['EventDate'] = session.event['EventDate']

            # Filter laps that are no more than 108% of the fastest lap time
            # This is a common tolerance used in racing for pace comparison
            # laps_filtered = laps_filtered.loc[laps_filtered['LapTime'] < fastest_lap_time * 1.08]

            laps_with__circuits = pd.merge(laps_filtered, circuit_info, on='Location', how='left')

            weather = session.weather_data
            laps_with_weather = pd.merge_asof(
                laps_with__circuits.sort_values('LapStartTime'),
                weather.sort_values('Time'),
                left_on='LapStartTime',
                right_on='Time',
                direction = 'backward'    # use last known weather measurement
            )

            laps_with_weather = laps_with_weather.rename(columns={'Time_x': 'Time'})
            laps_with_weather = laps_with_weather.rename(columns={'Time_y': 'WeatherTime'})
            laps_with_weather['EventDate'] = event_date

            FINAL_COLS = [
                'Time', 'Driver', 'DriverNumber', 'Team', 'LapTime', 'LapNumber', 'Stint',
                'Sector1Time', 'Sector2Time', 'Sector3Time', 'Compound', 'TyreLife', 'FreshTyre', 'Position',
                'AirTemp', 'TrackTemp', 'Rainfall', 'Humidity', 'WindSpeed', 'WindDirection', 
                'TrackLenght', 'NumTurns','NumLaps', 'RaceDistance', 'Traction', 'AsphaltGrip', 
                'AsphaltAbrasion', 'TrackEvolution', 'TyreStress', 'Braking', 'LateralLoad', 'Downforce', 
                'Soft', 'Medium', 'Hard', 'Location', 'Country', 'Year', 'EventDate'
            ]

            race_features = laps_with_weather[FINAL_COLS]

            race_features['LapTime'] = race_features['LapTime'].dt.total_seconds()
            race_features['Sector1Time'] = race_features['Sector1Time'].dt.total_seconds()
            race_features['Sector2Time'] = race_features['Sector2Time'].dt.total_seconds()
            race_features['Sector3Time'] = race_features['Sector3Time'].dt.total_seconds()

            race_features_sorted = (
                race_features
                .sort_values(['Driver', 'Time'])
                .groupby('Driver', group_keys=True)
                .apply(lambda x: x)
            )
            
            all_races.append(race_features_sorted)

        except Exception as e:
                print(f"Skipped {LOCATION} {YEAR}: {e}")

    all_features_df = pd.concat(all_races, ignore_index=True)

    if YEAR == 2025:
        all_features_df['Stint'] = all_features_df['Stint'].fillna(1.0)
        all_features_df['Compound'] = all_features_df.groupby(['Driver', 'Stint'])['Compound'].bfill().ffill()
        all_features_df.loc[all_features_df['Stint'] == 1.0, 'TyreLife'] = all_features_df['LapNumber']

    filename = 'features_race_' + str(YEAR) + '.csv'
    all_features_df.to_csv(filename, index=False)


core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['16', '55', '44', '63', '20', '77', '31', '22', '14', '24', '47', '18', '23', '3', '4', '6', '27', '11', '1', '10']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

Skipped Le Castellet 2022: The data you are trying to access has not been loaded yet. See `Session.load`


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '44', '63', '55', '11', '16', '4', '14', '31', '5', '18', '10', '24', '47', '3', '20', '23', '6', '22', '77']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  race_features['LapTime'] = race_features['LapTime'].dt.total_seconds()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-c

Skipped Melbourne 2024: The data you are trying to access has not been loaded yet. See `Session.load`


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '11', '55', '16', '4', '14', '63', '81', '44', '22', '27', '18', '20', '77', '31', '10', '2', '24', '3', '23']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  race_features['LapTime'] = race_features['LapTime'].dt.total_seconds()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-

Skipped Melbourne 2024: The data you are trying to access has not been loaded yet. See `Session.load`


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '11', '55', '16', '4', '14', '63', '81', '44', '22', '27', '18', '20', '77', '31', '10', '2', '24', '3', '23']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  race_features['LapTime'] = race_features['LapTime'].dt.total_seconds()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-

In [19]:
YEAR = 2022
locations = ff1.get_event_schedule(YEAR)['Location'][2:6]

"""
import logging
logging.getLogger('fastf1').setLevel(logging.WARNING)
"""

all_races = []

for LOCATION in locations:
    try:
        if LOCATION == 'Miami Gardens':
             LOCATION = 'Miami'

        session = ff1.get_session(YEAR, LOCATION, 'R')
        session.load()
        event_date = session.event['EventDate']
        print('AAAAAAAAAAA-----' + str(event_date))
        laps = session.laps

        laps_filtered = laps.copy()
        # 1. Remove laps marked as inaccurate or where LapTime is missing
        laps_filtered = laps_filtered.loc[
            (laps_filtered['IsAccurate'] == True) &
            (laps_filtered['LapTime'].notnull())
        ]

        # 2. Remove in-laps and out-laps (around a pit stop)
        laps_filtered = laps_filtered.loc[
            (laps_filtered['PitInTime'].isnull()) &
            (laps_filtered['PitOutTime'].isnull())
        ]

        # 3. Remove laps run under Safety Car, VSC, or Red Flag
        # '1' means Green Flag / Clear Track
        laps_filtered = laps_filtered.loc[laps_filtered['TrackStatus'] == '1']

        laps_filtered = laps_filtered.copy()
        laps_filtered['Location'] = LOCATION
        laps_filtered['Year'] = YEAR

        # Filter laps that are no more than 108% of the fastest lap time
        # This is a common tolerance used in racing for pace comparison
        # laps_filtered = laps_filtered.loc[laps_filtered['LapTime'] < fastest_lap_time * 1.08]

        laps_with__circuits = pd.merge(laps_filtered, circuit_info, on='Location', how='left')

        weather = session.weather_data
        laps_with_weather = pd.merge_asof(
            laps_with__circuits.sort_values('LapStartTime'),
            weather.sort_values('Time'),
            left_on='LapStartTime',
            right_on='Time',
            direction = 'backward'    # use last known weather measurement
        )

        laps_with_weather = laps_with_weather.rename(columns={'Time_x': 'Time'})
        laps_with_weather = laps_with_weather.rename(columns={'Time_y': 'WeatherTime'})
        laps_with_weather['EventDate'] = event_date

        FINAL_COLS = [
            'Time', 'Driver', 'DriverNumber', 'Team', 'LapTime', 'LapNumber', 'Stint',
            'Sector1Time', 'Sector2Time', 'Sector3Time', 'Compound', 'TyreLife', 'FreshTyre', 'Position',
            'AirTemp', 'TrackTemp', 'Rainfall', 'Humidity', 'WindSpeed', 'WindDirection', 
            'TrackLenght', 'NumTurns','NumLaps', 'RaceDistance', 'Traction', 'AsphaltGrip', 
            'AsphaltAbrasion', 'TrackEvolution', 'TyreStress', 'Braking', 'LateralLoad', 'Downforce', 
            'Soft', 'Medium', 'Hard', 'Location', 'Country', 'Year', 'EventDate'
        ]

        race_features = laps_with_weather[FINAL_COLS]

        race_features['LapTime'] = race_features['LapTime'].dt.total_seconds()
        race_features['Sector1Time'] = race_features['Sector1Time'].dt.total_seconds()
        race_features['Sector2Time'] = race_features['Sector2Time'].dt.total_seconds()
        race_features['Sector3Time'] = race_features['Sector3Time'].dt.total_seconds()


        race_features_sorted = (
            race_features
            .sort_values(['Driver', 'Time'])
            .groupby('Driver', group_keys=True)
            .apply(lambda x: x)
        )
        
        all_races.append(race_features_sorted)
    except Exception as e:
            print(f"Skipped {LOCATION} {YEAR}: {e}")

all_features_df = pd.concat(all_races, ignore_index=True)

filename = 'features_race__test_' + str(YEAR) + '.csv'
all_features_df.to_csv(filename, index=False)


core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['16', '55', '44', '63', '20', '77', '31', '22', '14', '24', '47', '18', '23', '3', '4', '6', '27', '11', '1', '10']
A value is trying to be set on a copy of a slice from a DataFr

AAAAAAAAAAA-----2022-03-20 00:00:00


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '16', '55', '11', '63', '31', '4', '10', '20', '44', '24', '27', '18', '23', '77', '14', '3', '6', '22', '47']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  race_features['LapTime'] = race_features['LapTime'].dt.total_seconds()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-

AAAAAAAAAAA-----2022-03-27 00:00:00


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['16', '11', '63', '44', '4', '3', '31', '77', '10', '23', '24', '18', '47', '20', '22', '6', '14', '1', '5', '55']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  race_features['LapTime'] = race_features['LapTime'].dt.total_seconds()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-c

AAAAAAAAAAA-----2022-04-10 00:00:00


req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '11', '4', '63', '77', '16', '22', '5', '20', '18', '23', '10', '44', '31', '24', '6', '47', '3', '14', '55']


AAAAAAAAAAA-----2022-04-24 00:00:00


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  race_features['LapTime'] = race_features['LapTime'].dt.total_seconds()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  race_features['Sector1Time'] = race_features['Sector1Time'].dt.total_seconds()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  race_features['Sector2Time'] = race_features['Sector2Ti

In [21]:
features_race = pd.read_csv('features_race_2025.csv')

In [22]:
features_race.isna().sum()

Time               0
Driver             0
DriverNumber       0
Team               0
LapTime            0
LapNumber          0
Stint              0
Sector1Time        0
Sector2Time        0
Sector3Time        0
Compound           0
TyreLife           0
FreshTyre          0
Position           0
AirTemp            0
TrackTemp          0
Rainfall           0
Humidity           0
WindSpeed          0
WindDirection      0
TrackLenght        0
NumTurns           0
NumLaps            0
RaceDistance       0
Traction           0
AsphaltGrip        0
AsphaltAbrasion    0
TrackEvolution     0
TyreStress         0
Braking            0
LateralLoad        0
Downforce          0
Soft               0
Medium             0
Hard               0
Location           0
Country            0
Year               0
EventDate          0
dtype: int64

In [96]:
features_race.value_counts('Location')

Location
Budapest             1289
Monaco               1177
Montréal             1163
Marina Bay           1124
Zandvoort            1005
Suzuka                997
Spielberg             995
Barcelona             989
Sakhir                951
Shanghai              945
Imola                 942
Austin                926
Monza                 897
Miami Gardens         861
Jeddah                810
Baku                  796
Spa-Francorchamps     747
Melbourne             551
Silverstone           441
Name: count, dtype: int64

In [34]:
df = pd.read_csv('features_race_2025.csv')

In [35]:
df.isna().sum()

Time               0
Driver             0
DriverNumber       0
Team               0
LapTime            0
LapNumber          0
Stint              0
Sector1Time        0
Sector2Time        0
Sector3Time        0
Compound           0
TyreLife           0
FreshTyre          0
Position           0
AirTemp            0
TrackTemp          0
Rainfall           0
Humidity           0
WindSpeed          0
WindDirection      0
TrackLenght        0
NumTurns           0
NumLaps            0
RaceDistance       0
Traction           0
AsphaltGrip        0
AsphaltAbrasion    0
TrackEvolution     0
TyreStress         0
Braking            0
LateralLoad        0
Downforce          0
Soft               0
Medium             0
Hard               0
Location           0
Country            0
Year               0
EventDate          0
dtype: int64

In [30]:
df['Stint'] = df['Stint'].fillna(1.0)

# Group by both Driver and Stint
# .bfill() fills gaps from the next valid row *in that stint*
# .ffill() fills gaps from the previous valid row *in that stint*
df['Compound'] = df.groupby(['Driver', 'Stint'])['Compound'].bfill().ffill()


In [31]:
df.loc[df['Stint'] == 1.0, 'TyreLife'] = df['LapNumber']


In [None]:
# df.to_csv('features_race_2025.csv', index=False)