In [1]:
# Importing all packages
import os
import fastf1
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
import shutil

# Enable cache memory for faster access
custom_cache_folder = "f1_cache"
os.makedirs(custom_cache_folder, exist_ok=True)
fastf1.Cache.enable_cache(custom_cache_folder)

# Import driver info table containing years of experience for each driver
experience = pd.read_csv("Dataset/DriverInfo.csv")

# Import a race name - Later this will be a user input
circuit_name = "Japanese Grand Prix"

# Function that gets race data
def get_race_data(year, circuit):
    """Retrieves race data including fastest laps and weather information"""
    try:
        session = fastf1.get_session(year, circuit, 'R')
        session.load()
        
        weather_data = session.weather_data
        all_drivers = session.laps["Driver"].unique()
        fastest_laps = []

        for driver in all_drivers:
            try:
                driver_laps = session.laps.pick_drivers(driver)
                if not driver_laps.empty:
                    fastest_lap = driver_laps.pick_fastest()
                    if not pd.isna(fastest_lap['LapTime']):
                        fastest_lap_seconds = fastest_lap['LapTime'].total_seconds()
                        fastest_laps.append([driver, fastest_lap_seconds])
                    else:
                        fastest_laps.append([driver, None])
                else:
                    fastest_laps.append([driver, None])
            except Exception as e:
                print(f"Error processing driver {driver} in {year}: {e}")
                fastest_laps.append([driver, None])

        fastest_lap_df = pd.DataFrame(fastest_laps, columns=["Abbreviation", "FastestLap"])
        
        results = session.results[["FullName", "Abbreviation", "TeamName", "GridPosition", "Position", "Status"]]
        results["Year"] = year
        results["Humidity"] = weather_data["Humidity"].mean()
        results["Temperature"] = weather_data["AirTemp"].mean()
        results["Rain"] = weather_data["Rainfall"].mean()

        results = results.merge(fastest_lap_df, on="Abbreviation")
        return results
    
    except Exception as e:
        print(f"Error getting race data for {year} {circuit}: {e}")
        return pd.DataFrame()

# Function that gets qualifying data
def get_qualifying_data(year, circuit):
    """Retrieves qualifying time and speed trap data"""
    try:
        session = fastf1.get_session(year, circuit, 'Q')
        session.load()

        all_drivers = session.laps["Driver"].unique()
        qualifying_data = []

        for driver in all_drivers:
            try:
                driver_laps = session.laps.pick_drivers(driver)
                if not driver_laps.empty:
                    fastest_lap = driver_laps.pick_fastest()
                    # Qualifying time
                    if 'LapTime' in fastest_lap and not pd.isna(fastest_lap['LapTime']):
                        qualifying_time = fastest_lap['LapTime'].total_seconds()
                    else:
                        qualifying_time = None
                    # Speed trap
                    speed_trap = fastest_lap['SpeedI2'] if 'SpeedI2' in fastest_lap else None
                else:
                    qualifying_time = None
                    speed_trap = None
                
                qualifying_data.append([driver, qualifying_time, speed_trap])
            except Exception as e:
                print(f"Error processing driver {driver} in {year}: {e}")
                qualifying_data.append([driver, None, None])

        qualifying_df = pd.DataFrame(
            qualifying_data,
            columns=["Abbreviation", "QualifyingTime", "SpeedTrapQualy"]
        )
        
        # Get qualifying position
        results = session.results[["Abbreviation", "Position"]]
        results = results.rename(columns={"Position": "QualifyingPosition"})
        
        # Merge with qualifying performance data
        qualifying_results = results.merge(qualifying_df, on="Abbreviation")
        qualifying_results["Year"] = year
        
        return qualifying_results
    
    except Exception as e:
        print(f"Error getting qualifying data for {year} {circuit}: {e}")
        return pd.DataFrame()

# Get data for all years
race_2022 = get_race_data(2022, circuit_name)
race_2023 = get_race_data(2023, circuit_name)
race_2024 = get_race_data(2024, circuit_name)

quali_2022 = get_qualifying_data(2022, circuit_name)
quali_2023 = get_qualifying_data(2023, circuit_name)
quali_2024 = get_qualifying_data(2024, circuit_name)

# Combine all data
all_races = pd.concat([race_2022, race_2023, race_2024], ignore_index=True)
all_qualis = pd.concat([quali_2022, quali_2023, quali_2024], ignore_index=True)

# Merge race and qualifying data
merged_data = pd.merge(
    all_races,
    all_qualis,
    on=["Abbreviation", "Year"],
    how="left"
)

# Add driver experience information
final_data = merged_data.merge(experience, on=["Abbreviation"])

# Calculate TimeRatio (QualifyingTime / FastestLap)
final_data["TimeRatio"] = final_data.apply(
    lambda row: row["QualifyingTime"] / row["FastestLap"] 
    if pd.notna(row["QualifyingTime"]) and pd.notna(row["FastestLap"]) and row["FastestLap"] != 0 
    else None,
    axis=1
)


# Clean up the final DataFrame
final_data = final_data[[
    'Year', 'FullName', 'Abbreviation', 'TeamName', 
    'QualifyingPosition', 'GridPosition', 'Position',
    'QualifyingTime', 'FastestLap', 'TimeRatio', 'SpeedTrapQualy',
    'Temperature', 'Humidity', 'Rain', 'Status','NumberOfRaces'
    # Add any columns from experience.csv that you need
]]

team_mappings = {
    'Alfa Romeo': 'Kick Sauber',
    'AlphaTauri': 'RB'
}

final_data['TeamName'] = final_data['TeamName'].replace(team_mappings)



final_data

core           INFO 	Loading data for Japanese Grand Prix - Race [v3.4.4]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '11', '16', '31', '44', '5', '14', '63', '6', '4', '3', '18', '22', '20', '77', '24', '47', '10', '55', '23']
A value is trying to be set on a copy of a slice from a DataFr

req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '16', '55', '11', '31', '44', '14', '63', '5', '4', '3', '77', '22', '24', '47', '23', '10', '20', '18', '6']
core           INFO 	Loading data for Japanese Grand Prix - Qualifying [v3.4.4]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_inf

Unnamed: 0,Year,FullName,Abbreviation,TeamName,QualifyingPosition,GridPosition,Position,QualifyingTime,FastestLap,TimeRatio,SpeedTrapQualy,Temperature,Humidity,Rain,Status,NumberOfRaces
0,2022,Max Verstappen,VER,Red Bull Racing,1.0,1.0,1.0,89.304,104.911,0.851236,304.0,16.717647,86.211765,0.980392,Finished,211
1,2023,Max Verstappen,VER,Red Bull Racing,1.0,1.0,1.0,88.877,94.183,0.943663,305.0,27.288608,42.316456,0.0,Finished,211
2,2024,Max Verstappen,VER,Red Bull Racing,1.0,1.0,1.0,88.197,93.706,0.94121,297.0,21.69116,43.430939,0.0,Finished,211
3,2022,Sergio Perez,PER,Red Bull Racing,4.0,4.0,2.0,89.709,106.12,0.845354,303.0,16.717647,86.211765,0.980392,Finished,281
4,2023,Sergio Perez,PER,Red Bull Racing,5.0,5.0,19.0,89.65,99.704,0.899162,304.0,27.288608,42.316456,0.0,Collision damage,281
5,2024,Sergio Perez,PER,Red Bull Racing,2.0,2.0,2.0,88.263,93.945,0.939518,300.0,21.69116,43.430939,0.0,Finished,281
6,2022,Charles Leclerc,LEC,Ferrari,2.0,2.0,3.0,89.314,104.489,0.854769,300.0,16.717647,86.211765,0.980392,Finished,149
7,2023,Charles Leclerc,LEC,Ferrari,4.0,4.0,4.0,89.542,96.362,0.929225,303.0,27.288608,42.316456,0.0,Finished,149
8,2024,Charles Leclerc,LEC,Ferrari,8.0,8.0,4.0,88.786,95.044,0.934157,295.0,21.69116,43.430939,0.0,Finished,149
9,2022,Esteban Ocon,OCO,Alpine,5.0,5.0,4.0,90.165,106.559,0.846151,300.0,16.717647,86.211765,0.980392,Finished,158


## Encoding string type features

In [2]:
from sklearn.preprocessing import LabelEncoder

final_sessions_encoded = final_data.copy()

label_encoder_abbr = LabelEncoder()
label_encoder_team = LabelEncoder()

final_sessions_encoded["Abbreviation"] = label_encoder_abbr.fit_transform(final_sessions_encoded["Abbreviation"])
final_sessions_encoded["TeamName"] = label_encoder_team.fit_transform(final_sessions_encoded["TeamName"])

# Mapping for Drivers
driver_mapping = dict(zip(label_encoder_abbr.classes_, label_encoder_abbr.transform(label_encoder_abbr.classes_)))
print("Driver Encoding Mapping:", driver_mapping)

# Mapping for Teams
team_mapping = dict(zip(label_encoder_team.classes_, label_encoder_team.transform(label_encoder_team.classes_)))
print("Team Encoding Mapping:", team_mapping)

# Inverse process - if you need it 
#decoded_driver = label_encoder_abbr.inverse_transform([0, 1, 2])  # Convert 0,1,2 back to names
#decoded_team = label_encoder_team.inverse_transform([0, 1, 2]) 

#print("Decoded Drivers:", decoded_driver)
#print("Decoded Teams:", decoded_team)

# Check for NaN values
print(final_sessions_encoded.isnull().sum())

# Fill NaN values with mean values in those columns
for col in ["QualifyingTime", "FastestLap", "TimeRatio", "SpeedTrapQualy"]:
    final_sessions_encoded[col] = final_sessions_encoded[col].fillna(final_sessions_encoded[col].mean())

# Verify that there are no NaN values
print(final_sessions_encoded.isnull().sum())

mean_speeds = final_sessions_encoded.groupby('FullName')['SpeedTrapQualy'].mean().reset_index()
print("Mean speeds are: ", mean_speeds)
final_sessions_encoded['MeanSpeedTrap'] = final_sessions_encoded.groupby('FullName')['SpeedTrapQualy'].transform('mean')
final_sessions_encoded['MeanTimeRatio'] = final_sessions_encoded.groupby('FullName')['TimeRatio'].transform('mean')

final_sessions_encoded


Driver Encoding Mapping: {'ALB': 0, 'ALO': 1, 'BOT': 2, 'GAS': 3, 'HAM': 4, 'HUL': 5, 'LAT': 6, 'LAW': 7, 'LEC': 8, 'MAG': 9, 'NOR': 10, 'OCO': 11, 'PER': 12, 'PIA': 13, 'RIC': 14, 'RUS': 15, 'SAI': 16, 'SAR': 17, 'STR': 18, 'TSU': 19, 'VER': 20, 'VET': 21, 'ZHO': 22}
Team Encoding Mapping: {'Alpine': 0, 'Aston Martin': 1, 'Ferrari': 2, 'Haas F1 Team': 3, 'Kick Sauber': 4, 'McLaren': 5, 'Mercedes': 6, 'RB': 7, 'Red Bull Racing': 8, 'Williams': 9}
Year                  0
FullName              0
Abbreviation          0
TeamName              0
QualifyingPosition    0
GridPosition          0
Position              0
QualifyingTime        1
FastestLap            4
TimeRatio             5
SpeedTrapQualy        1
Temperature           0
Humidity              0
Rain                  0
Status                0
NumberOfRaces         0
dtype: int64
Year                  0
FullName              0
Abbreviation          0
TeamName              0
QualifyingPosition    0
GridPosition          0
Position

Unnamed: 0,Year,FullName,Abbreviation,TeamName,QualifyingPosition,GridPosition,Position,QualifyingTime,FastestLap,TimeRatio,SpeedTrapQualy,Temperature,Humidity,Rain,Status,NumberOfRaces,MeanSpeedTrap,MeanTimeRatio
0,2022,Max Verstappen,20,8,1.0,1.0,1.0,89.304,104.911,0.851236,304.0,16.717647,86.211765,0.980392,Finished,211,302.0,0.912036
1,2023,Max Verstappen,20,8,1.0,1.0,1.0,88.877,94.183,0.943663,305.0,27.288608,42.316456,0.0,Finished,211,302.0,0.912036
2,2024,Max Verstappen,20,8,1.0,1.0,1.0,88.197,93.706,0.94121,297.0,21.69116,43.430939,0.0,Finished,211,302.0,0.912036
3,2022,Sergio Perez,12,8,4.0,4.0,2.0,89.709,106.12,0.845354,303.0,16.717647,86.211765,0.980392,Finished,281,302.333333,0.894678
4,2023,Sergio Perez,12,8,5.0,5.0,19.0,89.65,99.704,0.899162,304.0,27.288608,42.316456,0.0,Collision damage,281,302.333333,0.894678
5,2024,Sergio Perez,12,8,2.0,2.0,2.0,88.263,93.945,0.939518,300.0,21.69116,43.430939,0.0,Finished,281,302.333333,0.894678
6,2022,Charles Leclerc,8,2,2.0,2.0,3.0,89.314,104.489,0.854769,300.0,16.717647,86.211765,0.980392,Finished,149,299.333333,0.90605
7,2023,Charles Leclerc,8,2,4.0,4.0,4.0,89.542,96.362,0.929225,303.0,27.288608,42.316456,0.0,Finished,149,299.333333,0.90605
8,2024,Charles Leclerc,8,2,8.0,8.0,4.0,88.786,95.044,0.934157,295.0,21.69116,43.430939,0.0,Finished,149,299.333333,0.90605
9,2022,Esteban Ocon,11,0,5.0,5.0,4.0,90.165,106.559,0.846151,300.0,16.717647,86.211765,0.980392,Finished,158,301.666667,0.903162


## Scale values for temperature, humidity and fastest lap

In [3]:
# Check the MAE and if it doesn't look good, try to scale down!


## Creating and training a model

In [4]:
# For a time ration feature, i assume we have to calculate the average of each drivers ratio and speed trap?
features = ['Abbreviation', 'TeamName', 'GridPosition', 'QualifyingTime', 'Temperature', 'Humidity', 'Rain', 'NumberOfRaces','MeanSpeedTrap', 'MeanTimeRatio']

# We are predicting final race position, here we could encode Status as well?
target = 'Position'

# Elementary data sets
X = final_sessions_encoded[features]
y = final_sessions_encoded[target]

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 0)

# Training a model
model = GradientBoostingRegressor(n_estimators = 100, learning_rate = 0.1, random_state = 1)
model.fit(X_train, y_train)

# Prediction
y_pred = model.predict(X_test)
print("Mean Absolute Error : ", mean_absolute_error(y_test,y_pred))

Mean Absolute Error :  5.116588978469221


## Prediction Sample

In [5]:
# Creating a sample dataset for 2025 predictions
data_2025 = {
    "Abbreviation": [5, 22, 7, 9, 3, 4, 1, 6, 8, 0],  # Already encoded driver names
    "TeamName": [3, 1, 2, 3, 0, 0, 4, 1, 3, 2],  # Already encoded team names
    "GridPosition": [1, 3, 5, 9, 2, 4, 6, 7, 8, 10],  # Qualifying grid positions
    "QualifyingTime": [87.5, 88.0, 88.5, 89.2, 87.8, 88.2, 88.7, 89.0, 89.1, 89.5],  # Lap times
    #"FastestLap": [88.1, 88.4, 88.9, 89.7, 88.0, 88.3, 88.9, 89.1, 89.2, 89.7],  # Fastest laps
    #"TimeRatio": [1.007, 1.005, 1.004, 1.005, 1.006, 1.004, 1.003, 1.002, 1.005, 1.006],  # Lap consistency
    #"SpeedTrapQualy": [323.2, 321.8, 320.5, 318.3, 322.1, 321.0, 319.7, 318.9, 319.2, 317.8],  # Top speed
    "Temperature": [30] * 10,  # Constant for the race
    "Humidity": [55] * 10,  # Constant for the race
    "Rain": [0] * 10,  # No rain
    "NumberOfRaces": [120, 340, 110, 0, 150, 180, 370, 200, 210, 50]  # Experience (Hadjar = 0)
}

# Convert to DataFrame
X_2025 = pd.DataFrame(data_2025)

# Predict race positions using trained model
y_pred = model.predict(X_2025)

# Assign predictions back to DataFrame
X_2025["PredictedRacePosition"] = y_pred

# Sort by predicted race position (ensuring ranking is unique)
X_2025 = X_2025.sort_values(by="PredictedRacePosition").reset_index(drop=True)
X_2025["Abbreviation"] = label_encoder_abbr.inverse_transform(X_2025["Abbreviation"])

print(X_2025[["Abbreviation", "PredictedRacePosition"]])


ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- MeanSpeedTrap
- MeanTimeRatio
