In [1]:
# Importing all packages
import os
import fastf1
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
import shutil

# On-disk FastF1 cache: sessions that were already fetched are read from this
# folder instead of being downloaded again.
custom_cache_folder = "f1_cache"
os.makedirs(name=custom_cache_folder, exist_ok=True)
fastf1.Cache.enable_cache(custom_cache_folder)

# Driver info table (merged in later on "Abbreviation" to add driver experience)
experience = pd.read_csv("Dataset/DriverInfo.csv")

# Race to analyse - hard-coded for now, intended to become a user input later
circuit_name = "Abu Dhabi"

# Function that gets race data
def get_race_data(year, circuit):
    """Retrieve race results, fastest laps and mean weather for one race.

    Loads the race ('R') session for ``year``/``circuit`` through FastF1 and
    returns one row per driver that appears both in the session results and
    in the lap data (inner merge on "Abbreviation").

    Args:
        year: Season, passed to ``fastf1.get_session``.
        circuit: Event identifier, passed to ``fastf1.get_session``.

    Returns:
        DataFrame with the columns FullName, Abbreviation, TeamName,
        GridPosition, Position, Status, Year, Humidity, Temperature, Rain and
        FastestLap. FastestLap is the driver's fastest lap in seconds and is
        missing when no timed lap is available. Humidity, Temperature and
        Rain are session-wide means of the weather samples, repeated on every
        row. An empty DataFrame is returned (and the error printed) when the
        session cannot be loaded or processed.
    """
    try:
        session = fastf1.get_session(year, circuit, 'R')
        session.load()

        weather_data = session.weather_data
        laps = session.laps
        fastest_laps = []

        for driver in laps["Driver"].unique():
            # Stays None unless a valid fastest lap time is found below.
            fastest_lap_seconds = None
            try:
                driver_laps = laps.pick_drivers(driver)
                if not driver_laps.empty:
                    fastest_lap = driver_laps.pick_fastest()
                    # pick_fastest() may return None when the driver has no
                    # timed lap; guard explicitly instead of relying on the
                    # catch-all below (which logged a spurious error).
                    if fastest_lap is not None and not pd.isna(fastest_lap['LapTime']):
                        fastest_lap_seconds = fastest_lap['LapTime'].total_seconds()
            except Exception as e:
                # Best effort: one broken driver must not discard the whole race.
                print(f"Error processing driver {driver} in {year}: {e}")
                fastest_lap_seconds = None
            fastest_laps.append([driver, fastest_lap_seconds])

        fastest_lap_df = pd.DataFrame(fastest_laps, columns=["Abbreviation", "FastestLap"])

        # .copy() gives an independent frame, so the column assignments below
        # neither trigger SettingWithCopyWarning nor touch session.results.
        results = session.results[
            ["FullName", "Abbreviation", "TeamName", "GridPosition", "Position", "Status"]
        ].copy()
        results["Year"] = year
        results["Humidity"] = weather_data["Humidity"].mean()
        results["Temperature"] = weather_data["AirTemp"].mean()
        results["Rain"] = weather_data["Rainfall"].mean()

        return results.merge(fastest_lap_df, on="Abbreviation")

    except Exception as e:
        print(f"Error getting race data for {year} {circuit}: {e}")
        return pd.DataFrame()

# Function that gets qualifying data
def get_qualifying_data(year, circuit):
    """Retrieve qualifying position, best lap time and speed trap per driver.

    Loads the qualifying ('Q') session for ``year``/``circuit`` through FastF1
    and returns one row per driver that appears both in the session results
    and in the lap data (inner merge on "Abbreviation").

    Args:
        year: Season, passed to ``fastf1.get_session``.
        circuit: Event identifier, passed to ``fastf1.get_session``.

    Returns:
        DataFrame with the columns Abbreviation, QualifyingPosition,
        QualifyingTime, SpeedTrapQualy and Year. QualifyingTime is the
        driver's fastest lap in seconds and SpeedTrapQualy is the "SpeedI2"
        value of that same lap; both are missing when the driver has no
        usable lap. An empty DataFrame is returned (and the error printed)
        when the session cannot be loaded or processed.
    """
    try:
        session = fastf1.get_session(year, circuit, 'Q')
        session.load()

        laps = session.laps
        qualifying_data = []

        for driver in laps["Driver"].unique():
            # Defaults used whenever the driver has no usable fastest lap.
            qualifying_time = None
            speed_trap = None
            try:
                driver_laps = laps.pick_drivers(driver)
                # pick_fastest() may return None when no timed lap exists;
                # treat that like "no laps" instead of letting it raise into
                # the catch-all below (which logged a spurious error).
                fastest_lap = None if driver_laps.empty else driver_laps.pick_fastest()
                if fastest_lap is not None:
                    # Qualifying time
                    if 'LapTime' in fastest_lap and not pd.isna(fastest_lap['LapTime']):
                        qualifying_time = fastest_lap['LapTime'].total_seconds()
                    # Speed trap
                    if 'SpeedI2' in fastest_lap:
                        speed_trap = fastest_lap['SpeedI2']
            except Exception as e:
                # Best effort: one broken driver must not discard the session.
                print(f"Error processing driver {driver} in {year}: {e}")
                qualifying_time = None
                speed_trap = None
            qualifying_data.append([driver, qualifying_time, speed_trap])

        qualifying_df = pd.DataFrame(
            qualifying_data,
            columns=["Abbreviation", "QualifyingTime", "SpeedTrapQualy"]
        )

        # Get qualifying position
        results = session.results[["Abbreviation", "Position"]].rename(
            columns={"Position": "QualifyingPosition"}
        )

        # Merge with qualifying performance data
        qualifying_results = results.merge(qualifying_df, on="Abbreviation")
        qualifying_results["Year"] = year

        return qualifying_results

    except Exception as e:
        print(f"Error getting qualifying data for {year} {circuit}: {e}")
        return pd.DataFrame()

# Seasons used to build the data set (one race + one qualifying session each).
seasons = (2022, 2023, 2024)

# Get data for all years. The per-year names are kept for any cell that still
# refers to them; all race sessions are loaded first, then all qualifyings.
race_2022, race_2023, race_2024 = [get_race_data(year, circuit_name) for year in seasons]
quali_2022, quali_2023, quali_2024 = [get_qualifying_data(year, circuit_name) for year in seasons]

# A failed load returns an empty DataFrame. Drop those before concatenating so
# they cannot influence the resulting dtypes, and fail with a clear message if
# nothing could be loaded (the merge below would otherwise raise a KeyError).
race_frames = [frame for frame in (race_2022, race_2023, race_2024) if not frame.empty]
quali_frames = [frame for frame in (quali_2022, quali_2023, quali_2024) if not frame.empty]
if not race_frames or not quali_frames:
    raise RuntimeError(
        f"No race or qualifying data could be loaded for {circuit_name!r} in seasons {seasons}"
    )

# Combine all data
all_races = pd.concat(race_frames, ignore_index=True)
all_qualis = pd.concat(quali_frames, ignore_index=True)

# Merge race and qualifying data (left join: every race row is kept even when
# the driver has no qualifying row for that year)
merged_data = pd.merge(
    all_races,
    all_qualis,
    on=["Abbreviation", "Year"],
    how="left"
)

# Add driver experience information (inner join: drivers that are missing from
# the driver info table are dropped)
final_data = merged_data.merge(experience, on=["Abbreviation"])

# Calculate TimeRatio (QualifyingTime / FastestLap), vectorised.
# The ratio is NaN when either time is missing or FastestLap is 0.
qualifying_seconds = pd.to_numeric(final_data["QualifyingTime"], errors="coerce")
fastest_lap_seconds = pd.to_numeric(final_data["FastestLap"], errors="coerce")
final_data["TimeRatio"] = qualifying_seconds / fastest_lap_seconds.where(fastest_lap_seconds != 0)

# Clean up the final DataFrame
final_data = final_data[[
    'Year', 'FullName', 'Abbreviation', 'TeamName',
    'QualifyingPosition', 'GridPosition', 'Position',
    'QualifyingTime', 'FastestLap', 'TimeRatio', 'SpeedTrapQualy',
    'Temperature', 'Humidity', 'Rain', 'Status', 'NumberOfRaces'
    # Add any other columns from Dataset/DriverInfo.csv that you need
]]

final_data

core           INFO 	Loading data for Abu Dhabi Grand Prix - Race [v3.4.4]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '16', '11', '55', '63', '4', '31', '18', '3', '5', '22', '24', '23', '10', '77', '47', '20', '44', '6', '14']
A value is trying to be set on a copy of a slice from a DataF

Unnamed: 0,Year,FullName,Abbreviation,TeamName,QualifyingPosition,GridPosition,Position,QualifyingTime,FastestLap,TimeRatio,SpeedTrapQualy,Temperature,Humidity,Rain,Status,NumberOfRaces
0,2022,Max Verstappen,VER,Red Bull Racing,1.0,1.0,1.0,83.824,89.392,0.937713,323.0,28.575484,61.683871,0.0,Finished,211
1,2022,Charles Leclerc,LEC,Ferrari,3.0,3.0,2.0,84.092,89.719,0.937282,318.0,28.575484,61.683871,0.0,Finished,149
2,2022,Sergio Perez,PER,Red Bull Racing,2.0,2.0,3.0,84.052,88.972,0.944702,325.0,28.575484,61.683871,0.0,Finished,281
3,2022,Carlos Sainz,SAI,Ferrari,4.0,4.0,4.0,84.242,88.879,0.947828,321.0,28.575484,61.683871,0.0,Finished,208
4,2022,George Russell,RUS,Mercedes,6.0,6.0,5.0,84.511,88.836,0.951315,319.0,28.575484,61.683871,0.0,Finished,130
5,2022,Lando Norris,NOR,McLaren,7.0,7.0,6.0,84.769,88.391,0.959023,319.0,28.575484,61.683871,0.0,Finished,130
6,2022,Esteban Ocon,OCO,Alpine,8.0,8.0,7.0,84.83,89.333,0.949593,318.0,28.575484,61.683871,0.0,Finished,158
7,2022,Lance Stroll,STR,Aston Martin,14.0,14.0,8.0,85.359,89.62,0.952455,312.0,28.575484,61.683871,0.0,Finished,168
8,2022,Daniel Ricciardo,RIC,McLaren,10.0,13.0,9.0,85.045,90.785,0.936774,314.0,28.575484,61.683871,0.0,Finished,258
9,2022,Sebastian Vettel,VET,Aston Martin,9.0,9.0,10.0,84.961,90.312,0.94075,314.0,28.575484,61.683871,0.0,Finished,300


Unnamed: 0,Year,FullName,Abbreviation,TeamName,QualifyingPosition,GridPosition,Position,QualifyingTime,FastestLap,TimeRatio,SpeedTrapQualy,Temperature,Humidity,Rain,Status
0,2022,Max Verstappen,VER,Red Bull Racing,1.0,1.0,1.0,83.824,89.392,0.937713,323.0,28.575484,61.683871,0.0,Finished
1,2023,Max Verstappen,VER,Red Bull Racing,1.0,1.0,1.0,83.445,86.993,0.959215,324.0,26.962821,51.0,0.0,Finished
2,2024,Max Verstappen,VER,Red Bull Racing,5.0,4.0,6.0,82.945,87.765,0.945081,325.0,26.768243,51.445946,0.0,Finished
3,2022,Charles Leclerc,LEC,Ferrari,3.0,3.0,2.0,84.092,89.719,0.937282,318.0,28.575484,61.683871,0.0,Finished
4,2023,Charles Leclerc,LEC,Ferrari,2.0,2.0,2.0,83.584,88.199,0.947675,322.0,26.962821,51.0,0.0,Finished
5,2024,Charles Leclerc,LEC,Ferrari,14.0,19.0,3.0,83.302,88.018,0.94642,322.0,26.768243,51.445946,0.0,Finished
6,2022,Sergio Perez,PER,Red Bull Racing,2.0,2.0,3.0,84.052,88.972,0.944702,325.0,28.575484,61.683871,0.0,Finished
7,2023,Sergio Perez,PER,Red Bull Racing,9.0,9.0,4.0,84.116,87.493,0.961403,324.0,26.962821,51.0,0.0,Finished
8,2024,Sergio Perez,PER,Red Bull Racing,10.0,10.0,20.0,83.264,,,325.0,26.768243,51.445946,0.0,Collision
9,2022,Carlos Sainz,SAI,Ferrari,4.0,4.0,4.0,84.242,88.879,0.947828,321.0,28.575484,61.683871,0.0,Finished


## Encoding string type features

In [7]:
from sklearn.preprocessing import LabelEncoder

# One encoder per string column; both are kept so the integer codes can be
# decoded again later with inverse_transform (e.g. for the prediction table).
label_encoder_abbr = LabelEncoder()
label_encoder_team = LabelEncoder()

# assign() returns a new frame, so final_data keeps its original string labels.
final_sessions_encoded = final_data.assign(
    Abbreviation=label_encoder_abbr.fit_transform(final_data["Abbreviation"]),
    TeamName=label_encoder_team.fit_transform(final_data["TeamName"]),
)

# Label -> code lookup for drivers
driver_mapping = {
    label: code
    for label, code in zip(
        label_encoder_abbr.classes_,
        label_encoder_abbr.transform(label_encoder_abbr.classes_),
    )
}
print("Driver Encoding Mapping:", driver_mapping)

# Label -> code lookup for teams
team_mapping = {
    label: code
    for label, code in zip(
        label_encoder_team.classes_,
        label_encoder_team.transform(label_encoder_team.classes_),
    )
}
print("Team Encoding Mapping:", team_mapping)

# Missing values per column before imputation
print(final_sessions_encoded.isnull().sum())

# Impute the timing columns with their own column mean
final_sessions_encoded = final_sessions_encoded.fillna({
    column: final_sessions_encoded[column].mean()
    for column in ["QualifyingTime", "FastestLap", "TimeRatio", "SpeedTrapQualy"]
})

# Missing values per column after imputation (expected to be all zeros)
print(final_sessions_encoded.isnull().sum())

final_sessions_encoded


Driver Encoding Mapping: {'ALB': np.int64(0), 'ALO': np.int64(1), 'BOT': np.int64(2), 'COL': np.int64(3), 'DOO': np.int64(4), 'GAS': np.int64(5), 'HAM': np.int64(6), 'HUL': np.int64(7), 'LAT': np.int64(8), 'LAW': np.int64(9), 'LEC': np.int64(10), 'MAG': np.int64(11), 'NOR': np.int64(12), 'OCO': np.int64(13), 'PER': np.int64(14), 'PIA': np.int64(15), 'RIC': np.int64(16), 'RUS': np.int64(17), 'SAI': np.int64(18), 'SAR': np.int64(19), 'STR': np.int64(20), 'TSU': np.int64(21), 'VER': np.int64(22), 'VET': np.int64(23), 'ZHO': np.int64(24)}
Team Encoding Mapping: {'Alfa Romeo': np.int64(0), 'AlphaTauri': np.int64(1), 'Alpine': np.int64(2), 'Aston Martin': np.int64(3), 'Ferrari': np.int64(4), 'Haas F1 Team': np.int64(5), 'Kick Sauber': np.int64(6), 'McLaren': np.int64(7), 'Mercedes': np.int64(8), 'RB': np.int64(9), 'Red Bull Racing': np.int64(10), 'Williams': np.int64(11)}
Year                  0
FullName              0
Abbreviation          0
TeamName              0
QualifyingPosition    0
G

Unnamed: 0,Year,FullName,Abbreviation,TeamName,QualifyingPosition,GridPosition,Position,QualifyingTime,FastestLap,TimeRatio,SpeedTrapQualy,Temperature,Humidity,Rain,Status,NumberOfRaces
0,2022,Max Verstappen,22,10,1.0,1.0,1.0,83.824,89.392,0.937713,323.0,28.575484,61.683871,0.0,Finished,211
1,2022,Charles Leclerc,10,4,3.0,3.0,2.0,84.092,89.719,0.937282,318.0,28.575484,61.683871,0.0,Finished,149
2,2022,Sergio Perez,14,10,2.0,2.0,3.0,84.052,88.972,0.944702,325.0,28.575484,61.683871,0.0,Finished,281
3,2022,Carlos Sainz,18,4,4.0,4.0,4.0,84.242,88.879,0.947828,321.0,28.575484,61.683871,0.0,Finished,208
4,2022,George Russell,17,8,6.0,6.0,5.0,84.511,88.836,0.951315,319.0,28.575484,61.683871,0.0,Finished,130
5,2022,Lando Norris,12,7,7.0,7.0,6.0,84.769,88.391,0.959023,319.0,28.575484,61.683871,0.0,Finished,130
6,2022,Esteban Ocon,13,2,8.0,8.0,7.0,84.83,89.333,0.949593,318.0,28.575484,61.683871,0.0,Finished,158
7,2022,Lance Stroll,20,3,14.0,14.0,8.0,85.359,89.62,0.952455,312.0,28.575484,61.683871,0.0,Finished,168
8,2022,Daniel Ricciardo,16,7,10.0,13.0,9.0,85.045,90.785,0.936774,314.0,28.575484,61.683871,0.0,Finished,258
9,2022,Sebastian Vettel,23,3,9.0,9.0,10.0,84.961,90.312,0.94075,314.0,28.575484,61.683871,0.0,Finished,300


## Scale values for temperature, humidity and fastest lap

In [3]:
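# TODO: scaling is not implemented yet. Check the MAE reported in the model-training section below; if it is too high, try scaling Temperature, Humidity and FastestLap.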


## Creating and training a model

In [8]:
# Open question: to use TimeRatio as a feature, do we have to average each
# driver's ratio (and speed trap) first?
features = [
    'Abbreviation', 'TeamName', 'GridPosition', 'QualifyingTime',
    'Temperature', 'Humidity', 'Rain', 'NumberOfRaces',
]

# The model predicts the final race position. Open question: encode Status too?
target = 'Position'

# Feature matrix and target vector
X, y = final_sessions_encoded[features], final_sessions_encoded[target]

# Hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Fit the gradient boosting regressor on the training split
model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=1,
).fit(X_train, y_train)

# Evaluate on the held-out split
y_pred = model.predict(X_test)
print("Mean Absolute Error : ", mean_absolute_error(y_test, y_pred))

Mean Absolute Error :  3.111354895301212


## Prediction Sample

In [13]:
# Creating a sample dataset for 2025 predictions.
# Drivers and teams are given as labels and encoded with the fitted encoders,
# so the sample no longer depends on the integer codes a particular fit
# happened to assign. transform() raises a ValueError for a label the encoder
# has never seen, which is preferable to silently predicting for another driver.
#
# NOTE(review): the labels below are the decoded equivalents of the integer
# codes that used to be hard-coded here (decoded with the driver/team mappings
# printed by the encoding cell), so the model input is unchanged. The pairings
# look unintended (e.g. VER with AlphaTauri, GAS with Aston Martin), and the
# "Hadjar = 0" experience entry actually belongs to LAW - TODO confirm the
# intended line-up.
data_2025 = {
    "Abbreviation": ["GAS", "VER", "HUL", "LAW", "COL", "DOO", "ALO", "HAM", "LAT", "ALB"],
    "TeamName": [
        "Aston Martin", "AlphaTauri", "Alpine", "Aston Martin", "Alfa Romeo",
        "Alfa Romeo", "Ferrari", "AlphaTauri", "Aston Martin", "Alpine",
    ],
    "GridPosition": [1, 3, 5, 9, 2, 4, 6, 7, 8, 10],  # Qualifying grid positions
    "QualifyingTime": [87.5, 88.0, 88.5, 89.2, 87.8, 88.2, 88.7, 89.0, 89.1, 89.5],  # Lap times (s)
    "Temperature": [30] * 10,  # Constant for the race
    "Humidity": [55] * 10,  # Constant for the race
    "Rain": [0] * 10,  # No rain
    "NumberOfRaces": [120, 340, 110, 0, 150, 180, 370, 200, 210, 50],  # Experience
}

# Convert to DataFrame and encode the string columns with the fitted encoders
X_2025 = pd.DataFrame(data_2025)
X_2025["Abbreviation"] = label_encoder_abbr.transform(X_2025["Abbreviation"])
X_2025["TeamName"] = label_encoder_team.transform(X_2025["TeamName"])

# Predict race positions using the trained model. Selecting `features`
# guarantees the same columns, in the same order, as during training. A
# dedicated name keeps the evaluation cell's y_pred (paired with y_test) intact.
y_pred_2025 = model.predict(X_2025[features])

# Assign predictions back to DataFrame
X_2025["PredictedRacePosition"] = y_pred_2025

# Sort by predicted race position. The regressor returns continuous values
# that may tie, so use a stable sort and derive an explicit, unique rank.
X_2025 = X_2025.sort_values(by="PredictedRacePosition", kind="stable").reset_index(drop=True)
X_2025["PredictedRank"] = range(1, len(X_2025) + 1)
X_2025["Abbreviation"] = label_encoder_abbr.inverse_transform(X_2025["Abbreviation"])

print(X_2025[["PredictedRank", "Abbreviation", "PredictedRacePosition"]])


  Abbreviation  PredictedRacePosition
0          VER               4.459653
1          GAS               7.252732
2          COL               8.246530
3          DOO               8.985866
4          HUL              12.403676
5          HAM              12.866133
6          ALO              13.745952
7          LAT              15.094777
8          LAW              15.628000
9          ALB              16.884563
