In [1]:
# Importing all packages
import os
import fastf1
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
import shutil

# Enable FastF1's on-disk cache so re-running the notebook reuses session data
# that was already fetched. The folder path is relative to the working
# directory and is created on first run.
custom_cache_folder = "f1_cache"
os.makedirs(custom_cache_folder, exist_ok=True)
fastf1.Cache.enable_cache(custom_cache_folder)

# Driver info table. Further down it is joined on "Abbreviation" and its
# "NumberOfRaces" column is kept as the per-driver experience measure.
# NOTE(review): the path is relative, so the notebook must be started from the
# directory that contains "Dataset/" - confirm.
experience = pd.read_csv("Dataset/DriverInfo.csv")

# Event identifier handed to fastf1.get_session for every season below.
# Hard-coded for now; intended to become a user input later.
circuit_name = "Abu Dhabi"

# Function that gets race data
def get_race_data(year, circuit):
    """Build a per-driver race summary for one season's event.

    Loads the race session through FastF1, extracts each driver's fastest lap
    (in seconds) and attaches session-wide weather averages.

    Args:
        year: Season passed to ``fastf1.get_session``.
        circuit: Event identifier passed to ``fastf1.get_session``.

    Returns:
        A DataFrame with the columns ``FullName``, ``Abbreviation``,
        ``TeamName``, ``GridPosition``, ``Position``, ``Status``, ``Year``,
        ``Humidity``, ``Temperature``, ``Rain`` and ``FastestLap``.
        ``FastestLap`` is missing for drivers without a timed lap. Because the
        final merge is an inner join, only drivers that appear in the lap data
        are returned. An empty DataFrame is returned if the session cannot be
        loaded or processed (the error is printed, not raised).
    """
    try:
        session = fastf1.get_session(year, circuit, 'R')
        session.load()

        weather_data = session.weather_data
        laps = session.laps
        fastest_laps = []

        for driver in laps["Driver"].unique():
            fastest_lap_seconds = None
            try:
                driver_laps = laps.pick_drivers(driver)
                # pick_fastest() may return None when the driver has no valid
                # timed lap; treat that as "no fastest lap" rather than letting
                # it surface as a misleading processing error.
                fastest_lap = None if driver_laps.empty else driver_laps.pick_fastest()
                if fastest_lap is not None and not pd.isna(fastest_lap['LapTime']):
                    fastest_lap_seconds = fastest_lap['LapTime'].total_seconds()
            except Exception as e:
                # Best effort: keep the driver in the table with no lap time.
                print(f"Error processing driver {driver} in {year}: {e}")
            fastest_laps.append([driver, fastest_lap_seconds])

        fastest_lap_df = pd.DataFrame(fastest_laps, columns=["Abbreviation", "FastestLap"])

        # Copy the column selection so the assignments below write to an
        # independent frame (avoids pandas' SettingWithCopyWarning and any
        # ambiguity about whether session.results is modified).
        results = session.results[
            ["FullName", "Abbreviation", "TeamName", "GridPosition", "Position", "Status"]
        ].copy()
        results["Year"] = year
        # Weather is summarised as one session-wide mean per measurement.
        results["Humidity"] = weather_data["Humidity"].mean()
        results["Temperature"] = weather_data["AirTemp"].mean()
        results["Rain"] = weather_data["Rainfall"].mean()

        return results.merge(fastest_lap_df, on="Abbreviation")

    except Exception as e:
        print(f"Error getting race data for {year} {circuit}: {e}")
        return pd.DataFrame()

# Function that gets qualifying data
def get_qualifying_data(year, circuit):
    """Build a per-driver qualifying summary for one season's event.

    Loads the qualifying session through FastF1 and, for each driver, takes the
    lap time (in seconds) and the ``SpeedI2`` reading of their fastest lap.

    Args:
        year: Season passed to ``fastf1.get_session``.
        circuit: Event identifier passed to ``fastf1.get_session``.

    Returns:
        A DataFrame with the columns ``Abbreviation``, ``QualifyingPosition``,
        ``QualifyingTime``, ``SpeedTrapQualy`` and ``Year``. Time and speed are
        missing for drivers without a usable fastest lap. Because the merge is
        an inner join, only drivers that appear in the lap data are returned.
        An empty DataFrame is returned if the session cannot be loaded or
        processed (the error is printed, not raised).
    """
    try:
        session = fastf1.get_session(year, circuit, 'Q')
        session.load()

        laps = session.laps
        qualifying_data = []

        for driver in laps["Driver"].unique():
            try:
                driver_laps = laps.pick_drivers(driver)
                # pick_fastest() may return None when the driver set no valid
                # timed lap; handle it explicitly instead of relying on the
                # resulting TypeError being swallowed below.
                fastest_lap = None if driver_laps.empty else driver_laps.pick_fastest()
                if fastest_lap is None:
                    qualifying_time = speed_trap = None
                else:
                    has_time = 'LapTime' in fastest_lap and not pd.isna(fastest_lap['LapTime'])
                    qualifying_time = fastest_lap['LapTime'].total_seconds() if has_time else None
                    # NOTE(review): the value stored as "speed trap" comes from
                    # the SpeedI2 column - confirm this is the intended
                    # measurement rather than a dedicated speed-trap column.
                    speed_trap = fastest_lap['SpeedI2'] if 'SpeedI2' in fastest_lap else None
            except Exception as e:
                # Best effort: keep the driver in the table with no lap data.
                print(f"Error processing driver {driver} in {year}: {e}")
                qualifying_time = speed_trap = None

            qualifying_data.append([driver, qualifying_time, speed_trap])

        qualifying_df = pd.DataFrame(
            qualifying_data,
            columns=["Abbreviation", "QualifyingTime", "SpeedTrapQualy"]
        )

        # Classified qualifying position, renamed so it does not clash with the
        # race "Position" column when both tables are joined later.
        results = (
            session.results[["Abbreviation", "Position"]]
            .copy()
            .rename(columns={"Position": "QualifyingPosition"})
        )

        # Attach the per-lap performance figures and tag the season.
        qualifying_results = results.merge(qualifying_df, on="Abbreviation")
        qualifying_results["Year"] = year

        return qualifying_results

    except Exception as e:
        print(f"Error getting qualifying data for {year} {circuit}: {e}")
        return pd.DataFrame()

# Fetch race results for each season, then qualifying results for each season.
# The per-season names are kept so a single year can still be inspected.
race_2022, race_2023, race_2024 = [
    get_race_data(season, circuit_name) for season in (2022, 2023, 2024)
]
quali_2022, quali_2023, quali_2024 = [
    get_qualifying_data(season, circuit_name) for season in (2022, 2023, 2024)
]

# Stack the seasons into one race table and one qualifying table.
all_races = pd.concat([race_2022, race_2023, race_2024], ignore_index=True)
all_qualis = pd.concat([quali_2022, quali_2023, quali_2024], ignore_index=True)

# Left join: every race row is kept, qualifying columns are filled where a
# matching (driver, season) qualifying row exists.
merged_data = all_races.merge(all_qualis, how="left", on=["Abbreviation", "Year"])

# Attach driver experience (default inner join on the driver code), then keep
# only the columns used downstream, in presentation order.
final_data = merged_data.merge(experience, on="Abbreviation")[[
    'Year', 'FullName', 'Abbreviation', 'NumberOfRaces', 'TeamName',
    'QualifyingPosition', 'GridPosition', 'Position',
    'QualifyingTime', 'SpeedTrapQualy', 'FastestLap',
    'Temperature', 'Humidity', 'Rain', 'Status',
]]

final_data

core           INFO 	Loading data for Abu Dhabi Grand Prix - Race [v3.4.4]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '16', '11', '55', '63', '4', '31', '18', '3', '5', '22', '24', '23', '10', '77', '47', '20', '44', '6', '14']
A value is trying to be set on a copy of a slice from a DataF

req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '11', '16', '55', '44', '63', '4', '31', '5', '3', '14', '22', '47', '18', '24', '20', '10', '77', '23', '6']
core           INFO 	Loading data for Abu Dhabi Grand Prix - Qualifying [v3.4.4]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data 

Unnamed: 0,Year,FullName,Abbreviation,NumberOfRaces,TeamName,QualifyingPosition,GridPosition,Position,QualifyingTime,SpeedTrapQualy,FastestLap,Temperature,Humidity,Rain,Status
0,2022,Max Verstappen,VER,211,Red Bull Racing,1.0,1.0,1.0,83.824,323.0,89.392,28.575484,61.683871,0.0,Finished
1,2023,Max Verstappen,VER,211,Red Bull Racing,1.0,1.0,1.0,83.445,324.0,86.993,26.962821,51.0,0.0,Finished
2,2024,Max Verstappen,VER,211,Red Bull Racing,5.0,4.0,6.0,82.945,325.0,87.765,26.768243,51.445946,0.0,Finished
3,2022,Charles Leclerc,LEC,149,Ferrari,3.0,3.0,2.0,84.092,318.0,89.719,28.575484,61.683871,0.0,Finished
4,2023,Charles Leclerc,LEC,149,Ferrari,2.0,2.0,2.0,83.584,322.0,88.199,26.962821,51.0,0.0,Finished
5,2024,Charles Leclerc,LEC,149,Ferrari,14.0,19.0,3.0,83.302,322.0,88.018,26.768243,51.445946,0.0,Finished
6,2022,Sergio Perez,PER,281,Red Bull Racing,2.0,2.0,3.0,84.052,325.0,88.972,28.575484,61.683871,0.0,Finished
7,2023,Sergio Perez,PER,281,Red Bull Racing,9.0,9.0,4.0,84.116,324.0,87.493,26.962821,51.0,0.0,Finished
8,2024,Sergio Perez,PER,281,Red Bull Racing,10.0,10.0,20.0,83.264,325.0,,26.768243,51.445946,0.0,Collision
9,2022,Carlos Sainz,SAI,208,Ferrari,4.0,4.0,4.0,84.242,321.0,88.879,28.575484,61.683871,0.0,Finished


## Encoding string-valued features (driver abbreviation and team name)

In [2]:
from sklearn.preprocessing import LabelEncoder

# One encoder per categorical column. Both stay in scope so the integer codes
# can be turned back into labels later, e.g.
# label_encoder_abbr.inverse_transform([0, 1, 2]).
label_encoder_abbr = LabelEncoder()
label_encoder_team = LabelEncoder()

# Build the encoded table as a new frame; `final_data` keeps its string labels.
final_sessions_encoded = final_data.assign(
    Abbreviation=label_encoder_abbr.fit_transform(final_data["Abbreviation"]),
    TeamName=label_encoder_team.fit_transform(final_data["TeamName"]),
)

# Label -> code lookup for drivers
driver_mapping = {
    label: code
    for label, code in zip(
        label_encoder_abbr.classes_,
        label_encoder_abbr.transform(label_encoder_abbr.classes_),
    )
}
print("Driver Encoding Mapping:", driver_mapping)

# Label -> code lookup for teams
team_mapping = {
    label: code
    for label, code in zip(
        label_encoder_team.classes_,
        label_encoder_team.transform(label_encoder_team.classes_),
    )
}
print("Team Encoding Mapping:", team_mapping)

final_sessions_encoded


Driver Encoding Mapping: {'ALB': 0, 'ALO': 1, 'BOT': 2, 'COL': 3, 'DOO': 4, 'GAS': 5, 'HAM': 6, 'HUL': 7, 'LAT': 8, 'LAW': 9, 'LEC': 10, 'MAG': 11, 'NOR': 12, 'OCO': 13, 'PER': 14, 'PIA': 15, 'RIC': 16, 'RUS': 17, 'SAI': 18, 'SAR': 19, 'STR': 20, 'TSU': 21, 'VER': 22, 'VET': 23, 'ZHO': 24}
Team Encoding Mapping: {'Alfa Romeo': 0, 'AlphaTauri': 1, 'Alpine': 2, 'Aston Martin': 3, 'Ferrari': 4, 'Haas F1 Team': 5, 'Kick Sauber': 6, 'McLaren': 7, 'Mercedes': 8, 'RB': 9, 'Red Bull Racing': 10, 'Williams': 11}


Unnamed: 0,Year,FullName,Abbreviation,NumberOfRaces,TeamName,QualifyingPosition,GridPosition,Position,QualifyingTime,SpeedTrapQualy,FastestLap,Temperature,Humidity,Rain,Status
0,2022,Max Verstappen,22,211,10,1.0,1.0,1.0,83.824,323.0,89.392,28.575484,61.683871,0.0,Finished
1,2023,Max Verstappen,22,211,10,1.0,1.0,1.0,83.445,324.0,86.993,26.962821,51.0,0.0,Finished
2,2024,Max Verstappen,22,211,10,5.0,4.0,6.0,82.945,325.0,87.765,26.768243,51.445946,0.0,Finished
3,2022,Charles Leclerc,10,149,4,3.0,3.0,2.0,84.092,318.0,89.719,28.575484,61.683871,0.0,Finished
4,2023,Charles Leclerc,10,149,4,2.0,2.0,2.0,83.584,322.0,88.199,26.962821,51.0,0.0,Finished
5,2024,Charles Leclerc,10,149,4,14.0,19.0,3.0,83.302,322.0,88.018,26.768243,51.445946,0.0,Finished
6,2022,Sergio Perez,14,281,10,2.0,2.0,3.0,84.052,325.0,88.972,28.575484,61.683871,0.0,Finished
7,2023,Sergio Perez,14,281,10,9.0,9.0,4.0,84.116,324.0,87.493,26.962821,51.0,0.0,Finished
8,2024,Sergio Perez,14,281,10,10.0,10.0,20.0,83.264,325.0,,26.768243,51.445946,0.0,Collision
9,2022,Carlos Sainz,18,208,4,4.0,4.0,4.0,84.242,321.0,88.879,28.575484,61.683871,0.0,Finished


## Scale values for temperature, humidity and fastest lap

In [3]:
# Check the MAE first; if it looks poor, try scaling these features.