In [1]:
import pandas as pd

In [2]:
FILE_2025 = 'features_laps_2025.csv'
FILE_2024 = 'features_laps_2024.csv'
FILE_2023 = 'features_laps_2023.csv'
FILE_2022 = 'features_laps_2022.csv'


features_25 = pd.read_csv(FILE_2025, delimiter=',')
features_24 = pd.read_csv(FILE_2024, delimiter=',')
features_23 = pd.read_csv(FILE_2023, delimiter=',')
features_22 = pd.read_csv(FILE_2022, delimiter=',')

features = pd.concat([features_25, features_24, features_23, features_22], ignore_index=True)

remove_2025 = ['Austin', 'Mexico City', 'São Paulo', 'Las Vegas', 'Lusail', 'Yas Island']
features = features[~((features['Year'] == 2025) & (features['Location'].isin(remove_2025)))]

features.to_csv('features_laps_p3.csv', index=False)

In [3]:
FEATURES_FILE = "features_laps_p3.csv"
features = pd.read_csv(FEATURES_FILE, delimiter=',')

In [4]:
features['Year'].value_counts()

Year
2023    2309
2024    2285
2022    2151
2025    1679
Name: count, dtype: int64

In [5]:
features['Driver'].value_counts()

Driver
LEC    536
HAM    528
RUS    525
VER    509
SAI    500
NOR    487
ALO    446
GAS    402
OCO    397
TSU    386
ALB    379
PIA    377
PER    359
STR    358
HUL    297
BOT    288
MAG    272
ZHO    265
RIC    207
LAW    116
SAR    111
MSC     91
ANT     90
HAD     89
BEA     78
BOR     73
VET     71
COL     61
LAT     60
DEV     42
DOO     24
Name: count, dtype: int64

In [6]:
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8424 entries, 0 to 8423
Data columns (total 33 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Driver           8424 non-null   object 
 1   Team             8424 non-null   object 
 2   LapTime          8424 non-null   float64
 3   LapNumber        8424 non-null   float64
 4   Stint            8424 non-null   float64
 5   TyreLife         8418 non-null   float64
 6   FreshTyre        8424 non-null   bool   
 7   Compound         8414 non-null   object 
 8   TrackTemp        8424 non-null   float64
 9   AirTemp          8424 non-null   float64
 10  Sector1Time      8423 non-null   float64
 11  Sector2Time      8424 non-null   float64
 12  Sector3Time      8424 non-null   float64
 13  Location         8424 non-null   object 
 14  Year             8424 non-null   int64  
 15  Country          8424 non-null   object 
 16  EventName        8424 non-null   object 
 17  EventDate     

## Encoding

In [7]:
group_cols = ['Year', 'Location', 'Driver']

features_encoded = (features.sort_values('LapTime').groupby(group_cols, as_index=False).first())
features_location_split = features_encoded.copy()

In [8]:
features_encoded['Driver'].value_counts()

Driver
ALO    88
RUS    88
VER    88
NOR    87
SAI    87
LEC    87
HAM    86
ALB    85
STR    84
TSU    84
GAS    83
OCO    83
PER    67
BOT    67
ZHO    66
PIA    66
MAG    65
HUL    64
RIC    47
SAR    34
LAW    29
MSC    21
LAT    20
COL    20
VET    20
BEA    19
ANT    18
HAD    18
BOR    18
DEV    10
DOO     7
Name: count, dtype: int64

In [9]:
features_encoded = pd.get_dummies(features_encoded, columns=['Driver','Team','Location'], drop_first=False)

In [10]:
def map_compound(row):
    if row['Compound'] == 'SOFT':
        return row['Soft']
    elif row['Compound'] == 'MEDIUM':
        return row['Medium']
    elif row['Compound'] == 'HARD':
        return row['Hard']
    else:
        return None
    
features_encoded['Compound'] = features_encoded.apply(map_compound, axis=1)

compound_map = {'C6' : 6, 'C5' : 5, 'C4' : 4, 'C3' : 3, 'C2' : 2, 'C1' : 1} # wet = 0
features_encoded['Compound'] = features_encoded['Compound'].map(compound_map).fillna(0)

In [11]:
features_encoded = features_encoded.drop(columns=['Country','EventDate','EventName','Soft','Medium','Hard'])

In [12]:
# fixing Nan values for used tyres
features_encoded.loc[features_encoded['TyreLife'].isna() & (features_encoded['FreshTyre'] == True), 'TyreLife'] = 2.0
features_encoded.loc[features_encoded['TyreLife'].isna() & (features_encoded['FreshTyre'] == False), 'TyreLife'] = 3.0

In [13]:
features_encoded = features_encoded.drop(columns=['RaceDistance', 'NumLaps', 'LapNumber',
                                                  'TyreLife', 'FreshTyre', 'Stint',
                                                  'Sector1Time', 'Sector2Time', 'Sector3Time'])

In [14]:
features_encoded.to_csv('features_laps_encoded_p3.csv', index=False)

In [15]:
FEATURES_ENCODED_FILE = "features_laps_encoded_p3.csv"
features_encoded = pd.read_csv(FEATURES_ENCODED_FILE, delimiter=',')

In [16]:
features_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1706 entries, 0 to 1705
Data columns (total 83 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Year                        1706 non-null   int64  
 1   LapTime                     1706 non-null   float64
 2   Compound                    1706 non-null   float64
 3   TrackTemp                   1706 non-null   float64
 4   AirTemp                     1706 non-null   float64
 5   TrackLenght                 1706 non-null   float64
 6   NumTurns                    1706 non-null   int64  
 7   Taction                     1706 non-null   int64  
 8   AsphaltGrip                 1706 non-null   int64  
 9   AsphaltAbrasion             1706 non-null   int64  
 10  TrackEvolution              1706 non-null   int64  
 11  TyreStress                  1706 non-null   int64  
 12  Braking                     1706 non-null   int64  
 13  LateralLoad                 1706 

## Training the model

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

X = features_encoded.drop(columns=['LapTime'])
y = features_encoded['LapTime']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"MAE: {mae:.3f} seconds")
print(f"RMSE: {rmse:.3f} seconds")

MAE: 0.431 seconds
RMSE: 1.012 seconds


In [23]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

gbr = GradientBoostingRegressor(random_state=42)

param_grid = {
    'n_estimators': [200, 400, 800],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 3],
    'max_features': [None, 0.8, 'sqrt']
}

grid = GridSearchCV(
    gbr, 
    param_grid, 
    scoring='neg_mean_absolute_error', 
    cv=3, 
    n_jobs=-1, 
    verbose=2
    )

grid.fit(X_train, y_train)

print("Best params:", grid.best_params_)
print("Best MAE:", -grid.best_score_)

Fitting 3 folds for each of 324 candidates, totalling 972 fits
Best params: {'learning_rate': 0.05, 'max_depth': 7, 'max_features': 0.8, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 800}
Best MAE: 0.40442137805641815


In [22]:
from sklearn.ensemble import GradientBoostingRegressor
 
best_gbr = GradientBoostingRegressor(
    n_estimators=800, 
    learning_rate=0.05, 
    max_depth=7,
    max_features=0.8,
    min_samples_leaf=3,
    min_samples_split=2,
    random_state=42
    )

best_gbr.fit(X_train, y_train)
preds_gbr = best_gbr.predict(X_test)

mae = mean_absolute_error(y_test, preds_gbr)
rmse = np.sqrt(mean_squared_error(y_test, preds_gbr))
print(f"MAE: {mae:.3f} seconds")
print(f"RMSE: {rmse:.3f} seconds")

MAE: 0.373 seconds
RMSE: 1.032 seconds


In [30]:
circuits = pd.read_csv('f1_unique_circuits_complete.csv', delimiter=',')

first_few_events_2025 = circuits['Location'][0:10]

In [31]:
first_few_events_2025

0    Melbourne
1     Shanghai
2       Suzuka
3       Sakhir
4       Jeddah
5        Miami
6        Imola
7       Monaco
8    Barcelona
9     Montréal
Name: Location, dtype: object

In [32]:
train_mask = (
    (features_location_split['Year'] < 2025) |
    ((features_location_split['Year'] == 2025) & features_location_split['Location'].isin(first_few_events_2025))
)
test_mask = (
    (features_location_split['Year'] == 2025) &
    (~features_location_split['Location'].isin(first_few_events_2025))
)

train_idx = features_location_split[train_mask].index
test_idx = features_location_split[test_mask].index

X_train = features_encoded.loc[train_idx].drop(columns=['LapTime'])
y_train = features_encoded.loc[train_idx, 'LapTime']

X_test = features_encoded.loc[test_idx].drop(columns=['LapTime'])
y_test = features_encoded.loc[test_idx, 'LapTime']

In [33]:
from sklearn.ensemble import GradientBoostingRegressor
 
best_gbr = GradientBoostingRegressor(
    n_estimators=800, 
    learning_rate=0.05, 
    max_depth=7,
    max_features=0.8,
    min_samples_leaf=3,
    min_samples_split=2,
    random_state=42
    )

best_gbr.fit(X_train, y_train)
preds_gbr = best_gbr.predict(X_test)

mae = mean_absolute_error(y_test, preds_gbr)
rmse = np.sqrt(mean_squared_error(y_test, preds_gbr))
print(f"MAE: {mae:.3f} seconds")
print(f"RMSE: {rmse:.3f} seconds")

MAE: 2.163 seconds
RMSE: 2.864 seconds


In [20]:
X_train

Unnamed: 0,Year,Compound,TrackTemp,AirTemp,TrackLenght,NumTurns,Taction,AsphaltGrip,AsphaltAbrasion,TrackEvolution,...,Location_Monza,Location_Sakhir,Location_Shanghai,Location_Silverstone,Location_Spa-Francorchamps,Location_Spielberg,Location_Suzuka,Location_São Paulo,Location_Yas Island,Location_Zandvoort
0,2022,4.0,35.4,29.9,5.513,20,4,3,3,4,...,False,False,False,False,False,False,False,False,False,False
1,2022,4.0,33.4,29.3,5.513,20,4,3,3,4,...,False,False,False,False,False,False,False,False,False,False
2,2022,4.0,33.4,29.3,5.513,20,4,3,3,4,...,False,False,False,False,False,False,False,False,False,False
3,2022,4.0,36.7,30.6,5.513,20,4,3,3,4,...,False,False,False,False,False,False,False,False,False,False
4,2022,4.0,34.0,29.4,5.513,20,4,3,3,4,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1682,2025,3.0,29.5,14.5,5.807,18,3,4,4,3,...,False,False,False,False,False,False,True,False,False,False
1683,2025,3.0,30.6,14.6,5.807,18,3,4,4,3,...,False,False,False,False,False,False,True,False,False,False
1684,2025,3.0,33.7,14.7,5.807,18,3,4,4,3,...,False,False,False,False,False,False,True,False,False,False
1685,2025,3.0,33.7,14.9,5.807,18,3,4,4,3,...,False,False,False,False,False,False,True,False,False,False


In [29]:
y_pred

array([ 84.55289,  82.98865, 101.79341, 102.09829, 101.1839 ,  91.71674,
        72.91264,  77.74745,  90.52327, 101.99858,  71.7367 , 107.6929 ,
        70.34227,  88.07923,  76.25203,  64.88269, 102.16126, 105.17609,
        77.19564, 102.82082,  86.51141,  88.75086,  72.3912 ,  79.23575,
        65.37907,  91.11892,  92.08934,  78.79706, 106.31169,  94.07677,
        82.99159,  83.2516 ,  90.50012,  66.6553 , 102.80741,  71.77241,
        65.35041,  90.30679,  92.38309,  77.64554,  71.04551,  75.85711,
        65.3742 ,  88.05024,  80.67657,  83.08602,  89.926  ,  90.04894,
        66.10457,  89.63409,  83.34135, 102.06653,  72.80861,  79.95425,
        89.65083,  87.66606,  77.97916,  88.35653,  82.56416,  79.65562,
        71.89484,  91.14497,  71.60348,  91.35939,  93.97254,  86.81627,
        71.90782,  72.28482,  88.11514,  83.70541,  78.65135,  91.56559,
        65.55883,  77.79079,  80.86677,  77.78155,  89.51127,  71.63625,
        88.80124,  77.75422,  91.54082,  65.15942, 

In [28]:
pred_df = X_test.copy()  # copy the features used for prediction
pred_df['ActualLapTime'] = y_test.values
pred_df['PredictedLapTime'] = y_pred
pred_df['Driver'] = features.loc[X_test.index, 'Driver']
pred_df['Team'] = features.loc[X_test.index, 'Team']
pred_df['Location'] = features.loc[X_test.index, 'Location']


cols_to_keep = ['Driver', 'Team', 'LapNumber', 'Stint', 'Compound', 
                'PredictedLapTime', 'ActualLapTime', 'Location', 'Year']
pred_df = pred_df[cols_to_keep]

ValueError: Length of values (342) does not match length of index (258)