In [175]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.ensemble import RandomForestRegressor
import os

In [176]:
# Resolve the data directory relative to the notebook's working directory:
# the project root is taken to be the parent of the current directory.
# `cwd` is used instead of `dir` so the built-in dir() is not shadowed for the
# rest of the kernel session.
cwd = os.getcwd()
project_root = os.path.abspath(os.path.join(cwd, '..'))
data_root = os.path.join(project_root, 'data')
df = pd.read_csv(filepath_or_buffer=os.path.join(data_root, 'f1_data_cleaned.csv'))
# Store the rain flag as an integer column so it can be used as a numeric feature.
df['Rainfall'] = df['Rainfall'].astype(int)

For evaluation we split the dataset into a training set and a test set: the first 19 rounds are used for training and the remaining 5 for testing. Eventually we will switch to cross-validation.

In [177]:
# Chronological hold-out: rounds below 20 train the model, rounds 20 and above test it.
round_number = df['RoundNumber']
df_train = df.loc[round_number < 20].copy()
df_test = df.loc[round_number >= 20].copy()

In [178]:
df_train

Unnamed: 0,DriverNumber,LapTime,LapNumber,SpeedST,Compound,TyreLife,Team,TrackTemp,Rainfall,Position,RoundNumber,FuelLevel,QualiBest,Target
0,1,96.830,8.0,245.0,INTERMEDIATE,8.0,Red Bull Racing,19.1,0,2.0,1,49.0,75.096,1.289416
1,4,96.278,8.0,276.0,INTERMEDIATE,8.0,McLaren,19.1,0,1.0,1,49.0,75.096,1.282066
2,81,97.251,8.0,257.0,INTERMEDIATE,8.0,McLaren,19.1,0,3.0,1,49.0,75.096,1.295022
3,1,94.315,9.0,258.0,INTERMEDIATE,9.0,Red Bull Racing,19.1,0,2.0,1,48.0,75.096,1.255926
4,4,94.638,9.0,281.0,INTERMEDIATE,9.0,McLaren,19.1,0,1.0,1,48.0,75.096,1.260227
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17122,31,99.687,56.0,307.0,MEDIUM,32.0,Haas F1 Team,44.9,0,15.0,19,0.0,92.510,1.077581
17123,44,103.194,56.0,306.0,SOFT,27.0,Ferrari,44.9,0,4.0,19,0.0,92.510,1.115490
17124,63,99.181,56.0,308.0,SOFT,26.0,Mercedes,44.9,0,6.0,19,0.0,92.510,1.072111
17125,81,99.042,56.0,308.0,SOFT,29.0,McLaren,44.9,0,5.0,19,0.0,92.510,1.070609


First, I want to address how drivers are represented. We could leave the raw driver numbers as they are, but the model would then treat them as an ordered numeric quantity, which would lead it to inaccurate assumptions. One-hot encoding doesn't seem like the best option either (18 new columns!), so we introduce a new column, 'DriverPower', computed as the median of the target for each driver (using the training set only).

In [179]:
# Target-encode the drivers: median Target per driver, computed on the training
# rounds only so nothing from the test rounds feeds into the feature.
median_targets = df_train.groupby('DriverNumber')['Target'].median()
# A driver that appears only in the test rounds has no entry in `median_targets`
# and would be mapped to NaN; fall back to the overall training median instead.
driver_power_fallback = df_train['Target'].median()
df_train.loc[:, 'DriverPower'] = df_train['DriverNumber'].map(median_targets)
df_test.loc[:, 'DriverPower'] = (
    df_test['DriverNumber'].map(median_targets).fillna(driver_power_fallback)
)

In [180]:
df_train

Unnamed: 0,DriverNumber,LapTime,LapNumber,SpeedST,Compound,TyreLife,Team,TrackTemp,Rainfall,Position,RoundNumber,FuelLevel,QualiBest,Target,DriverPower
0,1,96.830,8.0,245.0,INTERMEDIATE,8.0,Red Bull Racing,19.1,0,2.0,1,49.0,75.096,1.289416,1.076045
1,4,96.278,8.0,276.0,INTERMEDIATE,8.0,McLaren,19.1,0,1.0,1,49.0,75.096,1.282066,1.076761
2,81,97.251,8.0,257.0,INTERMEDIATE,8.0,McLaren,19.1,0,3.0,1,49.0,75.096,1.295022,1.077154
3,1,94.315,9.0,258.0,INTERMEDIATE,9.0,Red Bull Racing,19.1,0,2.0,1,48.0,75.096,1.255926,1.076045
4,4,94.638,9.0,281.0,INTERMEDIATE,9.0,McLaren,19.1,0,1.0,1,48.0,75.096,1.260227,1.076761
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17122,31,99.687,56.0,307.0,MEDIUM,32.0,Haas F1 Team,44.9,0,15.0,19,0.0,92.510,1.077581,1.096348
17123,44,103.194,56.0,306.0,SOFT,27.0,Ferrari,44.9,0,4.0,19,0.0,92.510,1.115490,1.081338
17124,63,99.181,56.0,308.0,SOFT,26.0,Mercedes,44.9,0,6.0,19,0.0,92.510,1.072111,1.082307
17125,81,99.042,56.0,308.0,SOFT,29.0,McLaren,44.9,0,5.0,19,0.0,92.510,1.070609,1.077154


Let's do the same thing for teams.

In [181]:
# Target-encode the teams the same way as the drivers: median Target per team,
# computed on the training rounds only.
teams_median = df_train.groupby('Team')['Target'].median()
# A team name that never occurs in the training rounds would be mapped to NaN;
# fall back to the overall training median instead.
team_pace_fallback = df_train['Target'].median()
df_train.loc[:, 'TeamPace'] = df_train['Team'].map(teams_median)
df_test.loc[:, 'TeamPace'] = df_test['Team'].map(teams_median).fillna(team_pace_fallback)
# The raw team name is no longer needed once it has been encoded.
df_train = df_train.drop(columns='Team')
df_test = df_test.drop(columns='Team')

In [182]:
# Compound is still a text column, so it is left out of the correlation matrix.
cat = ['Compound']
numeric_corr = df_train.drop(columns=cat).corr()
numeric_corr['Target'].sort_values(ascending=False)

Target          1.000000
Rainfall        0.368297
FuelLevel       0.319082
DriverPower     0.166411
TeamPace        0.147213
Position        0.133670
LapTime         0.012853
DriverNumber    0.004130
TyreLife       -0.037994
TrackTemp      -0.118999
RoundNumber    -0.155314
LapNumber      -0.246556
QualiBest      -0.269284
SpeedST        -0.356408
Name: Target, dtype: float64

Now, let's see if our score has improved

In [183]:
df_train.columns

Index(['DriverNumber', 'LapTime', 'LapNumber', 'SpeedST', 'Compound',
       'TyreLife', 'TrackTemp', 'Rainfall', 'Position', 'RoundNumber',
       'FuelLevel', 'QualiBest', 'Target', 'DriverPower', 'TeamPace'],
      dtype='object')

In [184]:
# One-hot encode the tyre compound separately for the training and the test frame.
df_train_eval, df_test_eval = (
    pd.get_dummies(frame, columns=['Compound']) for frame in (df_train, df_test)
)

In [185]:
# Identifier columns, the raw lap time and the target itself are not used as features.
col_to_drop = ['LapTime', 'DriverNumber', 'RoundNumber', 'LapNumber', 'Target', 'QualiBest']
y_train, y_test = df_train_eval['Target'], df_test_eval['Target']
X_train = df_train_eval.drop(columns=col_to_drop)
# Give the test matrix exactly the training columns; a dummy column that is
# missing from the test frame is filled with 0.
X_test = (
    df_test_eval
    .drop(columns=col_to_drop)
    .reindex(columns=X_train.columns, fill_value=0)
)

In [186]:
# Fit the forest on Target, then multiply the predictions by QualiBest so the
# error is measured against LapTime.
model = RandomForestRegressor(
    random_state=42,
    n_jobs=-1,
    n_estimators=100,
    max_depth=15,
    min_samples_leaf=10,
    max_features='sqrt',
).fit(X_train, y_train)
y_pred = model.predict(X_test) * df_test_eval['QualiBest']
mean_absolute_error(df_test_eval['LapTime'], y_pred)

1.0328180963740117

Next we must address track diversity. Different tracks affect the tyres differently and have different lengths. We've prepared a dataset describing all tracks in the 2025 calendar. For each track we have its length, the hardness (1-6) of each nominated compound, and the columns Traction, Abrasion, TrackEvolution, TyreStress, Lateral and Downforce. These values are important because they allow the model to understand differences in tyre degradation and speed between tracks. The data was taken from official Pirelli information.

In [187]:
# Per-round track metadata; the Location column is dropped immediately after loading.
tracks_path = os.path.join(data_root, 'track_metadata_2025.csv')
tracks = pd.read_csv(tracks_path).drop(columns='Location')

In [188]:
# Per-round rating of the SOFT / MEDIUM / HARD compounds.
tyre_columns = ['RoundNumber', 'C_SOFT', 'C_MEDIUM', 'C_HARD']
tyres = tracks[tyre_columns]
tyres

Unnamed: 0,RoundNumber,C_SOFT,C_MEDIUM,C_HARD
0,1,5,4,3
1,2,4,3,2
2,3,3,2,1
3,4,3,2,1
4,5,5,4,3
5,6,5,4,3
6,7,6,5,4
7,8,6,5,4
8,9,3,2,1
9,10,6,5,4


In [189]:
# 0/1 flag for laps driven on the INTERMEDIATE compound.
for frame in (df_train, df_test):
    frame['C_INTERMEDIATE'] = (frame['Compound'] == 'INTERMEDIATE').astype(int)

In [190]:
# Attach the per-round track metadata to every lap. `validate='many_to_one'`
# makes pandas raise a MergeError if `tracks` holds more than one row for a
# RoundNumber, which would otherwise silently duplicate laps.
df_train = pd.merge(df_train, tracks, how='left', on='RoundNumber', validate='many_to_one')
df_test = pd.merge(df_test, tracks, how='left', on='RoundNumber', validate='many_to_one')

In [191]:
df_train

Unnamed: 0,DriverNumber,LapTime,LapNumber,SpeedST,Compound,TyreLife,TrackTemp,Rainfall,Position,RoundNumber,...,Length,Traction,Abrasion,TrackEvolution,TyreStress,Lateral,Downforce,C_HARD,C_MEDIUM,C_SOFT
0,1,96.830,8.0,245.0,INTERMEDIATE,8.0,19.1,0,2.0,1,...,5278,2,2,3,3,3,3,3,4,5
1,4,96.278,8.0,276.0,INTERMEDIATE,8.0,19.1,0,1.0,1,...,5278,2,2,3,3,3,3,3,4,5
2,81,97.251,8.0,257.0,INTERMEDIATE,8.0,19.1,0,3.0,1,...,5278,2,2,3,3,3,3,3,4,5
3,1,94.315,9.0,258.0,INTERMEDIATE,9.0,19.1,0,2.0,1,...,5278,2,2,3,3,3,3,3,4,5
4,4,94.638,9.0,281.0,INTERMEDIATE,9.0,19.1,0,1.0,1,...,5278,2,2,3,3,3,3,3,4,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17122,31,99.687,56.0,307.0,MEDIUM,32.0,44.9,0,15.0,19,...,5513,4,3,4,4,4,4,1,3,4
17123,44,103.194,56.0,306.0,SOFT,27.0,44.9,0,4.0,19,...,5513,4,3,4,4,4,4,1,3,4
17124,63,99.181,56.0,308.0,SOFT,26.0,44.9,0,6.0,19,...,5513,4,3,4,4,4,4,1,3,4
17125,81,99.042,56.0,308.0,SOFT,29.0,44.9,0,5.0,19,...,5513,4,3,4,4,4,4,1,3,4


In [192]:
def map_compound(df_to_map, default=0):
    """Return the track-specific rating of the compound used on each row.

    For every row the value is taken from the column that matches the row's
    'Compound': 'C_SOFT' for SOFT, 'C_MEDIUM' for MEDIUM and 'C_HARD' for HARD.
    Any other compound (e.g. INTERMEDIATE) receives `default`.

    Parameters
    ----------
    df_to_map : pandas.DataFrame
        Frame with the columns 'Compound', 'C_SOFT', 'C_MEDIUM' and 'C_HARD'.
        It is only read, never modified.
    default : scalar, optional
        Rating assigned to rows whose compound is not SOFT, MEDIUM or HARD.
        Defaults to 0.

    Returns
    -------
    pandas.Series
        Series named 'CompoundRating' that shares the index of `df_to_map`.
    """
    compound = df_to_map['Compound']
    conditions = [
        compound == 'SOFT',
        compound == 'MEDIUM',
        compound == 'HARD',
    ]
    choices = [
        df_to_map['C_SOFT'],
        df_to_map['C_MEDIUM'],
        df_to_map['C_HARD'],
    ]

    # Build the result directly instead of copying the whole frame just to
    # hold one temporary column.
    ratings = np.select(conditions, choices, default=default)
    return pd.Series(ratings, index=df_to_map.index, name='CompoundRating')

# Add the rating of the fitted compound to both splits, then inspect the training frame.
for frame in (df_train, df_test):
    frame['CompoundRating'] = map_compound(frame)
df_train


Unnamed: 0,DriverNumber,LapTime,LapNumber,SpeedST,Compound,TyreLife,TrackTemp,Rainfall,Position,RoundNumber,...,Traction,Abrasion,TrackEvolution,TyreStress,Lateral,Downforce,C_HARD,C_MEDIUM,C_SOFT,CompoundRating
0,1,96.830,8.0,245.0,INTERMEDIATE,8.0,19.1,0,2.0,1,...,2,2,3,3,3,3,3,4,5,0
1,4,96.278,8.0,276.0,INTERMEDIATE,8.0,19.1,0,1.0,1,...,2,2,3,3,3,3,3,4,5,0
2,81,97.251,8.0,257.0,INTERMEDIATE,8.0,19.1,0,3.0,1,...,2,2,3,3,3,3,3,4,5,0
3,1,94.315,9.0,258.0,INTERMEDIATE,9.0,19.1,0,2.0,1,...,2,2,3,3,3,3,3,4,5,0
4,4,94.638,9.0,281.0,INTERMEDIATE,9.0,19.1,0,1.0,1,...,2,2,3,3,3,3,3,4,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17122,31,99.687,56.0,307.0,MEDIUM,32.0,44.9,0,15.0,19,...,4,3,4,4,4,4,1,3,4,3
17123,44,103.194,56.0,306.0,SOFT,27.0,44.9,0,4.0,19,...,4,3,4,4,4,4,1,3,4,4
17124,63,99.181,56.0,308.0,SOFT,26.0,44.9,0,6.0,19,...,4,3,4,4,4,4,1,3,4,4
17125,81,99.042,56.0,308.0,SOFT,29.0,44.9,0,5.0,19,...,4,3,4,4,4,4,1,3,4,4


In [193]:
# CompoundRating has been derived from these columns, so they are dropped from both splits.
compound_source_cols = ['C_HARD', 'C_MEDIUM', 'C_SOFT', 'Compound']
df_train = df_train.drop(columns=compound_source_cols)
df_test = df_test.drop(columns=compound_source_cols)

In [194]:
# Correlation of the newly added track features (and CompoundRating) with the target.
track_feature_cols = [
    'Length', 'Traction', 'Abrasion', 'TrackEvolution',
    'TyreStress', 'Lateral', 'Downforce', 'CompoundRating', 'Target',
]
df_train[track_feature_cols].corr()['Target']

Length           -0.153204
Traction         -0.209839
Abrasion          0.083242
TrackEvolution   -0.361372
TyreStress        0.098765
Lateral           0.214855
Downforce         0.251570
CompoundRating   -0.463420
Target            1.000000
Name: Target, dtype: float64

In [195]:
# Rank every remaining column by its correlation with the target.
target_corr = df_train.corr()['Target']
target_corr.sort_values(ascending=False)

Target            1.000000
C_INTERMEDIATE    0.769402
Rainfall          0.368297
FuelLevel         0.319082
Downforce         0.251570
Lateral           0.214855
DriverPower       0.166411
TeamPace          0.147213
Position          0.133670
TyreStress        0.098765
Abrasion          0.083242
LapTime           0.012853
DriverNumber      0.004130
TyreLife         -0.037994
TrackTemp        -0.118999
Length           -0.153204
RoundNumber      -0.155314
Traction         -0.209839
LapNumber        -0.246556
QualiBest        -0.269284
SpeedST          -0.356408
TrackEvolution   -0.361372
CompoundRating   -0.463420
Name: Target, dtype: float64

Again, let's see if those changes improved our score

In [196]:
# Evaluate on copies of the current training and test frames.
df_train_eval, df_test_eval = df_train.copy(), df_test.copy()

In [197]:
# Identifier columns, the raw lap time and the target itself are not used as features.
col_to_drop = ['LapTime', 'DriverNumber', 'RoundNumber', 'LapNumber', 'Target', 'QualiBest']
y_train, y_test = df_train_eval['Target'], df_test_eval['Target']
X_train = df_train_eval.drop(columns=col_to_drop)
# Give the test matrix exactly the training columns (missing ones are filled with 0).
X_test = (
    df_test_eval
    .drop(columns=col_to_drop)
    .reindex(columns=X_train.columns, fill_value=0)
)

In [198]:
# Same forest configuration as before, now trained on the track-aware features.
model = RandomForestRegressor(
    random_state=42,
    n_jobs=-1,
    n_estimators=100,
    max_depth=15,
    min_samples_leaf=10,
    max_features='sqrt',
).fit(X_train, y_train)
# Multiply by QualiBest so both metrics are computed against LapTime.
y_pred = model.predict(X_test) * df_test_eval['QualiBest']
lap_time_true = df_test_eval['LapTime']
mae = mean_absolute_error(lap_time_true, y_pred)
mape = mean_absolute_percentage_error(lap_time_true, y_pred)
print(f'MAE: {mae}')
print(f'MAPE: {mape}')

MAE: 0.8470736241428554
MAPE: 0.010513085062528585


The results are already promising, but let's see if we can improve them. The next step is to add two important columns, GapAhead and GapBehind; both can have a large impact on race pace.

In [199]:
# Load the uncleaned lap table.
full_laps_path = os.path.join(data_root, 'f1_data.csv')
df_full = pd.read_csv(full_laps_path)

# Keep the position columns aside for the next experiment.
position_columns = ['RoundNumber', 'LapNumber', 'DriverNumber', 'Position']
df_pos = df_full[position_columns]

In [200]:
df_full

Unnamed: 0,Time,Driver,DriverNumber,LapTime,LapNumber,Stint,PitOutTime,PitInTime,Sector1Time,Sector2Time,...,TrackStatus,Position,Deleted,DeletedReason,FastF1Generated,IsAccurate,TrackTemp,Rainfall,WindSpeed,RoundNumber
0,0 days 01:12:57.726000,NOR,4,0 days 00:01:57.099000,1.0,1.0,,,,0 days 00:00:20.913000,...,124,1.0,False,,False,False,19.2,False,3.9,1
1,0 days 01:12:57.726000,DOO,7,,1.0,1.0,,,,,...,124,,False,,True,False,19.2,False,3.9,1
2,0 days 01:12:57.726000,HAD,6,,1.0,1.0,,,,,...,124,,False,,True,False,19.2,False,3.9,1
3,0 days 01:12:57.726000,SAI,55,,1.0,1.0,,,,,...,124,,False,,True,False,19.2,False,3.9,1
4,0 days 01:13:00.002000,VER,1,0 days 00:01:59.392000,1.0,1.0,,,,0 days 00:00:20.705000,...,124,2.0,False,,False,False,19.2,False,3.9,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24221,0 days 02:25:49.246000,BOR,5,0 days 00:01:30.667000,58.0,2.0,,,0 days 00:00:17.901000,0 days 00:00:38.794000,...,1,12.0,False,,False,True,29.0,False,0.8,24
24222,0 days 02:25:50.361000,SAI,55,0 days 00:01:30.046000,58.0,2.0,,,0 days 00:00:17.814000,0 days 00:00:38.404000,...,1,13.0,False,,False,True,29.0,False,0.8,24
24223,0 days 02:25:51.997000,TSU,22,0 days 00:01:29.566000,58.0,2.0,,,0 days 00:00:17.690000,0 days 00:00:38.459000,...,1,14.0,False,,False,True,29.0,False,0.8,24
24224,0 days 02:25:52.602000,ANT,12,0 days 00:01:29.630000,58.0,2.0,,,0 days 00:00:17.848000,0 days 00:00:37.897000,...,1,15.0,False,,False,True,29.0,False,0.8,24


In [201]:
# Keep only the timestamp and the keys needed to join the gaps back onto the lap data.
df_full = df_full[['Time', 'DriverNumber', 'LapNumber', 'RoundNumber']].copy()
# `Time` is stored as text such as "0 days 01:12:57.726000"; convert it so it can be subtracted.
df_full['Time'] = pd.to_timedelta(df_full['Time'])
# After this sort the rows of every (round, lap) are in ascending Time order.
# NOTE(review): presumably Time is the session time at which the lap was completed,
# so this is the order in which cars finished the lap — confirm.
df_full = df_full.sort_values(by=['RoundNumber', 'LapNumber', 'Time'])

# GapAhead: seconds between this row's Time and the previous row's Time within the
# same (round, lap); the first row of each group gets NaN.
df_full['GapAhead'] = df_full.groupby(['RoundNumber', 'LapNumber'])['Time'].diff().dt.total_seconds()
# GapBehind: the GapAhead of the next row within the same (round, lap);
# the last row of each group gets NaN.
df_full['GapBehind'] = df_full.groupby(['RoundNumber', 'LapNumber'])['GapAhead'].shift(-1)

# Move both gaps one row down within each (round, driver), so every row carries the
# values from that driver's previous row (the previous lap, given the sort above).
# NOTE(review): presumably done so the features do not depend on the lap being predicted — confirm.
df_full['GapAhead'] = df_full.groupby(['RoundNumber', 'DriverNumber'])['GapAhead'].shift(1)
df_full['GapBehind'] = df_full.groupby(['RoundNumber', 'DriverNumber'])['GapBehind'].shift(1)

# Rows left without a value (no neighbouring row in the lap group, or no previous
# row for the driver) receive the placeholder 100.0.
df_full['GapAhead'] = df_full['GapAhead'].fillna(100.0)
df_full['GapBehind'] = df_full['GapBehind'].fillna(100.0)

In [202]:
df_full.describe()

Unnamed: 0,Time,DriverNumber,LapNumber,RoundNumber,GapAhead,GapBehind
count,24226,24226.0,24226.0,24226.0,24226.0,24226.0
mean,0 days 01:47:42.013919095,29.469537,30.850037,12.336292,11.277579,11.21039
std,0 days 00:30:29.788011497,24.894351,18.402908,6.691798,26.070976,25.954906
min,0 days 00:56:41.841000,1.0,1.0,1.0,0.0,0.0
25%,0 days 01:22:43.178750,10.0,15.0,7.0,0.977,0.977
50%,0 days 01:46:19.063500,22.0,30.0,12.0,2.201,2.203
75%,0 days 02:10:13.817500,44.0,45.0,18.0,5.70275,5.70025
max,0 days 03:41:30.238000,87.0,78.0,24.0,116.186,116.186


In [203]:
df_train.shape

(17127, 23)

In [204]:
df_full

Unnamed: 0,Time,DriverNumber,LapNumber,RoundNumber,GapAhead,GapBehind
0,0 days 01:12:57.726000,4,1.0,1,100.000,100.000
1,0 days 01:12:57.726000,7,1.0,1,100.000,100.000
2,0 days 01:12:57.726000,6,1.0,1,100.000,100.000
3,0 days 01:12:57.726000,55,1.0,1,100.000,100.000
4,0 days 01:13:00.002000,1,1.0,1,100.000,100.000
...,...,...,...,...,...,...
24221,0 days 02:25:49.246000,5,58.0,24,4.882,0.531
24222,0 days 02:25:50.361000,55,58.0,24,1.205,2.116
24223,0 days 02:25:51.997000,22,58.0,24,2.116,0.541
24224,0 days 02:25:52.602000,12,58.0,24,0.541,1.302


In [205]:
# `Time` was only needed to derive the gaps, so it is dropped before joining.
df_full = df_full.drop(columns=['Time'])
gap_keys = ['RoundNumber', 'LapNumber', 'DriverNumber']
# `validate='many_to_one'` makes pandas raise a MergeError if `df_full` holds more
# than one row per (round, lap, driver); such duplicates would otherwise silently
# multiply rows in the modelling frames.
df_train = df_train.merge(df_full, how='inner', on=gap_keys, validate='many_to_one')
df_test = df_test.merge(df_full, how='inner', on=gap_keys, validate='many_to_one')
df_train

Unnamed: 0,DriverNumber,LapTime,LapNumber,SpeedST,TyreLife,TrackTemp,Rainfall,Position,RoundNumber,FuelLevel,...,Length,Traction,Abrasion,TrackEvolution,TyreStress,Lateral,Downforce,CompoundRating,GapAhead,GapBehind
0,1,96.830,8.0,245.0,8.0,19.1,0,2.0,1,49.0,...,5278,2,2,3,3,3,3,0,0.402,0.635
1,4,96.278,8.0,276.0,8.0,19.1,0,1.0,1,49.0,...,5278,2,2,3,3,3,3,0,100.000,0.402
2,81,97.251,8.0,257.0,8.0,19.1,0,3.0,1,49.0,...,5278,2,2,3,3,3,3,0,0.635,0.616
3,1,94.315,9.0,258.0,9.0,19.1,0,2.0,1,48.0,...,5278,2,2,3,3,3,3,0,0.954,1.056
4,4,94.638,9.0,281.0,9.0,19.1,0,1.0,1,48.0,...,5278,2,2,3,3,3,3,0,100.000,0.954
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17122,31,99.687,56.0,307.0,32.0,44.9,0,15.0,19,0.0,...,5513,4,3,4,4,4,4,3,1.833,10.322
17123,44,103.194,56.0,306.0,27.0,44.9,0,4.0,19,0.0,...,5513,4,3,4,4,4,4,4,9.969,5.294
17124,63,99.181,56.0,308.0,26.0,44.9,0,6.0,19,0.0,...,5513,4,3,4,4,4,4,4,3.639,19.566
17125,81,99.042,56.0,308.0,29.0,44.9,0,5.0,19,0.0,...,5513,4,3,4,4,4,4,4,5.294,3.639


Let's see if these features helped in reducing the error.

In [206]:
# Identifier columns, the raw lap time and the target itself are not used as features.
col_to_drop = ['LapTime', 'DriverNumber', 'RoundNumber', 'LapNumber', 'Target', 'QualiBest']
y_train, y_test = df_train['Target'], df_test['Target']
X_train = df_train.drop(columns=col_to_drop)
# Give the test matrix exactly the training columns (missing ones are filled with 0).
X_test = (
    df_test
    .drop(columns=col_to_drop)
    .reindex(columns=X_train.columns, fill_value=0)
)

In [207]:
df_train.shape

(17127, 25)

In [208]:
# Count the rows whose QualiBest is unchanged between the pre-merge copy and the
# merged test frame; compare the count with the test frame's row total.
quali_unchanged = df_test_eval['QualiBest'] == df_test['QualiBest']
quali_unchanged.sum()

np.int64(4140)

In [209]:
df_test.shape

(4140, 25)

In [210]:
# Same forest configuration again, now including GapAhead and GapBehind.
model = RandomForestRegressor(
    random_state=42,
    n_jobs=-1,
    n_estimators=100,
    max_depth=15,
    min_samples_leaf=10,
    max_features='sqrt',
).fit(X_train, y_train)
# Multiply by QualiBest so both metrics are computed against LapTime.
y_pred = model.predict(X_test) * df_test['QualiBest']
lap_time_true = df_test['LapTime']
mae = mean_absolute_error(lap_time_true, y_pred)
mape = mean_absolute_percentage_error(lap_time_true, y_pred)
print(f'MAE: {mae}')
print(f'MAPE: {mape}')

MAE: 0.7726168668886234
MAPE: 0.009581783376684397


In [211]:
# Write the final feature frames to the data directory, without the index column.
train_path = os.path.join(data_root, 'f1_train.csv')
test_path = os.path.join(data_root, 'f1_test.csv')
df_train.to_csv(train_path, index=False)
df_test.to_csv(test_path, index=False)