This notebook extends each driver and race subset to match the length of the longest subset in the respective race. This ensures smooth iteration through the data in the D3 application and prevents crashes when a driver retires early from the race.

In [1]:
import pandas as pd
import polars as pl
import fastf1 as ff1
from tqdm import tqdm

In [2]:
# Load the position data that the rest of the notebook pads per driver and race.
race_data = pd.read_parquet(path="../static/data/tester.parquet")

In [3]:
# Count the non-null values of every column per (round, year, driver) and order
# the result by season and round, with the driver holding the most position
# samples listed first within each race.
grouped_race_data = (
    race_data
    .groupby(["round_number", "year", "driver_number"])
    .count()
    .sort_values(
        by=["year", "round_number", "pos_index"],
        ascending=[True, True, False],
    )
)

In [4]:
# Inspect the per-driver non-null counts for every race.
grouped_race_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,LapNumber,x,y,pos_index
round_number,year,driver_number,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,2020,6,895,25698,25698,25698
1,2020,77,916,25691,25691,25691
1,2020,5,898,25032,25032,25032
1,2020,99,901,25016,25016,25016
1,2020,31,901,24999,24999,24999
...,...,...,...,...,...,...
8,2024,16,908,33257,33257,33257
8,2024,11,10,9445,9445,9445
8,2024,20,10,9445,9445,9445
8,2024,27,10,9445,9445,9445


In [5]:
# For every race, take the largest per-driver count of "x" values; this is the
# length all drivers of that race are padded to later on.
max_number_data_points_per_race = (
    grouped_race_data
    .groupby(["year", "round_number"])["x"]
    .max()
    .reset_index()
)

In [6]:
# Give the aggregated "x" count a descriptive name.
max_number_data_points_per_race = max_number_data_points_per_race.rename(
    {"x": "max_datapoints"}, axis="columns"
)

In [7]:
# Inspect the target length (max_datapoints) computed for every race.
max_number_data_points_per_race

Unnamed: 0,year,round_number,max_datapoints
0,2020,1,25698
1,2020,2,22949
2,2020,3,26676
3,2020,4,24330
4,2020,5,22140
...,...,...,...
85,2024,4,26576
86,2024,5,23676
87,2024,6,21133
88,2024,7,19929


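Append each driver's last known data point until that driver's DataFrame is as long as the longest DataFrame of the same race. Appended rows are flagged with `artifical_pos = True`; original rows get `artifical_pos = False`.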

In [8]:
# Lazy polars view of the pandas frame, used for the per-driver filtering.
pl_race_data = pl.from_pandas(data=race_data).lazy()

In [9]:
# Pad every driver's rows for a race with copies of that driver's last row until
# the driver has as many rows as the race's "max_datapoints". Original rows get
# artifical_pos == False, appended rows get artifical_pos == True.

# Target length per (year, round), built once instead of masking the summary
# frame again for every driver.
target_length_by_race = dict(
    zip(
        zip(
            max_number_data_points_per_race["year"],
            max_number_data_points_per_race["round_number"],
        ),
        max_number_data_points_per_race["max_datapoints"],
    )
)

# Split the data into per-driver frames in a single pass. Filtering the complete
# dataset once per driver rescans every row for each of the groups; grouping
# race_data directly in pandas yields the same rows in the same order.
# reset_index gives each frame its own 0..n-1 index and an independent copy.
driver_frames = {
    group_key: driver_rows.reset_index(drop=True)
    for group_key, driver_rows in race_data.groupby(
        ["round_number", "year", "driver_number"], sort=False
    )
}

new_race_data_list = []
for gp, year, driver_number in tqdm(grouped_race_data.index):
    # pop() releases the per-driver frame from the lookup once it is processed.
    filtered_race_data = driver_frames.pop((gp, year, driver_number))
    filtered_race_data["artifical_pos"] = False
    diff_max_points = target_length_by_race[(year, gp)] - len(filtered_race_data)

    # Only pad when rows are actually missing; a driver that already meets (or
    # exceeds) the target length is kept unchanged.
    if diff_max_points > 0:
        # Repeating the last row by position keeps the column dtypes intact,
        # unlike rebuilding a frame from copies of an object-dtype row Series.
        df_to_add = filtered_race_data.iloc[[-1] * diff_max_points].copy()
        df_to_add["artifical_pos"] = True
        new_race_data_list.append(pd.concat([filtered_race_data, df_to_add]))
    else:
        new_race_data_list.append(filtered_race_data)


100%|██████████| 1794/1794 [01:16<00:00, 23.45it/s]


In [10]:
# Stack the padded per-driver frames back into one long frame (row-wise).
modified_race_data = pd.concat(objs=new_race_data_list, axis=0)

In [11]:
# Sanity check: after padding, the "x" count should be identical for all drivers
# of the same race.
(
    modified_race_data
    .groupby(["round_number", "year", "driver_number"])
    .count()
    .sort_values(
        by=["year", "round_number", "x"],
        ascending=[True, True, False],
    )
)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,LapNumber,x,y,pos_index,artifical_pos
round_number,year,driver_number,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,2020,3,161,25698,25698,25698,25698
1,2020,4,904,25698,25698,25698,25698
1,2020,5,898,25698,25698,25698,25698
1,2020,6,895,25698,25698,25698,25698
1,2020,7,655,25698,25698,25698,25698
...,...,...,...,...,...,...,...
8,2024,44,898,33486,33486,33486,33486
8,2024,55,902,33486,33486,33486,33486
8,2024,63,910,33486,33486,33486,33486
8,2024,77,887,33486,33486,33486,33486


In [14]:
# Forward-fill gaps separately for every (year, round, driver) block before
# writing. A plain DataFrame.ffill() on the concatenated frame would carry the
# last values of one driver into leading gaps of the next driver's rows.
fill_keys = ["year", "round_number", "driver_number"]

# The concatenated frame can repeat index labels, so the column assignment is
# done on a unique positional index; the original labels are restored afterwards
# so the written file keeps the same index as before.
original_index = modified_race_data.index
filled_race_data = modified_race_data.reset_index(drop=True)
value_columns = [
    column for column in filled_race_data.columns if column not in fill_keys
]
filled_race_data[value_columns] = (
    filled_race_data
    .groupby(fill_keys, sort=False, dropna=False)[value_columns]
    .ffill()
)
filled_race_data.index = original_index

filled_race_data.to_parquet("race_data.parquet")