In [1]:
import polars as pl
from polars import col as c
import datetime as dt

In [2]:
# Open both parquet files lazily; nothing is read from disk until a later
# cell calls .collect() on the resulting LazyFrames.
race_data, lap_data = (
    pl.scan_parquet(parquet_path)
    for parquet_path in (
        "./static/data/race_data.parquet",  # per-driver position samples
        "./static/data/all_laps.parquet",   # per-lap timing rows
    )
)


In [3]:
# Show the column names and dtypes of both frames, one schema per line,
# to see which join keys need their dtypes aligned.
print(race_data.schema, lap_data.schema, sep="\n")

OrderedDict({'x': Float64, 'y': Float64, 'round_number': Int8, 'year': Int16, 'driver_number': Int8, 'data': Datetime(time_unit='ns', time_zone=None), 'pos_index': UInt16, 'artifical_pos': Boolean})
OrderedDict({'round_number': Int64, 'year': Int64, 'Time': Duration(time_unit='ns'), 'Driver': String, 'DriverNumber': String, 'LapTime': Duration(time_unit='ns'), 'LapNumber': Float64, 'Stint': Float64, 'PitOutTime': Duration(time_unit='ns'), 'PitInTime': Duration(time_unit='ns'), 'Sector1Time': Duration(time_unit='ns'), 'Sector2Time': Duration(time_unit='ns'), 'Sector3Time': Duration(time_unit='ns'), 'Sector1SessionTime': Duration(time_unit='ns'), 'Sector2SessionTime': Duration(time_unit='ns'), 'Sector3SessionTime': Duration(time_unit='ns'), 'SpeedI1': Float64, 'SpeedI2': Float64, 'SpeedFL': Float64, 'SpeedST': Float64, 'IsPersonalBest': Boolean, 'Compound': String, 'TyreLife': Float64, 'FreshTyre': Boolean, 'Team': String, 'LapStartTime': Duration(time_unit='ns'), 'LapStartDate': Datetim

In [4]:
# Prepare the position samples for joining against the lap table:
#   * expose the timestamp column under the name "date" (it is stored as "data"),
#   * round it to whole seconds, the same resolution applied to LapStartDate.
#
# The cell rebinds `race_data`, so it has to tolerate being run more than once:
# `.lazy()` accepts both the LazyFrame produced by the scan cell and the
# DataFrame this cell leaves behind, and the rename is skipped once the
# "data" column has already been renamed.
race_lazy = race_data.lazy()
if "data" in race_lazy.columns:
    race_lazy = race_lazy.rename({"data": "date"})
race_data = race_lazy.with_columns(c("date").dt.round("1s")).collect()
race_data

x,y,round_number,year,driver_number,date,pos_index,artifical_pos
f64,f64,i8,i16,i8,datetime[ns],u16,bool
1583.662599,-1055.522038,1,2020,6,2020-07-05 13:12:47,0,false
1583.662599,-1055.522038,1,2020,6,2020-07-05 13:12:48,1,false
1583.662599,-1055.522038,1,2020,6,2020-07-05 13:12:48,2,false
1583.662599,-1055.522038,1,2020,6,2020-07-05 13:12:48,3,false
1583.662599,-1055.522038,1,2020,6,2020-07-05 13:12:48,4,false
…,…,…,…,…,…,…,…
-11049.250563,-1381.68665,8,2024,31,2024-05-26 13:44:08,9444,true
-11049.250563,-1381.68665,8,2024,31,2024-05-26 13:44:08,9444,true
-11049.250563,-1381.68665,8,2024,31,2024-05-26 13:44:08,9444,true
-11049.250563,-1381.68665,8,2024,31,2024-05-26 13:44:08,9444,true


In [5]:
# Align the lap table's join keys with the position table before joining:
# year -> Int16, round_number -> Int8, DriverNumber (String in the file) -> Int8,
# and LapStartDate rounded to whole seconds like the position timestamps.
#
# `.lazy()` is a no-op on the LazyFrame from the scan cell and converts the
# DataFrame this cell leaves behind, so the cell can be re-run without
# restarting the kernel (the casts and the rounding are themselves idempotent).
lap_data = (
    lap_data.lazy()
    .with_columns(
        c("year").cast(pl.Int16),
        c("round_number").cast(pl.Int8),
        c("DriverNumber").cast(pl.Int8),
        c("LapStartDate").dt.round("1s"),
    )
    .collect()
)
lap_data


round_number,year,Time,Driver,DriverNumber,LapTime,LapNumber,Stint,PitOutTime,PitInTime,Sector1Time,Sector2Time,Sector3Time,Sector1SessionTime,Sector2SessionTime,Sector3SessionTime,SpeedI1,SpeedI2,SpeedFL,SpeedST,IsPersonalBest,Compound,TyreLife,FreshTyre,Team,LapStartTime,LapStartDate,TrackStatus,Position,Deleted,DeletedReason,FastF1Generated,IsAccurate,__index_level_0__
i8,i16,duration[ns],str,i8,duration[ns],f64,f64,duration[ns],duration[ns],duration[ns],duration[ns],duration[ns],duration[ns],duration[ns],duration[ns],f64,f64,f64,f64,bool,str,f64,bool,str,duration[ns],datetime[ns],str,f64,bool,str,bool,bool,i64
1,2020,34m 6s 353ms,"""GAS""",10,1m 19s 106ms,1.0,1.0,,,,33s 352ms,23s 247ms,,33m 43s 79ms,34m 6s 511ms,309.0,217.0,274.0,290.0,false,"""MEDIUM""",1.0,true,"""AlphaTauri""",32m 47s 6ms,2020-07-05 13:12:47,"""1""",12.0,false,"""""",false,false,0
1,2020,35m 18s 765ms,"""GAS""",10,1m 12s 412ms,2.0,1.0,,,17s 960ms,32s 228ms,22s 224ms,34m 24s 262ms,34m 56s 490ms,35m 18s 714ms,282.0,226.0,270.0,292.0,true,"""MEDIUM""",2.0,true,"""AlphaTauri""",34m 6s 353ms,2020-07-05 13:14:07,"""1""",12.0,false,"""""",false,true,1
1,2020,36m 30s 76ms,"""GAS""",10,1m 11s 311ms,3.0,1.0,,,17s 513ms,31s 717ms,22s 81ms,35m 36s 227ms,36m 7s 944ms,36m 30s 25ms,307.0,224.0,277.0,311.0,true,"""MEDIUM""",3.0,true,"""AlphaTauri""",35m 18s 765ms,2020-07-05 13:15:19,"""1""",12.0,false,"""""",false,true,2
1,2020,37m 40s 801ms,"""GAS""",10,1m 10s 725ms,4.0,1.0,,,17s 410ms,31s 510ms,21s 805ms,36m 47s 435ms,37m 18s 945ms,37m 40s 750ms,309.0,218.0,275.0,309.0,true,"""MEDIUM""",4.0,true,"""AlphaTauri""",36m 30s 76ms,2020-07-05 13:16:30,"""1""",12.0,false,"""""",false,true,3
1,2020,38m 52s 312ms,"""GAS""",10,1m 11s 511ms,5.0,1.0,,,17s 464ms,31s 827ms,22s 220ms,37m 58s 214ms,38m 30s 41ms,38m 52s 261ms,303.0,219.0,268.0,296.0,false,"""MEDIUM""",5.0,true,"""AlphaTauri""",37m 40s 801ms,2020-07-05 13:17:41,"""1""",12.0,false,"""""",false,true,4
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
8,2024,3h 19m 16s 827ms,"""ZHO""",24,1m 17s 173ms,76.0,3.0,,,20s 460ms,35s 927ms,20s 786ms,3h 18m 20s 81ms,3h 18m 56s 8ms,3h 19m 16s 794ms,205.0,188.0,258.0,278.0,false,"""SOFT""",6.0,true,"""Kick Sauber""",3h 17m 59s 654ms,2024-05-26 15:26:05,"""1""",16.0,false,"""""",false,true,1232
8,2024,1h 36m 2s 379ms,"""OCO""",31,,1.0,1.0,,,,,,,,,,,,,false,"""HARD""",1.0,true,"""Alpine""",55m 5s 680ms,2024-05-26 13:03:11,"""1254""",3.0,false,"""""",true,false,1233
8,2024,1h 36m 2s 379ms,"""PER""",11,,1.0,1.0,,,,,,,,,,,,,false,"""HARD""",1.0,true,"""Red Bull Racin…",55m 5s 680ms,2024-05-26 13:03:11,"""1254""",2.0,false,"""""",true,false,1234
8,2024,1h 36m 2s 379ms,"""HUL""",27,,1.0,1.0,,,,,,,,,,,,,false,"""MEDIUM""",1.0,true,"""Haas F1 Team""",55m 5s 680ms,2024-05-26 13:03:11,"""1254""",4.0,false,"""""",true,false,1235


In [6]:
# Attach lap information to the position samples. A left join keeps every
# position row; lap columns are filled only where the sample's rounded
# timestamp equals a lap's rounded start time for the same race and driver.
# NOTE(review): this is an exact-equality match, so samples that fall between
# lap starts get a null LapNumber, and several lap rows sharing the same key
# multiply the position row — confirm this is intended (an as-of join may be
# what is wanted here).
merged_race_data = race_data.join(
    lap_data,
    how="left",
    left_on=["year", "round_number", "date", "driver_number"],
    right_on=["year", "round_number", "LapStartDate", "DriverNumber"],
)

In [7]:
# Keep only the race/driver identifiers, the lap number, and the position
# fields. "artifical_pos" is spelled as the column is named in the input data.
merged_race_data = merged_race_data.select(
    "round_number",
    "year",
    "driver_number",
    "LapNumber",
    "x",
    "y",
    "pos_index",
    "artifical_pos",
)

In [11]:
# Display the joined and trimmed frame as the cell's output for inspection.
merged_race_data

round_number,year,driver_number,LapNumber,x,y,pos_index,artifical_pos
i8,i16,i8,f64,f64,f64,u16,bool
1,2020,6,1.0,1583.662599,-1055.522038,0,false
1,2020,6,1.0,1583.662599,-1055.522038,0,false
1,2020,6,1.0,1583.662599,-1055.522038,0,false
1,2020,6,1.0,1583.662599,-1055.522038,0,false
1,2020,6,1.0,1583.662599,-1055.522038,0,false
…,…,…,…,…,…,…,…
8,2024,31,,-11049.250563,-1381.68665,9444,true
8,2024,31,,-11049.250563,-1381.68665,9444,true
8,2024,31,,-11049.250563,-1381.68665,9444,true
8,2024,31,,-11049.250563,-1381.68665,9444,true


In [16]:
# Collapse the joined frame to one row per (race, driver, position sample),
# keeping the first occurrence in the existing row order, then persist it.
dedupe_keys = ["round_number", "year", "driver_number", "pos_index"]
deduped_positions = merged_race_data.unique(
    subset=dedupe_keys,
    keep="first",
    maintain_order=True,
)
deduped_positions.write_parquet("tester.parquet")