# First Model training

## Imports

In [71]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import pandas as pd

DATA_PATH = 'data/grandprix_features.csv'

In [72]:
# Load the feature table from disk, then show its first rows followed by
# the dtype pandas inferred for every column.
df = pd.read_csv(DATA_PATH)
print(df.head(), df.dtypes, sep="\n")

   year  round               event driver          team  quali_position  \
0  2022      1  Bahrain Grand Prix    LEC       Ferrari               1   
1  2022      1  Bahrain Grand Prix    SAI       Ferrari               3   
2  2022      1  Bahrain Grand Prix    HAM      Mercedes               5   
3  2022      1  Bahrain Grand Prix    RUS      Mercedes               9   
4  2022      1  Bahrain Grand Prix    MAG  Haas F1 Team               7   

   avg_race_lap_time_s  finish_position  points_awarded  prev_points_total  \
0            97.604208              1.0            26.0                0.0   
1            98.079957              2.0            18.0                0.0   
2            98.266244              3.0            15.0                0.0   
3            98.639022              4.0            12.0                0.0   
4            98.852833              5.0            10.0                0.0   

   scored_points  
0              1  
1              1  
2              1  
3   

In [73]:
# Inspect the data distribution
def inspect_csv(path: str):
    """Print a short preview of a CSV file and how often each driver occurs.

    Args:
        path: Location of the CSV file. It must contain a ``driver`` column.

    Returns:
        None. The first three rows and the per-driver row counts are
        written to standard output.
    """
    table = pd.read_csv(path)

    preview = table.head(3)
    print(preview)

    print("Driver value counts:")
    rows_per_driver = table["driver"].value_counts()
    print(rows_per_driver)

# Print the preview and per-driver row counts for the CSV at DATA_PATH.
inspect_csv(DATA_PATH)

   year  round               event driver      team  quali_position  \
0  2022      1  Bahrain Grand Prix    LEC   Ferrari               1   
1  2022      1  Bahrain Grand Prix    SAI   Ferrari               3   
2  2022      1  Bahrain Grand Prix    HAM  Mercedes               5   

   avg_race_lap_time_s  finish_position  points_awarded  prev_points_total  \
0            97.604208              1.0            26.0                0.0   
1            98.079957              2.0            18.0                0.0   
2            98.266244              3.0            15.0                0.0   

   scored_points  
0              1  
1              1  
2              1  
Driver value counts:
driver
LEC    68
HAM    68
RUS    68
BOT    68
STR    68
TSU    68
ZHO    68
ALO    68
PER    68
VER    68
GAS    68
NOR    68
ALB    67
SAI    67
OCO    67
MAG    66
HUL    48
RIC    47
PIA    46
SAR    36
MSC    22
LAT    22
VET    20
DEV    11
LAW    11
COL     9
BEA     3
DOO     1
Name: count, dtype

In [74]:
# Convert drivers, teams and events into integer indices.
# Each vocabulary is the sorted list of distinct values in its column, so a
# value's index is simply its position in that list.
drivers = sorted(df["driver"].unique())
teams = sorted(df["team"].unique())
events = sorted(df["event"].unique())

driver_to_idx = dict(zip(drivers, range(len(drivers))))
team_to_idx = dict(zip(teams, range(len(teams))))
event_to_idx = dict(zip(events, range(len(events))))

# Inverse mappings (index -> name)
idx_to_driver = dict(enumerate(drivers))
idx_to_team = dict(enumerate(teams))
idx_to_event = dict(enumerate(events))

print(f"Drivers: {driver_to_idx}")
print(f"Teams: {team_to_idx}")
print(f"Events: {event_to_idx}")

Drivers: {'ALB': 0, 'ALO': 1, 'BEA': 2, 'BOT': 3, 'COL': 4, 'DEV': 5, 'DOO': 6, 'GAS': 7, 'HAM': 8, 'HUL': 9, 'LAT': 10, 'LAW': 11, 'LEC': 12, 'MAG': 13, 'MSC': 14, 'NOR': 15, 'OCO': 16, 'PER': 17, 'PIA': 18, 'RIC': 19, 'RUS': 20, 'SAI': 21, 'SAR': 22, 'STR': 23, 'TSU': 24, 'VER': 25, 'VET': 26, 'ZHO': 27}
Teams: {'Alfa Romeo': 0, 'AlphaTauri': 1, 'Alpine': 2, 'Aston Martin': 3, 'Ferrari': 4, 'Haas F1 Team': 5, 'Kick Sauber': 6, 'McLaren': 7, 'Mercedes': 8, 'RB': 9, 'Red Bull Racing': 10, 'Williams': 11}
Events: {'Abu Dhabi Grand Prix': 0, 'Australian Grand Prix': 1, 'Austrian Grand Prix': 2, 'Azerbaijan Grand Prix': 3, 'Bahrain Grand Prix': 4, 'Belgian Grand Prix': 5, 'British Grand Prix': 6, 'Canadian Grand Prix': 7, 'Chinese Grand Prix': 8, 'Dutch Grand Prix': 9, 'Emilia Romagna Grand Prix': 10, 'French Grand Prix': 11, 'Hungarian Grand Prix': 12, 'Italian Grand Prix': 13, 'Japanese Grand Prix': 14, 'Las Vegas Grand Prix': 15, 'Mexico City Grand Prix': 16, 'Miami Grand Prix': 17, 'M

## Dataset class

In [75]:
class GrandPrixDataset(Dataset):
    """Map-style dataset over the rows of a Grand Prix feature CSV.

    Every item is a pair ``(x, y)``:

    * ``x["numeric"]``: float32 tensor ``[year, round, quali_position]``
    * ``x["categorical"]``: int64 tensor ``[driver_id, team_id, event_id]``
    * ``y``: float32 value of the ``scored_points`` column

    Categorical ids are the positions of a value in the sorted list of
    distinct values found in the respective column of the loaded CSV
    (``self.drivers``, ``self.teams``, ``self.events``).

    ``avg_race_lap_time_s`` and ``prev_points_total`` are loaded into
    ``self.avg_lap_times`` / ``self.prev_points`` but are not part of the
    features returned by ``__getitem__``.
    """

    def __init__(self, csv_path: str):
        """Read ``csv_path`` and convert all used columns to tensors.

        Args:
            csv_path: Path of a CSV file providing the columns ``year``,
                ``round``, ``quali_position``, ``avg_race_lap_time_s``,
                ``prev_points_total``, ``driver``, ``team``, ``event`` and
                ``scored_points``.
        """
        df = pd.read_csv(csv_path)

        # Vocabularies: sorted distinct values, index == position in the list.
        self.drivers = sorted(df["driver"].unique())
        self.teams = sorted(df["team"].unique())
        self.events = sorted(df["event"].unique())

        self.driver_to_idx = {d: i for i, d in enumerate(self.drivers)}
        self.team_to_idx = {t: i for i, t in enumerate(self.teams)}
        self.event_to_idx = {e: i for i, e in enumerate(self.events)}

        # Per-column tensors (kept as public attributes).
        self.years = torch.tensor(df["year"].values, dtype=torch.int64)
        self.rounds = torch.tensor(df["round"].values, dtype=torch.int64)
        self.quali_positions = torch.tensor(df["quali_position"].values, dtype=torch.int64)
        self.avg_lap_times = torch.tensor(df["avg_race_lap_time_s"].values, dtype=torch.float32)
        self.prev_points = torch.tensor(df["prev_points_total"].values, dtype=torch.float32)

        driver_ids = df["driver"].map(self.driver_to_idx).values
        team_ids = df["team"].map(self.team_to_idx).values
        event_ids = df["event"].map(self.event_to_idx).values

        self.driver_ids = torch.tensor(driver_ids, dtype=torch.int64)
        self.team_ids = torch.tensor(team_ids, dtype=torch.int64)
        self.event_ids = torch.tensor(event_ids, dtype=torch.int64)

        self.targets = torch.tensor(df["scored_points"].values, dtype=torch.float32)

        # Assemble the feature matrices once, one row per sample, instead of
        # rebuilding small tensors from Python lists on every __getitem__ call.
        self.numeric_features = torch.stack(
            [self.years, self.rounds, self.quali_positions], dim=1
        ).to(torch.float32)
        self.categorical_features = torch.stack(
            [self.driver_ids, self.team_ids, self.event_ids], dim=1
        )

    def __len__(self):
        """Return the number of samples (rows of the CSV)."""
        return len(self.targets)

    def __getitem__(self, idx):
        """Return the features and target for ``idx``.

        Args:
            idx: Anything accepted by tensor indexing. An integer yields one
                sample; a slice or index list/tensor yields the selected
                samples with an extra leading dimension.

        Returns:
            Tuple ``(x, y)`` where ``x`` is a dict with the keys
            ``"numeric"`` (float32) and ``"categorical"`` (int64) and ``y``
            is the float32 target. For integer and slice indices the tensors
            are views into the dataset's storage, so they should not be
            modified in place.
        """
        x = {
            "numeric": self.numeric_features[idx],
            "categorical": self.categorical_features[idx],
        }

        y = self.targets[idx]
        return x, y

## Train, val, test split

In [76]:
# Build the full dataset, then split it 60 % / 20 % / 20 % into
# train / validation / test subsets.
dataset = GrandPrixDataset(DATA_PATH)

# The first two sizes are truncated to integers; the test split receives
# whatever is left so the three sizes always add up to len(dataset).
train_size, val_size = int(0.6 * len(dataset)), int(0.2 * len(dataset))
test_size = len(dataset) - (train_size + val_size)

# A fixed seed keeps the random assignment of rows to splits reproducible.
generator = torch.Generator().manual_seed(42)

train_ds, val_ds, test_ds = torch.utils.data.random_split(
    dataset,
    lengths=[train_size, val_size, test_size],
    generator=generator,
)

print(len(train_ds), len(val_ds), len(test_ds))

815 271 273


In [77]:
# Only the training loader reshuffles its samples; validation and test
# batches are served in the subsets' stored order.
train_loader = DataLoader(dataset=train_ds, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_ds, batch_size=32, shuffle=False)
test_loader = DataLoader(dataset=test_ds, batch_size=32, shuffle=False)

# Sanity check: pull a single training batch and report the tensor shapes.
for batch in train_loader:
    x, y = batch
    print("Numeric features shape:", x["numeric"].shape)
    print("Categorical features shape:", x["categorical"].shape)
    print("Targets shape:", y.shape)
    break  # one batch is enough

Numeric features shape: torch.Size([32, 3])
Categorical features shape: torch.Size([32, 3])
Targets shape: torch.Size([32])
