In [1]:
# %%
import os
import time
from contextlib import contextmanager
from typing import Dict, Optional

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import (
    roc_auc_score,
    accuracy_score,
    f1_score,
    mean_absolute_error,
    mean_squared_error,
)

import torch

# RelBench
from relbench.datasets import get_dataset
from relbench.tasks import get_task

# TabPFN
from tabpfn import TabPFNClassifier, TabPFNRegressor

# Device preference
if torch.backends.mps.is_available():
    DEVICE = "mps"
elif torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

print(f"Using device: {DEVICE}")


Using device: mps


In [2]:
# %%
@contextmanager
def elapsed_timer():
    start = time.perf_counter()
    yield lambda: time.perf_counter() - start


def classification_metrics(y_true, y_pred, y_prob=None) -> Dict[str, float]:
    out = {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1_macro": f1_score(y_true, y_pred, average="macro"),
    }
    if y_prob is not None:
        try:
            out["roc_auc"] = roc_auc_score(y_true, y_prob)
        except Exception:
            out["roc_auc"] = np.nan
    else:
        out["roc_auc"] = np.nan
    return out


def regression_metrics(y_true, y_pred) -> Dict[str, float]:
    return {
        "mae": mean_absolute_error(y_true, y_pred),
        "mse": mean_squared_error(y_true, y_pred),
    }


In [3]:
# %%
dataset = get_dataset("rel-f1")
db = dataset.get_db()

def to_pandas(table):
    if hasattr(table, "to_pandas"):
        return table.to_pandas()
    if hasattr(table, "df"):
        return table.df
    raise ValueError("Unknown table type")

# Load key tables
tables = {}
for name in db.table_dict:
    tables[name] = to_pandas(db.table_dict[name])

for t in tables.values():
    for col in t.columns:
        if "date" in col.lower():
            t[col] = pd.to_datetime(t[col], errors="coerce")


Loading Database object from /Users/michaelflppv/Library/Caches/relbench/rel-f1/db...
Done in 0.05 seconds.


In [4]:
def run_single_table_experiment(task_name: str):
    task = get_task("rel-f1", task_name)

    train_table = task.get_table("train")
    val_table = task.get_table("val")
    test_table = task.get_table("test", mask_input_cols=False)

    df = train_table.df
    # Limit samples to 500
    df = df.sample(n=min(500, len(df)), random_state=42)
    X_train = df.drop(columns=[task.target_col]).select_dtypes(include=[np.number])
    y_train = df[task.target_col]

    df = val_table.df
    df = df.sample(n=min(500, len(df)), random_state=42)
    X_val = df.drop(columns=[task.target_col]).select_dtypes(include=[np.number])
    y_val = df[task.target_col]

    df = test_table.df
    df = df.sample(n=min(500, len(df)), random_state=42)
    X_test = df.drop(columns=[task.target_col]).select_dtypes(include=[np.number])
    y_test = df[task.target_col]

    if task_name == "driver-position":
        model = TabPFNRegressor(device=DEVICE)
        metric_fn = regression_metrics
        prob_val = prob_test = None
    else:
        model = TabPFNClassifier(device=DEVICE, ignore_pretraining_limits=True)
        metric_fn = classification_metrics

    with elapsed_timer() as t:
        model.fit(X_train, y_train)
    fit_time = t()

    with elapsed_timer() as t:
        y_val_pred = model.predict(X_val)
    pred_time_val = t()

    with elapsed_timer() as t:
        y_test_pred = model.predict(X_test)
    pred_time_test = t()

    if task_name != "driver-position":
        try:
            prob_val  = model.predict_proba(X_val)[:, 1]
            prob_test = model.predict_proba(X_test)[:, 1]
        except Exception:
            prob_val = prob_test = None

    res = {
        "val": {
            **metric_fn(y_val, y_val_pred, prob_val),
            "fit_time": fit_time,
            "predict_time": pred_time_val,
            "primary_metric_relbench": task.evaluate(prob_val if prob_val is not None else y_val_pred, "val")
        },
        "test": {
            **metric_fn(y_test, y_test_pred, prob_test),
            "fit_time": fit_time,
            "predict_time": pred_time_test,
            "primary_metric_relbench": task.evaluate(prob_test if prob_test is not None else y_test_pred)
        }
    }
    return res

In [5]:
# Python
def engineer_driver_features():
    results = tables["results"].merge(
        tables["races"][["raceId", "date"]],
        on="raceId", how="left"
    )

    print("results columns after merge:", results.columns.tolist())
    print("Number of rows after merge:", results.shape[0])
    results = results.dropna(subset=["driverId", "date_y"])
    print("Number of rows after dropna:", results.shape[0])

    results["dnf_flag"] = (~results["positionOrder"].isna()).astype(int)
    feats = results.groupby("driverId").expanding().agg({
        "positionOrder": "mean",
        "points": "mean",
        "dnf_flag": "mean",
        "laps": "mean"
    }).reset_index()
    feats = feats.rename(columns={
        "positionOrder": "avg_position",
        "points": "avg_points",
        "dnf_flag": "dnf_rate",
        "laps": "avg_laps"
    })
    feats["date"] = results["date_y"].values
    return feats

driver_feats = engineer_driver_features()


results columns after merge: ['resultId', 'raceId', 'driverId', 'constructorId', 'number', 'grid', 'position', 'positionOrder', 'points', 'laps', 'milliseconds', 'fastestLap', 'rank', 'statusId', 'date_x', 'date_y']
Number of rows after merge: 20323
Number of rows after dropna: 20323


In [6]:
# %%
def run_merged_table_experiment(task_name: str):
    task = get_task("rel-f1", task_name)

    train_table = task.get_table("train")
    val_table = task.get_table("val")
    test_table = task.get_table("test", mask_input_cols=False)

    df = train_table.df
    df = df.sample(n=min(500, len(df)), random_state=42)
    X_train = df.drop(columns=[task.target_col]).select_dtypes(include=[np.number])
    y_train = df[task.target_col]

    df = val_table.df
    df = df.sample(n=min(500, len(df)), random_state=42)
    X_val = df.drop(columns=[task.target_col]).select_dtypes(include=[np.number])
    y_val = df[task.target_col]

    df = test_table.df
    df = df.sample(n=min(500, len(df)), random_state=42)
    X_test = df.drop(columns=[task.target_col]).select_dtypes(include=[np.number])
    y_test = df[task.target_col]

    idx_train = task.get_index("train")  # (entity_id, date)
    idx_val   = task.get_index("val")
    idx_test  = task.get_index("test")

    # Merge features by driverId and date (backward asof join)
    def enrich(X, idx):
        df = X.copy()
        idx_df = pd.DataFrame(idx, columns=["driverId", "date"])
        idx_df["date"] = pd.to_datetime(idx_df["date"])
        merged = pd.merge_asof(
            idx_df.sort_values("date"),
            driver_feats.sort_values("date"),
            on="date", by="driverId",
            direction="backward", tolerance=pd.Timedelta("36500D")
        )
        merged = merged.drop(columns=["driverId", "date"])
        df = pd.concat([df.reset_index(drop=True), merged.reset_index(drop=True)], axis=1)
        return df

    X_train_en = enrich(X_train, idx_train)
    X_val_en   = enrich(X_val, idx_val)
    X_test_en  = enrich(X_test, idx_test)

    if task_name == "driver-position":
        model = TabPFNRegressor(device=DEVICE)
        metric_fn = regression_metrics
        prob_val = prob_test = None
    else:
        model = TabPFNClassifier(device=DEVICE)
        metric_fn = classification_metrics

    with elapsed_timer() as t:
        model.fit(X_train_en, y_train)
    fit_time = t()

    with elapsed_timer() as t:
        y_val_pred = model.predict(X_val_en)
    pred_time_val = t()

    with elapsed_timer() as t:
        y_test_pred = model.predict(X_test_en)
    pred_time_test = t()

    if task_name != "driver-position":
        try:
            prob_val  = model.predict_proba(X_val_en)[:, 1]
            prob_test = model.predict_proba(X_test_en)[:, 1]
        except Exception:
            prob_val = prob_test = None

    res = {
        "val": {
            **metric_fn(y_val, y_val_pred, prob_val),
            "fit_time": fit_time,
            "predict_time": pred_time_val,
            "primary_metric_relbench": task.evaluate(prob_val if prob_val is not None else y_val_pred, "val")
        },
        "test": {
            **metric_fn(y_test, y_test_pred, prob_test),
            "fit_time": fit_time,
            "predict_time": pred_time_test,
            "primary_metric_relbench": task.evaluate(prob_test if prob_test is not None else y_test_pred)
        }
    }
    return res


In [7]:
# %%
TASKS = ["driver-dnf", "driver-top3", "driver-position"]

all_results = []

for task in TASKS:
    print(f"=== {task} | Single Table ===")
    res_single = run_single_table_experiment(task)
    for split, metrics in res_single.items():
        all_results.append({"task": task, "setting": "single", "split": split, **metrics})

    print(f"=== {task} | Merged Table ===")
    res_merged = run_merged_table_experiment(task)
    for split, metrics in res_merged.items():
        all_results.append({"task": task, "setting": "merged", "split": split, **metrics})

results_df = pd.DataFrame(all_results)
results_df


=== driver-dnf | Single Table ===


AttributeError: 'str' object has no attribute 'df'

In [None]:
# %%
def plot_metric(metric):
    sub = results_df[(results_df["split"] == "test") & results_df[metric].notna()]
    if sub.empty:
        return
    pivot = sub.pivot(index="task", columns="setting", values=metric)
    pivot.plot(kind="bar", figsize=(8, 4), title=metric)
    plt.grid(True, axis="y")
    plt.show()

for m in ["roc_auc", "f1_macro", "mae", "mse", "fit_time", "predict_time"]:
    plot_metric(m)
