In [71]:
# %%
import os
import time
from contextlib import contextmanager
from typing import Dict, Optional

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import (
    roc_auc_score,
    accuracy_score,
    f1_score,
    mean_absolute_error,
    #mean_squared_error,
)

import torch

# RelBench
from relbench.datasets import get_dataset
from relbench.tasks import get_task

# TabPFN
from tabpfn import TabPFNClassifier, TabPFNRegressor

# Device preference
if torch.backends.mps.is_available():
    DEVICE = "mps"
elif torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

print(f"Using device: {DEVICE}")


Using device: mps


In [72]:
# %%
@contextmanager
def elapsed_timer():
    start = time.perf_counter()
    yield lambda: time.perf_counter() - start


def classification_metrics(y_true, y_pred, y_prob=None) -> Dict[str, float]:
    out = {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1_macro": f1_score(y_true, y_pred, average="macro"),
    }
    if y_prob is not None:
        try:
            out["roc_auc"] = roc_auc_score(y_true, y_prob)
        except Exception:
            out["roc_auc"] = np.nan
    else:
        out["roc_auc"] = np.nan
    return out


def regression_metrics(y_true, y_pred) -> Dict[str, float]:
    return {
        "mae": mean_absolute_error(y_true, y_pred),
        #"mse": mean_squared_error(y_true, y_pred),
    }


In [73]:
# %%
dataset = get_dataset("rel-f1")
db = dataset.get_db()

def to_pandas(table):
    if hasattr(table, "to_pandas"):
        return table.to_pandas()
    if hasattr(table, "df"):
        return table.df
    raise ValueError("Unknown table type")

# Load key tables
tables = {}
for name in db.table_dict:
    tables[name] = to_pandas(db.table_dict[name])

for t in tables.values():
    for col in t.columns:
        if "date" in col.lower():
            t[col] = pd.to_datetime(t[col], errors="coerce")


In [74]:
def run_single_table_experiment(task_name: str):
    global prob_val, prob_test
    task = get_task("rel-f1", task_name)

    train_table = task.get_table("train")
    val_table = task.get_table("val")
    test_table = task.get_table("test", mask_input_cols=False)

    df = train_table.df
    df = df.sample(n=min(1000, len(df)), random_state=42)
    X_train = df.drop(columns=[task.target_col]).select_dtypes(include=[np.number])
    y_train = df[task.target_col]

    df = val_table.df
    df = df.sample(n=min(1000, len(df)), random_state=42)
    X_val = df.drop(columns=[task.target_col]).select_dtypes(include=[np.number])
    y_val = df[task.target_col]

    df = test_table.df
    df = df.sample(n=min(1000, len(df)), random_state=42)
    X_test = df.drop(columns=[task.target_col]).select_dtypes(include=[np.number])
    y_test = df[task.target_col]

    if task_name == "driver-position":
        model = TabPFNRegressor(device=DEVICE)
        metric_fn = regression_metrics
        prob_val = prob_test = None
    else:
        model = TabPFNClassifier(device=DEVICE, ignore_pretraining_limits=True)
        metric_fn = classification_metrics

    with elapsed_timer() as t:
        model.fit(X_train, y_train)
    fit_time = t()

    with elapsed_timer() as t:
        y_val_pred = model.predict(X_val)
    pred_time_val = t()

    with elapsed_timer() as t:
        y_test_pred = model.predict(X_test)
    pred_time_test = t()

    if task_name != "driver-position":
        try:
            prob_val  = model.predict_proba(X_val)[:, 1]
            prob_test = model.predict_proba(X_test)[:, 1]
        except Exception:
            prob_val = prob_test = None

    # Get only the primary metric value (not dict)
    primary_metric_val = list(task.evaluate(
        prob_val if prob_val is not None else y_val_pred,
        target_table=task.get_table("val", mask_input_cols=False)
    ).values())[0]
    primary_metric_test = list(task.evaluate(
        prob_test if prob_test is not None else y_test_pred,
        target_table=task.get_table("test", mask_input_cols=False)
    ).values())[0]

    res = {
        "val": {
            **metric_fn(y_val, y_val_pred, prob_val),
            "fit_time": fit_time,
            "predict_time": pred_time_val,
            "primary_metric_relbench": primary_metric_val,
        },
        "test": {
            **metric_fn(y_test, y_test_pred, prob_test),
            "fit_time": fit_time,
            "predict_time": pred_time_test,
            "primary_metric_relbench": primary_metric_test,
        }
    }
    return res

In [75]:
# Python
def engineer_driver_features():
    results = tables["results"].merge(
        tables["races"][["raceId", "date"]],
        on="raceId", how="left"
    )

    print("results columns after merge:", results.columns.tolist())
    print("Number of rows after merge:", results.shape[0])
    results = results.dropna(subset=["driverId", "date_y"])
    print("Number of rows after dropna:", results.shape[0])

    results["dnf_flag"] = (~results["positionOrder"].isna()).astype(int)
    feats = results.groupby("driverId").expanding().agg({
        "positionOrder": "mean",
        "points": "mean",
        "dnf_flag": "mean",
        "laps": "mean"
    }).reset_index()
    feats = feats.rename(columns={
        "positionOrder": "avg_position",
        "points": "avg_points",
        "dnf_flag": "dnf_rate",
        "laps": "avg_laps"
    })
    feats["date"] = results["date_y"].values
    return feats

driver_feats = engineer_driver_features()


results columns after merge: ['resultId', 'raceId', 'driverId', 'constructorId', 'number', 'grid', 'position', 'positionOrder', 'points', 'laps', 'milliseconds', 'fastestLap', 'rank', 'statusId', 'date_x', 'date_y']
Number of rows after merge: 20323
Number of rows after dropna: 20323


In [76]:
# %%
def run_merged_table_experiment(task_name: str):
    global prob_val, prob_test
    task = get_task("rel-f1", task_name)

    train_table = task.get_table("train")
    val_table = task.get_table("val")
    test_table = task.get_table("test", mask_input_cols=False)

    df_train = train_table.df
    df_train = df_train.sample(n=min(1000, len(df_train)), random_state=42)
    idx_train = list(zip(df_train["driverId"], df_train["date"]))
    X_train = df_train.drop(columns=[task.target_col]).select_dtypes(include=[np.number])
    y_train = df_train[task.target_col]

    df_val = val_table.df
    df_val = df_val.sample(n=min(1000, len(df_val)), random_state=42)
    idx_val   = list(zip(df_val["driverId"], df_val["date"]))
    X_val = df_val.drop(columns=[task.target_col]).select_dtypes(include=[np.number])
    y_val = df_val[task.target_col]

    df_test = test_table.df
    df_test = df_test.sample(n=min(1000, len(df_test)), random_state=42)
    idx_test  = list(zip(df_test["driverId"], df_test["date"]))
    X_test = df_test.drop(columns=[task.target_col]).select_dtypes(include=[np.number])
    y_test = df_test[task.target_col]

    # Merge features by driverId and date (backward asof join)
    def enrich(X, idx):
        df = X.copy()
        idx_df = pd.DataFrame(idx, columns=["driverId", "date"])
        idx_df["driverId"] = idx_df["driverId"].astype("int64")
        idx_df["date"] = pd.to_datetime(idx_df["date"])
        driver_feats_fixed = driver_feats.copy()
        driver_feats_fixed["driverId"] = driver_feats_fixed["driverId"].astype("int64")
        merged = pd.merge_asof(
            idx_df.sort_values("date"),
            driver_feats_fixed.sort_values("date"),
            on="date", by="driverId",
            direction="backward", tolerance=pd.Timedelta("3650D")
        )
        merged = merged.drop(columns=["driverId", "date"])
        df = pd.concat([df.reset_index(drop=True), merged.reset_index(drop=True)], axis=1)
        return df

    X_train_en = enrich(X_train, idx_train)
    X_val_en   = enrich(X_val, idx_val)
    X_test_en  = enrich(X_test, idx_test)

    if task_name == "driver-position":
        model = TabPFNRegressor(device=DEVICE)
        metric_fn = regression_metrics
        prob_val = prob_test = None
    else:
        model = TabPFNClassifier(device=DEVICE)
        metric_fn = classification_metrics

    with elapsed_timer() as t:
        model.fit(X_train_en, y_train)
    fit_time = t()

    with elapsed_timer() as t:
        y_val_pred = model.predict(X_val_en)
    pred_time_val = t()

    with elapsed_timer() as t:
        y_test_pred = model.predict(X_test_en)
    pred_time_test = t()

    if task_name != "driver-position":
        try:
            prob_val  = model.predict_proba(X_val_en)[:, 1]
            prob_test = model.predict_proba(X_test_en)[:, 1]
        except Exception:
            prob_val = prob_test = None

    primary_metric_val = list(task.evaluate(
        prob_val if prob_val is not None else y_val_pred,
        target_table=task.get_table("val", mask_input_cols=False)
    ).values())[0]
    primary_metric_test = list(task.evaluate(
        prob_test if prob_test is not None else y_test_pred,
        target_table=task.get_table("test", mask_input_cols=False)
    ).values())[0]

    res = {
        "val": {
            **metric_fn(y_val, y_val_pred, prob_val),
            "fit_time": fit_time,
            "predict_time": pred_time_val,
            "primary_metric_relbench": primary_metric_val,
        },
        "test": {
            **metric_fn(y_test, y_test_pred, prob_test),
            "fit_time": fit_time,
            "predict_time": pred_time_test,
            "primary_metric_relbench": primary_metric_test,
        }
    }
    return res


In [77]:
# %%
TASKS = ["driver-position"]
# "driver-dnf"
# "driver-top3"


all_results = []

for task in TASKS:
    print(f"=== {task} | Single Table ===")
    res_single = run_single_table_experiment(task)
    single_results = []
    for split, metrics in res_single.items():
        result = {"task": task, "setting": "single", "split": split, **metrics}
        all_results.append(result)
        single_results.append(result)
    single_df = pd.DataFrame(single_results)
    print("Table Contents (Single Table):")
    print(single_df)
    single_df.to_csv(f"results_{task}_single.csv", index=False)

    print(f"=== {task} | Merged Table ===")
    res_merged = run_merged_table_experiment(task)
    merged_results = []
    for split, metrics in res_merged.items():
        result = {"task": task, "setting": "merged", "split": split, **metrics}
        all_results.append(result)
        merged_results.append(result)
    merged_df = pd.DataFrame(merged_results)
    print("Table Contents (Merged Table):")
    print(merged_df)
    merged_df.to_csv(f"results_{task}_merged.csv", index=False)

results_df = pd.DataFrame(all_results)
results_df.to_csv("results_summary.csv", index=False)
results_df


=== driver-top3 | Single Table ===
Table Contents (Single Table):
          task setting split  accuracy  f1_macro   roc_auc  fit_time  \
0  driver-top3  single   val  0.794218  0.529640  0.548843  2.904306   
1  driver-top3  single  test  0.792011  0.460807  0.553244  2.904306   

   predict_time  primary_metric_relbench  
0     64.157093                 0.208587  
1     64.925844                 0.167884  
=== driver-top3 | Merged Table ===


  return fnb._ureduce(a,
  return fnb._ureduce(a,
  return fnb._ureduce(a,
  return fnb._ureduce(a,


Table Contents (Merged Table):
          task setting split  accuracy  f1_macro   roc_auc  fit_time  \
0  driver-top3  merged   val  0.794218  0.529640  0.555688  2.892238   
1  driver-top3  merged  test  0.792011  0.460807  0.555282  2.892238   

   predict_time  primary_metric_relbench  
0     69.879107                 0.198263  
1     68.366721                 0.167786  
=== driver-position | Single Table ===


TypeError: got an unexpected keyword argument 'squared'

In [None]:
# %%
def plot_metric(metric):
    sub = results_df[(results_df["split"] == "test") & results_df[metric].notna()]
    if sub.empty:
        return
    pivot = sub.pivot(index="task", columns="setting", values=metric)
    pivot.plot(kind="bar", figsize=(8, 4), title=metric)
    plt.grid(True, axis="y")
    plt.show()

for m in ["roc_auc", "f1_macro", "mae", "fit_time", "predict_time"]:
    plot_metric(m)
