In [1]:
import time
from contextlib import contextmanager
from typing import Dict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import (
    roc_auc_score,
    accuracy_score,
    f1_score,
    mean_absolute_error,
    mean_squared_error,
)

import torch

from tqdm import tqdm  # Add tqdm for progress bar
from itertools import product

# RelBench
from relbench.datasets import get_dataset
from relbench.tasks import get_task

import relbench.metrics
import inspect
from skrub import TableVectorizer
from sklearn.model_selection import train_test_split

# TabPFN
from tabpfn import TabPFNClassifier, TabPFNRegressor

# Pick the compute backend in priority order: Apple MPS, then CUDA, then CPU.
DEVICE = (
    "mps"
    if torch.backends.mps.is_available()
    else "cuda"
    if torch.cuda.is_available()
    else "cpu"
)

print(f"Using device: {DEVICE}")

# Name of the RelBench dataset used by every cell in this notebook.
DATASET = "rel-f1"




Using device: mps


In [2]:
# Rebind `mean_squared_error` on relbench's `skm` namespace to the function imported
# at the top of this notebook.
# NOTE(review): presumably `skm` is relbench's alias for `sklearn.metrics`, which would
# make this assignment a no-op — confirm before relying on it.
relbench.metrics.skm.mean_squared_error = mean_squared_error

def patched_rmse(true, pred):
    """
    Root-mean-squared error that works whether or not the installed
    `mean_squared_error` accepts the `squared` keyword.

    When the keyword exists it is used directly (`squared=False`); otherwise
    the square root of the plain MSE is returned.
    """
    accepts_squared_kw = "squared" in inspect.signature(mean_squared_error).parameters
    if not accepts_squared_kw:
        return np.sqrt(mean_squared_error(true, pred))
    return mean_squared_error(true, pred, squared=False)

# Replace the `rmse` attribute of `relbench.metrics` with the version-tolerant wrapper.
# Code that looks up `relbench.metrics.rmse` after this line gets `patched_rmse`.
relbench.metrics.rmse = patched_rmse

In [3]:
@contextmanager
def elapsed_timer():
    """
    Context manager that yields a zero-argument callable reporting the seconds
    elapsed (via `time.perf_counter`) since the `with` block was entered.

    The callable can be invoked inside or after the block; each call measures
    up to the moment of that call.
    """
    started_at = time.perf_counter()

    def _seconds_since_start():
        return time.perf_counter() - started_at

    yield _seconds_since_start

def classification_metrics(y_true, y_pred, y_prob=None) -> Dict[str, float]:
    """
    Compute accuracy, macro-F1 and ROC-AUC for a classification result.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    y_prob : array-like or None
        Class scores. Accepts a 1-D vector of positive-class scores or the 2-D
        matrix returned by `predict_proba` (one column per class).

    Returns
    -------
    dict
        Keys "accuracy", "f1_macro" and "roc_auc". "roc_auc" is NaN when no
        scores are given or when the AUC is undefined for the inputs.
    """
    out = {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1_macro": f1_score(y_true, y_pred, average="macro"),
        "roc_auc": np.nan,
    }
    if y_prob is None:
        return out

    scores = np.asarray(y_prob)
    auc_kwargs = {}
    if scores.ndim == 2:
        n_classes = scores.shape[1]
        if n_classes == 2:
            # Binary case: roc_auc_score wants a 1-D score for the class with the
            # greater label, which is the second predict_proba column.
            scores = scores[:, 1]
        elif n_classes == 1:
            scores = scores[:, 0]
        else:
            # Multiclass probability matrices need an explicit averaging scheme.
            auc_kwargs["multi_class"] = "ovr"

    try:
        out["roc_auc"] = roc_auc_score(y_true, scores, **auc_kwargs)
    except ValueError:
        # AUC is undefined for these inputs (e.g. only one class in y_true);
        # keep the NaN placeholder.
        out["roc_auc"] = np.nan
    return out

def regression_metrics(y_true, y_pred, y_prob=None) -> Dict[str, float]:
    """
    Compute mean absolute error and mean squared error for a regression result.

    `y_prob` exists only so the signature matches `classification_metrics`;
    it is never read.
    """
    mae_value = mean_absolute_error(y_true, y_pred)
    mse_value = mean_squared_error(y_true, y_pred)
    return {"mae": mae_value, "mse": mse_value}

# Load the RelBench dataset named by DATASET and fetch its database object;
# `db.table_dict` is read below to pull out the individual tables.
dataset = get_dataset(DATASET)
db = dataset.get_db()

def to_pandas(table):
    """
    Return the pandas representation of `table`.

    A `to_pandas()` method is preferred; otherwise the `df` attribute is
    returned. Raises ValueError when the object offers neither.
    """
    if not hasattr(table, "to_pandas"):
        if not hasattr(table, "df"):
            raise ValueError("Unknown table type")
        return table.df
    return table.to_pandas()

# Materialise every database table as a pandas DataFrame, keyed by table name.
tables = {name: to_pandas(db.table_dict[name]) for name in db.table_dict}

# Parse every column whose name contains "date" as datetime;
# values that cannot be parsed become NaT.
for frame in tables.values():
    for date_col in [c for c in frame.columns if "date" in c.lower()]:
        frame[date_col] = pd.to_datetime(frame[date_col], errors="coerce")

Loading Database object from /Users/michaelflppv/Library/Caches/relbench/rel-f1/db...
Done in 0.08 seconds.


In [4]:
# --- NEW CELL: skrub preprocessing helpers ---

def build_tv():
    """
    Create the skrub vectorizer used to encode feature tables.

    A default-configured TableVectorizer is returned; it converts mixed
    (numeric, categorical, text, datetime) columns into a numeric feature matrix.
    """
    vectorizer = TableVectorizer()
    return vectorizer

def fit_transform_splits(tv, X_train_df, X_val_df=None, X_test_df=None):
    """
    Fit the vectorizer on the training frame only, then transform val/test.

    Parameters
    ----------
    tv : object with `fit_transform(df)` and `transform(df)`
        The vectorizer (e.g. the one returned by `build_tv`).
    X_train_df : pandas.DataFrame
        Training features; used for fitting.
    X_val_df, X_test_df : pandas.DataFrame or None
        Optional frames that are only transformed.

    Returns
    -------
    (Xt_train, Xt_val, Xt_test, tv)
        float32 feature matrices (None for a split that was not given) and the
        fitted vectorizer.

    Notes
    -----
    Infinite and missing values are zero-filled in *numeric* columns only.
    Filling every column with 0 would write an integer into datetime and
    string columns and change how the vectorizer encodes them. Missing values
    in non-numeric columns are therefore left for the vectorizer, and any
    non-finite value remaining in a numeric output matrix is zeroed afterwards.
    """
    def _sanitize_numeric(frame):
        # Work on a copy so the caller's frame is never modified.
        if frame is None:
            return None
        frame = frame.copy()
        numeric_cols = frame.select_dtypes(include=[np.number]).columns
        if len(numeric_cols):
            frame[numeric_cols] = (
                frame[numeric_cols].replace([np.inf, -np.inf], np.nan).fillna(0)
            )
        return frame

    def _as_clean_float32(matrix):
        # Cast to float32 and zero out NaN/inf produced by the encoders.
        if matrix is None:
            return None
        matrix = matrix.astype(np.float32)
        if isinstance(matrix, (pd.DataFrame, pd.Series)):
            return matrix.replace([np.inf, -np.inf], np.nan).fillna(0)
        if isinstance(matrix, np.ndarray):
            return np.where(np.isfinite(matrix), matrix, np.float32(0)).astype(np.float32)
        return matrix

    X_train_df = _sanitize_numeric(X_train_df)
    X_val_df = _sanitize_numeric(X_val_df)
    X_test_df = _sanitize_numeric(X_test_df)

    Xt_train = _as_clean_float32(tv.fit_transform(X_train_df))
    Xt_val = _as_clean_float32(tv.transform(X_val_df)) if X_val_df is not None else None
    Xt_test = _as_clean_float32(tv.transform(X_test_df)) if X_test_df is not None else None
    return Xt_train, Xt_val, Xt_test, tv

def _get_df(table):
    """
    Return the DataFrame behind a task table.

    The `df` attribute is preferred; otherwise `to_pandas()` is called.
    Raises ValueError when the object offers neither.
    """
    if hasattr(table, "df"):
        frame = table.df
    elif hasattr(table, "to_pandas"):
        frame = table.to_pandas()
    else:
        raise ValueError("Unknown table type for conversion to DataFrame.")
    return frame


In [5]:
# --- REPLACE the whole function: run_single_table_experiment ---

def run_single_table_experiment(task_name: str):
    """
    Train and evaluate TabPFN on the task's own base table only.

    The train and val splits are subsampled to at most 1000 rows
    (random_state=42); the test split is used in full. Features are every
    column except the task's target column, encoded with the skrub vectorizer
    fitted on the training split.

    Returns a dict with "val" and "test" entries, each holding the metric
    values plus "fit_time", "predict_time" and "primary_metric_relbench"
    (MAE for regression, accuracy for classification).
    """
    # The latest probability outputs are also bound as module-level names.
    global prob_val, prob_test

    task = get_task(DATASET, task_name)
    train_table = task.get_table("train")
    val_table = task.get_table("val")
    test_table = task.get_table("test", mask_input_cols=False)

    frames = {
        "train": _get_df(train_table).copy(),
        "val": _get_df(val_table).copy(),
        "test": _get_df(test_table).copy(),
    }

    # Cap train/val at 1000 rows each; test stays complete.
    for split_name in ("train", "val"):
        full = frames[split_name]
        frames[split_name] = full.sample(n=min(1000, len(full)), random_state=42)

    label_col = task.target_col
    features = {name: frame.drop(columns=[label_col]) for name, frame in frames.items()}
    labels = {name: frame[label_col] for name, frame in frames.items()}
    y_train, y_val, y_test = labels["train"], labels["val"], labels["test"]

    # Vectorizer is fitted on train only, then applied to val/test.
    Xt_train, Xt_val, Xt_test, tv = fit_transform_splits(
        build_tv(), features["train"], features["val"], features["test"]
    )

    # Float target with more than 10 distinct values -> regression, else classification.
    is_regression = pd.api.types.is_float_dtype(y_train) and y_train.nunique() > 10
    if not is_regression:
        model = TabPFNClassifier(device=DEVICE)
        metric_fn = classification_metrics
        primary_fn = accuracy_score
    else:
        model = TabPFNRegressor(device=DEVICE)
        metric_fn = regression_metrics
        primary_fn = mean_absolute_error

    with elapsed_timer() as fit_clock:
        model.fit(Xt_train, y_train)
    fit_time = fit_clock()

    with elapsed_timer() as val_clock:
        y_val_pred = model.predict(Xt_val)
        prob_val = None
        if not is_regression and hasattr(model, "predict_proba"):
            prob_val = model.predict_proba(Xt_val)
    pred_time_val = val_clock()

    with elapsed_timer() as test_clock:
        y_test_pred = model.predict(Xt_test)
        prob_test = None
        if not is_regression and hasattr(model, "predict_proba"):
            prob_test = model.predict_proba(Xt_test)
    pred_time_test = test_clock()

    def _summarise(y_true, y_hat, prob, predict_time):
        # Metric values first, then timings, then the primary metric.
        return {
            **metric_fn(y_true, y_hat, prob),
            "fit_time": fit_time,
            "predict_time": predict_time,
            "primary_metric_relbench": primary_fn(y_true, y_hat),
        }

    return {
        "val": _summarise(y_val, y_val_pred, prob_val, pred_time_val),
        "test": _summarise(y_test, y_test_pred, prob_test, pred_time_test),
    }


In [6]:
def engineer_driver_features():
    """
    Build per-driver running statistics from the global `tables` dict.

    For every row of `tables["results"]` (joined to its race date from
    `tables["races"]`), the function computes the driver's running mean of
    finishing order, points, DNF flag and laps over that driver's results in
    chronological order, up to AND including the race on that row.

    Returns
    -------
    pandas.DataFrame
        One row per result with columns `driverId`, `avg_position`,
        `avg_points`, `dnf_rate`, `avg_laps` and `date` (the race date the
        statistics are valid from).
    """
    results = tables["results"].merge(
        tables["races"][["raceId", "date"]],
        on="raceId", how="left"
    )

    print("results columns after merge:", results.columns.tolist())
    print("Number of rows after merge:", results.shape[0])
    # The race date is "date_y" when both inputs carry a "date" column,
    # plain "date" when only the races table does.
    race_date_col = "date_y" if "date_y" in results.columns else "date"
    results = results.dropna(subset=["driverId", race_date_col])
    print("Number of rows after dropna:", results.shape[0])

    # Running means are only meaningful in chronological order per driver.
    results = results.sort_values(["driverId", race_date_col], kind="stable")

    # DNF flag: 1 when the finishing position is missing.
    # NOTE(review): assumes a missing `position` marks a driver who did not
    # finish — confirm against the dataset (e.g. via `statusId`).
    finish_col = "position" if "position" in results.columns else "positionOrder"
    results["dnf_flag"] = results[finish_col].isna().astype(int)

    stat_names = {
        "positionOrder": "avg_position",
        "points": "avg_points",
        "dnf_flag": "dnf_rate",
        "laps": "avg_laps",
    }
    # groupby(...).expanding() returns rows grouped by driver, indexed by
    # (driverId, original row label). Keep the row label so driver and date are
    # looked up per row instead of being pasted on in the original row order.
    running = (
        results.groupby("driverId")[list(stat_names)]
        .expanding()
        .mean()
        .droplevel(0)
    )
    feats = running.rename(columns=stat_names)
    feats.insert(0, "driverId", results.loc[feats.index, "driverId"].to_numpy())
    feats["date"] = results.loc[feats.index, race_date_col].to_numpy()
    return feats.reset_index(drop=True)

# Build the per-driver running statistics once at module level; they are read
# as the global `driver_feats` inside run_merged_table_experiment.
driver_feats = engineer_driver_features()

results columns after merge: ['resultId', 'raceId', 'driverId', 'constructorId', 'number', 'grid', 'position', 'positionOrder', 'points', 'laps', 'milliseconds', 'fastestLap', 'rank', 'statusId', 'date_x', 'date_y']
Number of rows after merge: 20323
Number of rows after dropna: 20323


In [7]:
# --- REPLACE the whole function: run_merged_table_experiment ---

def run_merged_table_experiment(task_name: str):
    """
    Train and evaluate TabPFN on a wide table built by joining the task's base
    table with the other tables in the global `tables` dict.

    Join plan (each join runs only when its key columns are present):
      results ← races ← circuits
      results ← drivers
      results ← constructors
      results ← standings (driver-level)
      results ← constructor_results
      results ← constructor_standings
      results ← qualifying
    followed by a few derived columns and an as-of join of the global
    `driver_feats` (latest statistics dated on or before each row's date).

    The train and val splits are subsampled to at most 1000 rows
    (random_state=42); the test split is used in full.

    Returns a dict with "val" and "test" entries, each holding the metric
    values plus "fit_time", "predict_time" and "primary_metric_relbench"
    (MAE for regression, accuracy for classification).
    """
    # The latest probability outputs are also bound as module-level names.
    global prob_val, prob_test

    # Load task and splits
    task = get_task(DATASET, task_name)
    train_table = task.get_table("train")
    val_table   = task.get_table("val")
    test_table  = task.get_table("test", mask_input_cols=False)

    # Convert to pandas
    base_train = _get_df(train_table).copy()
    base_val   = _get_df(val_table).copy()
    base_test  = _get_df(test_table).copy()

    # Cap train/val at 1000 rows each; test stays complete.
    base_train = base_train.sample(n=min(1000, len(base_train)), random_state=42)
    base_val   = base_val.sample(n=min(1000, len(base_val)), random_state=42)

    def _first_date_column(frame: pd.DataFrame):
        """Return the name of the preferred date-like column, or None."""
        preferred = ["date", "date_race", "date_y", "date_res"]
        date_like = [c for c in frame.columns if "date" in c.lower()]
        for candidate in preferred + date_like:
            if candidate in frame.columns:
                return candidate
        return None

    def _merge_all(base_df: pd.DataFrame) -> pd.DataFrame:
        """Join the base table with the other tables; rows stay in `base_df` order."""
        # Tag each row with its position so one-to-many joins can be collapsed
        # and the original row order restored afterwards.
        df = base_df.reset_index(drop=True)
        df["_orig_index"] = np.arange(len(df))

        # results: join only at race-entry granularity. Joining on driverId (or
        # constructorId) alone is many-to-many: it multiplies every base row by
        # all of that driver's results, whatever their date, and the later
        # de-duplication would then keep an arbitrary one of them.
        # NOTE(review): presumably `results` has one row per (raceId, driverId) — confirm.
        results = tables["results"]
        join_keys = [k for k in ["raceId","driverId","constructorId"] if k in df.columns and k in results.columns]
        if {"raceId", "driverId"}.issubset(join_keys):
            df = df.merge(results, how="left", on=join_keys, suffixes=("","_res"))

        # races
        if "raceId" in df.columns:
            races = tables["races"][["raceId","year","round","circuitId","name","date","time"]]
            df = df.merge(races, how="left", on="raceId", suffixes=("","_race"))

        # circuits
        if "circuitId" in df.columns:
            circuits = tables["circuits"][["circuitId","name","location","country","lat","lng","alt"]]
            df = df.merge(circuits, how="left", on="circuitId", suffixes=("","_circuit"))

        # drivers
        if "driverId" in df.columns:
            drivers = tables["drivers"][["driverId","driverRef","code","forename","surname","dob","nationality"]]
            df = df.merge(drivers, how="left", on="driverId", suffixes=("","_drv"))

        # constructors
        if "constructorId" in df.columns:
            constructors = tables["constructors"][["constructorId","constructorRef","name","nationality"]]
            df = df.merge(constructors, how="left", on="constructorId", suffixes=("","_con"))

        # standings (driver-level)
        if "raceId" in df.columns and "driverId" in df.columns and "standings" in tables:
            drv_st = tables["standings"][["raceId","driverId","points","position","wins","date"]].rename(
                columns={"points":"drv_points_to_date","position":"drv_pos_to_date","wins":"drv_wins_to_date","date":"drv_standings_date"}
            )
            df = df.merge(drv_st, how="left", on=["raceId","driverId"])

        # constructor_results
        if "raceId" in df.columns and "constructorId" in df.columns and "constructor_results" in tables:
            cr = tables["constructor_results"][["raceId","constructorId","points","date"]].rename(
                columns={"points":"con_points_at_race","date":"con_results_date"}
            )
            df = df.merge(cr, how="left", on=["raceId","constructorId"])

        # constructor_standings
        if "raceId" in df.columns and "constructorId" in df.columns and "constructor_standings" in tables:
            cs = tables["constructor_standings"][["raceId","constructorId","points","position","wins","date"]].rename(
                columns={"points":"con_points_to_date","position":"con_pos_to_date","wins":"con_wins_to_date","date":"con_standings_date"}
            )
            df = df.merge(cs, how="left", on=["raceId","constructorId"])

        # qualifying
        if "raceId" in df.columns and "driverId" in df.columns and "constructorId" in df.columns and "qualifying" in tables:
            q = tables["qualifying"][["raceId","driverId","constructorId","number","position"]].rename(
                columns={"number":"quali_number","position":"quali_position"}
            )
            df = df.merge(q, how="left", on=["raceId","driverId","constructorId"])

        # --- Light feature engineering (schema-aware) ---
        # Driver age at the row's reference date
        if "dob" in df.columns:
            ref_col = _first_date_column(df)
            if ref_col is not None:
                ref_date = pd.to_datetime(df[ref_col], errors="coerce")
                dob = pd.to_datetime(df["dob"], errors="coerce")
                df["driver_age_years"] = (ref_date - dob).dt.days / 365.25

        # Calendar features from main date if present
        if "date" in df.columns:
            dt = pd.to_datetime(df["date"], errors="coerce")
            df["race_year"]  = dt.dt.year
            df["race_month"] = dt.dt.month
            df["race_day"]   = dt.dt.day
            df["race_dow"]   = dt.dt.dayofweek

        # Grid-based derived feature if available
        if "grid" in df.columns and "position" in df.columns:
            df["grid_advancement"] = df["grid"] - df["position"]

        # Tie qualifying to grid if both present
        if "quali_position" in df.columns and "grid" in df.columns:
            df["quali_to_grid_delta"] = df["quali_position"] - df["grid"]

        # --- As-of join of the engineered driver statistics ---
        if "driverId" in df.columns:
            ref_col = _first_date_column(df)
            if ref_col is not None:
                df["_race_date"] = pd.to_datetime(df[ref_col], errors="coerce")

                # Take only the statistic columns so no bookkeeping column of
                # `driver_feats` ends up as a model feature.
                wanted = ["driverId", "date", "avg_position", "avg_points", "dnf_rate", "avg_laps"]
                feats = (
                    driver_feats[[c for c in wanted if c in driver_feats.columns]]
                    .rename(columns={"date": "feat_date"})
                    .dropna(subset=["driverId", "feat_date"])
                    # merge_asof needs BOTH frames sorted by the `on` key itself,
                    # not by (driverId, date).
                    .sort_values("feat_date", kind="stable")
                )

                # merge_asof rejects null keys; rows without a usable date skip
                # the join and simply get NaN statistics.
                dated = df["_race_date"].notna()
                joined = pd.merge_asof(
                    df[dated].sort_values("_race_date", kind="stable"),
                    feats,
                    left_on="_race_date",
                    right_on="feat_date",
                    by="driverId",
                    direction="backward",
                )
                df = pd.concat([joined, df[~dated]], ignore_index=True)
                df = df.drop(columns=["feat_date"])

        # Collapse any one-to-many expansion and put rows back in base order, so
        # that row i of the result is row i of `base_df`. The labels are read
        # from `base_df`, so any other order would pair features with the wrong
        # target.
        df = df.sort_values("_orig_index", kind="stable").drop_duplicates(subset=["_orig_index"])
        df = df.drop(columns=["_orig_index", "_race_date"], errors="ignore")
        assert len(df) == len(base_df), "merged table lost or gained rows"
        df.index = base_df.index
        return df

    train_merged = _merge_all(base_train)
    val_merged   = _merge_all(base_val)
    test_merged  = _merge_all(base_test)

    target_col = task.target_col
    # The target must not be among the features.
    X_train_df = train_merged.drop(columns=[target_col], errors="ignore")
    X_val_df   = val_merged.drop(columns=[target_col], errors="ignore")
    X_test_df  = test_merged.drop(columns=[target_col], errors="ignore")

    y_train = base_train[target_col]
    y_val   = base_val[target_col]
    y_test  = base_test[target_col]

    # skrub vectorization (fitted on train only)
    tv = build_tv()
    Xt_train, Xt_val, Xt_test, tv = fit_transform_splits(tv, X_train_df, X_val_df, X_test_df)

    # Float target with more than 10 distinct values -> regression, else classification.
    is_regression = pd.api.types.is_float_dtype(y_train) and y_train.nunique() > 10

    if is_regression:
        model = TabPFNRegressor(device=DEVICE)
        metric_fn = regression_metrics
    else:
        model = TabPFNClassifier(device=DEVICE)
        metric_fn = classification_metrics

    # Fit + time
    with elapsed_timer() as t_fit:
        model.fit(Xt_train, y_train)
    fit_time = t_fit()

    # Predict + time
    with elapsed_timer() as t_pred_val:
        y_val_pred = model.predict(Xt_val)
        prob_val = None
        if not is_regression and hasattr(model, "predict_proba"):
            prob_val = model.predict_proba(Xt_val)
    pred_time_val = t_pred_val()

    with elapsed_timer() as t_pred_test:
        y_test_pred = model.predict(Xt_test)
        prob_test = None
        if not is_regression and hasattr(model, "predict_proba"):
            prob_test = model.predict_proba(Xt_test)
    pred_time_test = t_pred_test()

    # Primary metric: MAE for regression, accuracy for classification
    if is_regression:
        primary_metric_val  = mean_absolute_error(y_val,  y_val_pred)
        primary_metric_test = mean_absolute_error(y_test, y_test_pred)
    else:
        primary_metric_val  = accuracy_score(y_val,  y_val_pred)
        primary_metric_test = accuracy_score(y_test, y_test_pred)

    res = {
        "val": {
            **metric_fn(y_val, y_val_pred, prob_val),
            "fit_time": fit_time,
            "predict_time": pred_time_val,
            "primary_metric_relbench": primary_metric_val,
        },
        "test": {
            **metric_fn(y_test, y_test_pred, prob_test),
            "fit_time": fit_time,
            "predict_time": pred_time_test,
            "primary_metric_relbench": primary_metric_test,
        }
    }
    return res


In [8]:
# Tasks to evaluate. Only "driver-dnf" is enabled; the full list is kept for reference:
# TASKS = ["driver-dnf", "driver-top3", "driver-position"]
TASKS = ["driver-dnf"]

# Accumulates one dict per reported metric value; filled by the experiment loop below.
all_results_long = []

def results_to_long(task, setting, split, metrics):
    """
    Turn one metrics dict into long-format rows (one row per metric).

    Metrics whose value is None or a float NaN are left out. Every row carries
    the dataset name, task, split, setting, a fixed method label, the metric
    name and its score.
    """
    def _is_missing(score):
        return score is None or (isinstance(score, float) and np.isnan(score))

    return [
        {
            "dataset": DATASET,
            "task": task,
            "split": split,
            "setting": setting,
            "method": "TabPFN_experiment_v1.0",
            "metric": metric_name,
            "score": score,
        }
        for metric_name, score in metrics.items()
        if not _is_missing(score)
    ]

# Run every (task, setting) combination exactly once. `product` already
# enumerates all tasks, so wrapping it in another loop over TASKS would repeat
# every experiment len(TASKS) times and duplicate the result rows.
experiment_settings = [
    ("single", run_single_table_experiment),
    ("merged", run_merged_table_experiment),
]
task_setting_pairs = list(product(TASKS, experiment_settings))

for task, (setting, run_fn) in tqdm(task_setting_pairs, desc="Task/Setting pairs"):
    res = run_fn(task)
    for split, metrics in res.items():
        all_results_long.extend(results_to_long(task, setting, split, metrics))

# Naming the columns keeps 'dataset' first and still yields a well-formed
# (empty) frame when no rows were collected.
results_long_df = pd.DataFrame(
    all_results_long,
    columns=["dataset", "task", "split", "setting", "method", "metric", "score"],
)
# Round all numerical values in 'score' to 4 decimal places
results_long_df["score"] = results_long_df["score"].apply(lambda x: round(x, 4) if isinstance(x, (float, int, np.floating, np.integer)) else x)

results_long_df.to_csv("results_summary_long.csv", index=False)
results_long_df

Tasks:   0%|          | 0/1 [00:00<?, ?it/s]
Task/Setting pairs:   0%|          | 0/2 [03:25<?, ?it/s][A
Tasks:   0%|          | 0/1 [03:25<?, ?it/s]


KeyboardInterrupt: 

In [None]:
def plot_metric(metric, split=None, setting=None):
    """
    Bar-plot one metric from the global `results_long_df`, one group per task.

    Parameters
    ----------
    metric : str
        Value of the "metric" column to plot.
    split : str or None
        Keep only this split. When None, every split is plotted and the bars
        are keyed by (setting, split).
    setting : str or None
        Keep only this setting.

    Prints a message and returns without plotting when the filters leave no rows.
    """
    df = results_long_df[results_long_df["metric"] == metric]
    if split is not None:
        df = df[df["split"] == split]
    if setting is not None:
        df = df[df["setting"] == setting]
    if df.empty:
        print(f"No data for metric: {metric}, split: {split}, setting: {setting}")
        return

    if split is None:
        # Without a split filter each (task, setting) pair occurs once per
        # split; pivoting on setting alone would fail on those duplicates.
        wide = df.pivot(index="task", columns=["setting", "split"], values="score")
        legend_title = "setting, split"
    else:
        wide = df.pivot(index="task", columns="setting", values="score")
        legend_title = "setting"

    fig, ax = plt.subplots()
    wide.plot(kind="bar", ax=ax)
    ax.set_ylabel(metric)
    ax.set_title(f"{metric} by task and setting")
    ax.legend(title=legend_title)
    plt.tight_layout()
    plt.show()

# Example usage: one chart per (metric, split) pair.
for _metric, _split in (
    ("accuracy", "test"),
    ("f1_macro", "val"),
    ("primary_metric_relbench", "test"),
):
    plot_metric(_metric, split=_split)
