In [25]:
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd
from tabpfn import TabPFNClassifier
from sklearn.metrics import accuracy_score
from itertools import product

In [26]:
def target_encode_oof(df, y, cat_cols, n_splits=5, m=10, seed=42):
    global_mean = y.mean()
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    oof = pd.DataFrame(index=df.index)
    mappings = {}

    for col in cat_cols:
        oof_col = pd.Series(index=df.index, dtype="float64")

        for tr_idx, val_idx in kf.split(df):
            tr_x = df.iloc[tr_idx]
            tr_y = y.iloc[tr_idx]

            stats = tr_y.groupby(tr_x[col]).agg(["mean", "count"])
            smooth = (stats["mean"] * stats["count"] + global_mean * m) / (stats["count"] + m)

            oof_col.iloc[val_idx] = df.iloc[val_idx][col].map(smooth)

        oof[col] = oof_col.fillna(global_mean)

        # Mapping f√ºrs echte Val/Test
        stats_full = y.groupby(df[col]).agg(["mean", "count"])
        mappings[col] = (stats_full["mean"] * stats_full["count"] + global_mean * m) / (stats_full["count"] + m)

    return oof, mappings, global_mean

def apply_te(df, mappings, global_mean):
    out = pd.DataFrame(index=df.index)
    for col, mapping in mappings.items():
        out[col] = df[col].map(mapping).fillna(global_mean)
    return out


In [27]:
train = pd.read_csv("../data/grandprix_features_train.csv")
val = pd.read_csv("../data/grandprix_features_val.csv")
test  = pd.read_csv("../data/grandprix_features_test.csv")

test.head()

Unnamed: 0,year,driver_id,constructor_id,circuit_id,grid_position,quali_delta,quali_tm_delta,season_pts_driver,season_pts_team,last_3_avg,is_street_circuit,is_wet,points_scored
0,2025,norris,mclaren,melbourne,1,0.0,-0.084,0.0,0.0,0.0,1,1,1
1,2025,verstappen,red_bull_racing,melbourne,3,0.385,-1.613,0.0,0.0,0.0,1,1,1
2,2025,russell,mercedes,melbourne,4,0.45,-0.979,0.0,0.0,0.0,1,1,1
3,2025,antonelli,mercedes,melbourne,16,1.429,0.979,0.0,15.0,0.0,1,1,1
4,2025,albon,williams,melbourne,6,0.641,-0.194,0.0,0.0,0.0,1,1,1


In [None]:
CAT_COLS = [
    "driver_id",
    "constructor_id",
    "circuit_id",
]
NUM_COLS = [
    "grid_position",
    "quali_delta",
    "quali_tm_delta",
    "season_pts_driver",
    "season_pts_team",
    "last_3_avg",
    "is_street_circuit",
    "is_wet",
]

TARGET = 'points_scored'
for col in CAT_COLS:
    all_vals = pd.concat([train[col], val[col], test[col]], axis=0)
    codes = all_vals.astype("category").cat.categories
    mapping = {v: i for i, v in enumerate(codes)}
    for df in (train, val, test):
        df[col] = df[col].map(mapping).astype("int64")
        
X_train, y_train = train[CAT_COLS + NUM_COLS], train[TARGET]
X_val, y_val = val[CAT_COLS + NUM_COLS], val[TARGET]
X_test, y_test = test[CAT_COLS + NUM_COLS], test[TARGET]

X_train

te_train, mappings, gmean = target_encode_oof(train, y_train, CAT_COLS)

X_train = pd.concat([train[NUM_COLS].reset_index(drop=True), te_train.add_suffix("_te").reset_index(drop=True)], axis=1)

X_val = pd.concat([val[NUM_COLS].reset_index(drop=True), apply_te(val, mappings, gmean).add_suffix("_te").reset_index(drop=True)], axis=1)

X_test = pd.concat([test[NUM_COLS].reset_index(drop=True), apply_te(test, mappings, gmean).add_suffix("_te").reset_index(drop=True)], axis=1)



In [None]:
for m, te_splits in product([5, 10, 20], [5, 10]):
    te_train, mappings, gmean = target_encode_oof(train, y_train, CAT_COLS, n_splits=te_splits, m=m)

    X_tr = pd.concat([train[NUM_COLS].reset_index(drop=True), te_train.add_suffix("_te").reset_index(drop=True)], axis=1)
    X_va = pd.concat([val[NUM_COLS].reset_index(drop=True), apply_te(val, mappings, gmean).add_suffix("_te").reset_index(drop=True)], axis=1)

    clf = TabPFNClassifier(random_state=0)
    clf.fit(X_tr, y_train)
    predictions = clf.predict(X_val)
    acc = accuracy_score(y_val, predictions)
    print(f"m={m}, te_splits={te_splits} -> acc={acc:.4f}")


m=5, te_splits=5 -> acc=0.8413
m=5, te_splits=10 -> acc=0.8330
m=10, te_splits=5 -> acc=0.8392
m=10, te_splits=10 -> acc=0.8330
m=20, te_splits=5 -> acc=0.8372
m=20, te_splits=10 -> acc=0.8288
