In [63]:
import pandas as pd
import numpy as np
from pathlib import Path
import joblib
import json

# --- Input locations (relative paths, resolved against the working directory) ---
DATA_DIR = Path("data")
CLEAN_DIR = DATA_DIR / "clean"
MODELS_DIR = Path("models")

startlists_path = CLEAN_DIR / "startlists_enriched.csv"
model_path = MODELS_DIR / "baseline_rank_model.joblib"
meta_path = MODELS_DIR / "feature_meta.json"

print("Loading:\n -", startlists_path, "\n -", model_path, "\n -", meta_path)

startlists = pd.read_csv(startlists_path, parse_dates=["race_date"])

# NOTE: joblib.load deserializes pickled Python objects; only load model
# files that come from a trusted source.
model = joblib.load(model_path)

# JSON is read as UTF-8 explicitly so the result does not depend on the
# platform's default locale encoding.
with open(meta_path, "r", encoding="utf-8") as f:
    meta = json.load(f)
feature_cols = meta["feature_cols"]

# Fail fast, with a readable message, if the enriched startlist does not carry
# every column the model metadata lists (otherwise the feature-selection cell
# raises a bare KeyError later on).
missing_feature_cols = [c for c in feature_cols if c not in startlists.columns]
if missing_feature_cols:
    raise KeyError(
        f"startlists is missing model feature columns: {missing_feature_cols}"
    )

startlists.head()


Loading:
 - data/clean/startlists_enriched.csv 
 - models/baseline_rank_model.joblib 
 - models/feature_meta.json


Unnamed: 0,Nr,Naam,UCI ID,Nat,Club,UCI Rank,series_name,race_name,race_date,race_location,...,merge_id,race_id_y,races_so_far,avg_place_last3,best_place_last5,last_place,days_since_last_race,last_carried_points,last_scored_points,Place
0,1.0,AERTS Toon,10007590000.0,BEL,DESCHACHT-HENS CX TEAM,2.0,,Telenet Superprestige AARDBEIENCROSS-MERKSPLAS...,NaT,,...,10007590000.0,,,,,,,,,
1,2.0,NYS Thibau,10065000000.0,BEL,BALOISE GLOWI LIONS,9.0,,Telenet Superprestige AARDBEIENCROSS-MERKSPLAS...,NaT,,...,10065000000.0,,,,,,,,,
2,3.0,VANTHOURENHOUT Michael,10007160000.0,BEL,PAUWELS SAUZEN - ALTEZ INDUSTRIEBOUW CT,1.0,,Telenet Superprestige AARDBEIENCROSS-MERKSPLAS...,NaT,,...,10007160000.0,,,,,,,,,
3,4.0,VANDEPUTTE Niels,10016330000.0,BEL,ALPECIN-DECEUNINCK DEVELOPMENT TEAM,4.0,,Telenet Superprestige AARDBEIENCROSS-MERKSPLAS...,NaT,,...,10016330000.0,,,,,,,,,
4,5.0,WYSEURE Joran,10064920000.0,BEL,CRELAN-CORENDON,5.0,,Telenet Superprestige AARDBEIENCROSS-MERKSPLAS...,NaT,,...,10064920000.0,,,,,,,,,


In [65]:
def clean_startlist(df):
    """Return a cleaned copy of a startlist frame with a canonical ``category_full``.

    Steps, in order:

    1. Drop rows whose ``rider_name`` contains the text "UCI ID"
       (case-insensitive); these are table header rows that leaked in from
       PDF extraction. Skipped when the frame has no ``rider_name`` column.
    2. Build ``category_full``: use the existing column if present, fill gaps
       from ``category_hint`` (if present), then fall back to ``"Unknown"``.
    3. Normalize whitespace (strip, collapse runs to one space).
    4. Map known lowercase spellings to canonical labels; values without a
       mapping are kept as-is.

    Parameters
    ----------
    df : pandas.DataFrame
        Startlist rows. ``rider_name``, ``category_full`` and
        ``category_hint`` are all optional.

    Returns
    -------
    pandas.DataFrame
        A new frame (original index labels preserved); ``df`` is not modified.
    """
    # Work on a copy so the caller's frame is never mutated.
    df = df.copy()

    # Remove header rows (common in PDF extraction). The column is checked
    # explicitly: a non-string placeholder Series has no ``.str`` accessor.
    if "rider_name" in df.columns:
        header_mask = (
            df["rider_name"]
            .fillna("")
            .astype(str)
            .str.contains("UCI ID", case=False, na=False)
        )
        # .copy() so the column assignment below writes to an owned frame
        # rather than a filtered selection of the previous one.
        df = df[~header_mask].copy()

    # Start from the existing category, or an all-missing object column.
    if "category_full" in df.columns:
        category = df["category_full"]
    else:
        category = pd.Series(np.nan, index=df.index, dtype="object")

    # Fall back to category_hint (if present) for missing values.
    if "category_hint" in df.columns:
        category = category.fillna(df["category_hint"])

    # Last fallback is "Unknown"; then normalize whitespace.
    category = (
        category
        .fillna("Unknown")
        .astype(str)
        .str.strip()
        .str.replace(r"\s+", " ", regex=True)
    )

    # Map known spellings (compared in lowercase) to a canonical set.
    mapping = {
        "men elite": "Men Elite",
        "women elite": "Women Elite",
        "men u23": "Men U23",
        "u23": "Men U23",
        "men junior": "Men Junior",
        "junior": "Men Junior",
    }
    canonical = category.str.lower().map(mapping)
    # Unmapped values keep their whitespace-normalized original spelling.
    df["category_full"] = canonical.fillna(category)

    return df

# Run the cleaning pass and rebind the working frame to its result,
# then preview the first rows.
startlists = startlists.pipe(clean_startlist)
startlists.head(5)


Unnamed: 0,Nr,Naam,UCI ID,Nat,Club,UCI Rank,series_name,race_name,race_date,race_location,...,merge_id,race_id_y,races_so_far,avg_place_last3,best_place_last5,last_place,days_since_last_race,last_carried_points,last_scored_points,Place
0,1.0,AERTS Toon,10007590000.0,BEL,DESCHACHT-HENS CX TEAM,2.0,,Telenet Superprestige AARDBEIENCROSS-MERKSPLAS...,NaT,,...,10007590000.0,,,,,,,,,
1,2.0,NYS Thibau,10065000000.0,BEL,BALOISE GLOWI LIONS,9.0,,Telenet Superprestige AARDBEIENCROSS-MERKSPLAS...,NaT,,...,10065000000.0,,,,,,,,,
2,3.0,VANTHOURENHOUT Michael,10007160000.0,BEL,PAUWELS SAUZEN - ALTEZ INDUSTRIEBOUW CT,1.0,,Telenet Superprestige AARDBEIENCROSS-MERKSPLAS...,NaT,,...,10007160000.0,,,,,,,,,
3,4.0,VANDEPUTTE Niels,10016330000.0,BEL,ALPECIN-DECEUNINCK DEVELOPMENT TEAM,4.0,,Telenet Superprestige AARDBEIENCROSS-MERKSPLAS...,NaT,,...,10016330000.0,,,,,,,,,
4,5.0,WYSEURE Joran,10064920000.0,BEL,CRELAN-CORENDON,5.0,,Telenet Superprestige AARDBEIENCROSS-MERKSPLAS...,NaT,,...,10064920000.0,,,,,,,,,


In [67]:
# Build the model input matrix from the configured feature columns.
X = startlists[feature_cols].copy()

# Default used for missing place-type features: the median of the known
# "Place" values, or 25 when the column is absent or entirely missing.
if "Place" in startlists.columns and startlists["Place"].notna().any():
    safe_place_default = startlists["Place"].median()
else:
    safe_place_default = 25

# Per-column replacements for missing feature values.
nan_defaults = {
    "races_so_far": 0,
    "avg_place_last3": safe_place_default,
    "best_place_last5": safe_place_default,
    "last_place": safe_place_default,
    "days_since_last_race": 30,
    "last_carried_points": 0,
    "last_scored_points": 0,
}
X = X.fillna(value=nan_defaults)


In [69]:
# --- Make sure a clean, string-typed race_id column exists ---

# An existing race_id is kept untouched. Otherwise take the first merge
# leftover that is present (race_id_x before race_id_y); if neither exists,
# use a constant placeholder.
if "race_id" not in startlists.columns:
    merge_leftover = next(
        (name for name in ("race_id_x", "race_id_y") if name in startlists.columns),
        None,
    )
    if merge_leftover is None:
        startlists["race_id"] = "unknown_race"
    else:
        startlists["race_id"] = startlists[merge_leftover]

# Replace missing ids with the placeholder and force string type.
race_id_filled = startlists["race_id"].fillna("unknown_race")
startlists["race_id"] = race_id_filled.astype(str)

print("Race ID distribution:")
print(startlists["race_id"].value_counts().head())


Race ID distribution:
race_id
unknown_standalone_x2o-badkamers-trofee-rectavit-flandriencross-hamme-16-11-2025_noloc    189
unknown_standalone_telenet-superprestige-aardbeiencross-merksplas-15-11-2025_noloc         81
Name: count, dtype: int64


In [70]:
# Raw model output, floored at a tiny positive value so the reciprocal
# below can never divide by zero.
raw_model_output = model.predict(X)
y_rank_pred = np.clip(raw_model_output, 1e-6, None)

# The predicted place is the reciprocal of the (floored) model output.
startlists["predicted_place"] = 1.0 / y_rank_pred

# Rank riders within each race/category by predicted place (ascending);
# ties are broken by row order (method="first").
place_by_race = startlists.groupby(["race_id", "category_full"])["predicted_place"]
startlists["predicted_rank"] = place_by_race.rank(method="first")


In [71]:
# Print a banner and show the first 25 ranked riders for every
# race/category combination.
for (rid, cat), grp in startlists.groupby(["race_id", "category_full"]):

    print("\n==============================")
    print(f"RACE: {rid}")
    print(f"CATEGORY: {cat}")
    print("==============================\n")

    # Identifier columns are shown only when the frame has them; they sit
    # between the rider name and the prediction columns.
    id_cols = [name for name in ("UCI ID", "UCI Rank") if name in grp.columns]
    cols = ["rider_name", *id_cols, "predicted_place", "predicted_rank"]

    ranked = grp[cols].sort_values("predicted_rank").reset_index(drop=True)
    display(ranked.head(25))



RACE: unknown_standalone_telenet-superprestige-aardbeiencross-merksplas-15-11-2025_noloc
CATEGORY: Men Elite



Unnamed: 0,rider_name,UCI ID,UCI Rank,predicted_place,predicted_rank
0,AERTS Toon,10007590000.0,2.0,10.782739,1.0
1,NYS Thibau,10065000000.0,9.0,10.782739,2.0
2,VANTHOURENHOUT Michael,10007160000.0,1.0,10.782739,3.0
3,VANDEPUTTE Niels,10016330000.0,4.0,10.782739,4.0
4,WYSEURE Joran,10064920000.0,5.0,10.782739,5.0
5,SWEECK Laurens,10006910000.0,7.0,10.782739,6.0
6,VAN DER HAAR Lars,10006120000.0,8.0,10.782739,7.0
7,RONHAAR Pim,10023110000.0,10.0,10.782739,8.0
8,ORTS LLORET Felipe,10009000000.0,11.0,10.782739,9.0
9,KUHN Kevin,10009750000.0,14.0,10.782739,10.0



RACE: unknown_standalone_telenet-superprestige-aardbeiencross-merksplas-15-11-2025_noloc
CATEGORY: Women Elite



Unnamed: 0,rider_name,UCI ID,UCI Rank,predicted_place,predicted_rank
0,VAN DER HEIJDEN Inge,10010590000.0,3.0,10.782739,1.0
1,BRAND Lucinda,10006610000.0,1.0,10.782739,2.0
2,BAKKER Manon,10009520000.0,8.0,10.782739,3.0
3,CASASOLA Sara,10011000000.0,11.0,10.782739,4.0
4,BENTVELD Leonie,10023340000.0,12.0,10.782739,5.0
5,BROUWERS Julie,10065010000.0,13.0,10.782739,6.0
6,VAN ALPHEN Aniek,10015540000.0,14.0,10.782739,7.0
7,CLAUZEL Hélène,10009880000.0,15.0,10.782739,8.0
8,BETSEMA Denise,10007650000.0,16.0,10.782739,9.0
9,FOUQUENET Amandine,10066770000.0,17.0,10.782739,10.0



RACE: unknown_standalone_x2o-badkamers-trofee-rectavit-flandriencross-hamme-16-11-2025_noloc
CATEGORY: Unknown



Unnamed: 0,rider_name,UCI ID,UCI Rank,predicted_place,predicted_rank
0,SVOBODA David,10047110000.0,13.0,10.782739,1.0
1,HEEREN Delano,10045770000.0,18.0,10.782739,2.0
2,LIPPENS Brent,10106050000.0,27.0,10.782739,3.0
3,VAN LEE Jari,10065220000.0,28.0,10.782739,4.0
4,OSAER Emiel,10074880000.0,36.0,10.782739,5.0
5,SMITS Maxime,10108170000.0,58.0,10.782739,6.0
6,VAN HOOF Kai,10059750000.0,61.0,10.782739,7.0
7,DE PEUTER Toon,10065230000.0,62.0,10.782739,8.0
8,JANSSENS Arthur,10065220000.0,69.0,10.782739,9.0
9,DECLERCQ Matteo,10085070000.0,80.0,10.782739,10.0


In [72]:
# Write one CSV per race/category group into data/clean/predictions.
output_dir = CLEAN_DIR / "predictions"
# parents=True: also create intermediate directories if they are missing.
output_dir.mkdir(parents=True, exist_ok=True)

# race_id and category are free text that ends up in a filename. Replace path
# separators and characters that are reserved on common filesystems with "-"
# so a value such as "16/11/2025" cannot break or redirect the write.
# Identifiers made only of letters, digits, "-" and "_" are left unchanged.
_unsafe_filename_chars = str.maketrans({ch: "-" for ch in '/\\:*?"<>|'})

saved_files = []

for (rid, cat), grp in startlists.groupby(["race_id", "category_full"]):
    safe_rid = str(rid).translate(_unsafe_filename_chars)
    safe_cat = str(cat).lower().replace(" ", "-").translate(_unsafe_filename_chars)
    out_path = output_dir / f"{safe_rid}_{safe_cat}.csv"

    grp.to_csv(out_path, index=False)
    saved_files.append(out_path)

saved_files


[PosixPath('data/clean/predictions/unknown_standalone_telenet-superprestige-aardbeiencross-merksplas-15-11-2025_noloc_men-elite.csv'),
 PosixPath('data/clean/predictions/unknown_standalone_telenet-superprestige-aardbeiencross-merksplas-15-11-2025_noloc_women-elite.csv'),
 PosixPath('data/clean/predictions/unknown_standalone_x2o-badkamers-trofee-rectavit-flandriencross-hamme-16-11-2025_noloc_unknown.csv')]

In [73]:
# Persist the complete frame (all races and categories, including the
# prediction columns) as a single CSV, then echo the destination path.
combined_path = CLEAN_DIR.joinpath("predicted_rankings_all.csv")
startlists.to_csv(path_or_buf=combined_path, index=False)

combined_path


PosixPath('data/clean/predicted_rankings_all.csv')

In [76]:
import pdfplumber

# Locate the PDF under DATA_DIR instead of a machine-specific absolute path,
# so the cell runs on any checkout and does not embed a local user directory.
# NOTE(review): assumes the working directory is the project root, as the other
# relative paths in this notebook do — confirm.
x2o_path = (
    DATA_DIR
    / "startlists"
    / "X2O BADKAMERS TROFEE RECTAVIT-FLANDRIENCROSS-HAMME 16:11:2025 STARTLIST.pdf"
)

# Print up to the first 140 text lines of page 1 to inspect the raw layout.
with pdfplumber.open(x2o_path) as pdf:
    # Guard against a PDF with no pages and against a page with no
    # extractable text, so the preview prints nothing rather than raising.
    text = (pdf.pages[0].extract_text() if pdf.pages else "") or ""
    print("\n".join(text.split("\n")[:140]))


FLANDRIENCROSS-HAMME 16/11/2025
STARTLIST MEN JUNIOR
Nr Naam UCI ID Nat Club UCI Rank
1 SVOBODA David 10047109850 CZE BRILON RACING TEAM MB 13
2 HEEREN Delano 10045767311 NED ACROG TORMANS 18
3 LIPPENS Brent 10106047656 BEL C.T. KEUKENS BUYSSE KNESSELARE VZW 27
4 VAN LEE Jari 10065219649 BEL BIOMETRIC - CANYON CT 28
5 OSAER Emiel 10074876708 BEL STARBIKES - VISTABUILD 36
6 SMITS Maxime 10108174380 BEL ACROG-TORMANS 58
7 VAN HOOF Kai 10059748041 NED TWC HET SNELLE WIEL 61
8 DE PEUTER Toon 10065230763 BEL UCT CYCLING TEAM 62
9 JANSSENS Arthur 10065223588 BEL GOLAZO YOUNG LIONS 69
10 DECLERCQ Matteo 10085069788 BEL WIELERCLUB "ONDER - ONS PARIKE" VZW 80
11 SPRANGERS Seppe 10112980631 BEL ACROG-TORMANS 84
12 BEETGE Tanner 10147572548 GBR DONOVAN RACING DEVELOPMENT 100
13 ROIJACKERS Matt 10096529532 NED ZZPR ORANGE BABIES CYCLING TEAM 109
14 GEERTS Len 10077916848 BEL WAC TEAM VZW 119
15 MAENEN Dani 10138428276 BEL BFY CYCLINGTEAM 132
16 RENDERS Luca 10125887186 BEL 4BIKES WESTFIT 147
17 RO