In [1]:
import pandas as pd
from fev.analysis import leaderboard


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Base URL of the per-model result CSVs; each file is fetched as "<BASE_URL><model>.csv".
BASE_URL = "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/fev_bench/results/"

# Local CSV whose rows are appended to the downloaded results.
AUTOAR_RESULTS_PATH = "autoar_full.csv"

# File stems of the result CSVs to download from BASE_URL.
MODELS = [
    "autoarima",
    "autoets",
    "autotheta",
    "catboost",
    "chronos-2",
    "chronos-bolt",
    "drift",
    "lightgbm",
    "moirai-2_0",
    "naive",
    "seasonal_naive",
    "stat_ensemble",
    "sundial-base",
    "tabpfn-ts",
    "timesfm-2_5",
    "tirex",
    "toto-1_0",
]


# Download every model's results. A failed download is reported and skipped
# (best effort), but the model is remembered so the gap is visible afterwards.
all_dfs = []
failed_models = []
for model in MODELS:
    url = f"{BASE_URL}{model}.csv"
    try:
        # Only the fetch/parse is guarded; it can fail with HTTP, network or
        # parser errors, hence the broad except.
        df = pd.read_csv(url)
    except Exception as e:
        failed_models.append(model)
        print(f"FAILED {model}: {e}")
    else:
        all_dfs.append(df)
        print(f"Loaded {model} ({len(df)} rows)")

# Without this guard pd.concat would fail with an opaque
# "No objects to concatenate" when every download failed.
if not all_dfs:
    raise RuntimeError(
        f"None of the {len(MODELS)} result files could be loaded from {BASE_URL}"
    )

autoar_df = pd.read_csv(AUTOAR_RESULTS_PATH)

# Single concat: downloaded frames first, the local frame last.
all_results = pd.concat([*all_dfs, autoar_df], ignore_index=True)
print(
    f"\nTotal rows: {len(all_results)}, models: {all_results['model_name'].nunique()}"
)
if failed_models:
    # Make missing models impossible to overlook in the cell output.
    print(f"WARNING: no results loaded for {len(failed_models)} model(s): {failed_models}")


Loaded autoarima (90 rows)
Loaded autoets (97 rows)
Loaded autotheta (100 rows)
Loaded catboost (100 rows)
Loaded chronos-2 (100 rows)
Loaded chronos-bolt (100 rows)
Loaded drift (100 rows)
Loaded lightgbm (100 rows)
Loaded moirai-2_0 (100 rows)
Loaded naive (100 rows)
Loaded seasonal_naive (100 rows)
Loaded stat_ensemble (89 rows)
Loaded sundial-base (100 rows)
Loaded tabpfn-ts (98 rows)
Loaded timesfm-2_5 (100 rows)
Loaded tirex (100 rows)
Loaded toto-1_0 (100 rows)

Total rows: 1773, models: 18


In [4]:
# Leaderboard over all loaded results, scored relative to the "Seasonal Naive" baseline.
lb = leaderboard(
    all_results,
    missing_strategy="drop",
    baseline_model="Seasonal Naive",
)
lb


85 tasks left after removing failures


Unnamed: 0_level_0,win_rate,skill_score,median_training_time_s,median_inference_time_s,training_corpus_overlap,num_failures
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Chronos-2,0.910727,0.484519,0.0,1.732712,0.0,0
TiRex,0.836678,0.438364,0.0,1.074211,0.0,0
TimesFM-2.5,0.824913,0.471155,0.0,7.050097,0.094118,0
Toto-1.0,0.731488,0.419221,0.0,73.182442,0.070588,0
Moirai-2.0,0.697578,0.414025,0.0,1.948326,0.258824,0
TabPFN-TS,0.690657,0.419319,0.0,282.412571,0.0,2
Chronos-Bolt,0.687197,0.404197,0.0,0.928959,0.0,0
Sundial-Base,0.502422,0.347302,0.0,25.087227,0.0,0
Stat. Ensemble,0.494118,0.242341,0.0,690.615291,0.0,11
AutoARIMA,0.456055,0.233708,0.0,191.520785,0.0,10


In [None]:
# Show which covid tasks are being removed
covid_tasks = all_results[
    all_results["task_name"].str.contains("covid", case=False, na=False)
]["task_name"].unique()
print(f"Removing {len(covid_tasks)} covid task(s): {sorted(covid_tasks)}")

no_covid = all_results[
    ~all_results["task_name"].str.contains("covid", case=False, na=False)
]
lb_no_covid = leaderboard(
    no_covid, missing_strategy="drop", baseline_model="Seasonal Naive"
)
lb_no_covid


In [None]:
# Compare rank and skill_score changes
compare = (
    lb[["skill_score"]]
    .rename(columns={"skill_score": "skill_score_full"})
    .join(
        lb_no_covid[["skill_score"]].rename(
            columns={"skill_score": "skill_score_no_covid"}
        )
    )
)
compare["rank_full"] = lb["skill_score"].rank(ascending=False).astype(int)
compare["rank_no_covid"] = lb_no_covid["skill_score"].rank(ascending=False).astype(int)
compare["rank_change"] = compare["rank_full"] - compare["rank_no_covid"]
compare["skill_score_delta"] = (
    compare["skill_score_no_covid"] - compare["skill_score_full"]
)
compare.sort_values("rank_no_covid")
