In [1]:
import pandas as pd
from fev.analysis import leaderboard


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Location of the per-model result CSVs in the autogluon/fev GitHub repository.
BASE_URL = "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/fev_bench/results/"

# Result files to fetch; each entry is downloaded from f"{BASE_URL}{name}.csv".
MODELS = [
    "autoarima",
    "autoets",
    "autotheta",
    "catboost",
    "chronos-2",
    "chronos-bolt",
    "drift",
    "lightgbm",
    "moirai-2_0",
    "naive",
    "seasonal_naive",
    "stat_ensemble",
    "sundial-base",
    "tabpfn-ts",
    "timesfm-2_5",
    "tirex",
    "toto-1_0",
]

# Download every result file. Loading is best-effort: a model whose file cannot
# be read is reported and skipped, but it is also remembered so the omission is
# summarised after the loop instead of being buried in the per-model log.
all_dfs = []
failed_models = []
for model in MODELS:
    url = f"{BASE_URL}{model}.csv"
    try:
        df = pd.read_csv(url)
    except Exception as e:
        failed_models.append(model)
        print(f"FAILED {model}: {e}")
    else:
        all_dfs.append(df)
        print(f"Loaded {model} ({len(df)} rows)")

# pd.concat([]) would raise an uninformative "No objects to concatenate";
# fail with the same exception type but an actionable message.
if not all_dfs:
    raise ValueError(
        f"None of the {len(MODELS)} result files could be loaded from {BASE_URL}"
    )

# Stack the per-model frames into one long table with a fresh 0..n-1 index.
all_results = pd.concat(all_dfs, ignore_index=True)
print(
    f"\nTotal rows: {len(all_results)}, models: {all_results['model_name'].nunique()}"
)
if failed_models:
    # Downstream leaderboards silently exclude these models, so say so loudly.
    print(
        f"WARNING: {len(failed_models)} model(s) missing from all_results: {failed_models}"
    )


Loaded autoarima (90 rows)
Loaded autoets (97 rows)
Loaded autotheta (100 rows)
Loaded catboost (100 rows)
Loaded chronos-2 (100 rows)
Loaded chronos-bolt (100 rows)
Loaded drift (100 rows)
Loaded lightgbm (100 rows)
Loaded moirai-2_0 (100 rows)
Loaded naive (100 rows)
Loaded seasonal_naive (100 rows)
Loaded stat_ensemble (89 rows)
Loaded sundial-base (100 rows)
Loaded tabpfn-ts (98 rows)
Loaded timesfm-2_5 (100 rows)
Loaded tirex (100 rows)
Loaded toto-1_0 (100 rows)

Total rows: 1674, models: 17


In [5]:
# Leaderboard over the full set of loaded results, using "Seasonal Naive" as
# the baseline model and the "drop" strategy for tasks with missing results.
lb = leaderboard(
    all_results,
    missing_strategy="drop",
    baseline_model="Seasonal Naive",
)
lb


86 tasks left after removing failures


Unnamed: 0_level_0,win_rate,skill_score,median_training_time_s,median_inference_time_s,training_corpus_overlap,num_failures
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Chronos-2,0.905523,0.482385,0.0,1.507381,0.0,0
TiRex,0.829215,0.436617,0.0,1.079862,0.0,0
TimesFM-2.5,0.81468,0.468996,0.0,6.964178,0.093023,0
Toto-1.0,0.720203,0.417651,0.0,75.566255,0.069767,0
Moirai-2.0,0.681686,0.412298,0.0,2.026198,0.255814,0
TabPFN-TS,0.668605,0.416681,0.0,285.794899,0.0,2
Chronos-Bolt,0.667878,0.402522,0.0,0.92386,0.0,0
Stat. Ensemble,0.484012,0.242431,0.0,689.223072,0.0,11
Sundial-Base,0.476017,0.344447,0.0,26.813575,0.0,0
AutoARIMA,0.4375,0.233493,0.0,172.388484,0.0,10


In [None]:
# Show which covid tasks are being removed.
# The case-insensitive "covid" mask is built once so the printed task list and
# the filtered frame below are guaranteed to agree; rows with a missing
# task_name are treated as non-covid (na=False) and therefore kept.
is_covid_task = all_results["task_name"].str.contains("covid", case=False, na=False)

covid_tasks = all_results.loc[is_covid_task, "task_name"].unique()
print(f"Removing {len(covid_tasks)} covid task(s): {sorted(covid_tasks)}")

# Recompute the leaderboard on everything except the covid tasks, with the
# same settings as the full leaderboard so the two are directly comparable.
no_covid = all_results[~is_covid_task]
lb_no_covid = leaderboard(
    no_covid, missing_strategy="drop", baseline_model="Seasonal Naive"
)
lb_no_covid


Removing 6 covid task(s): ['uk_covid_nation_1D/cumulative', 'uk_covid_nation_1D/new', 'uk_covid_nation_1W/cumulative', 'uk_covid_nation_1W/new', 'uk_covid_utla_1D/new', 'uk_covid_utla_1W/cumulative']
80 tasks left after removing failures


Unnamed: 0_level_0,win_rate,skill_score,median_training_time_s,median_inference_time_s,training_corpus_overlap,num_failures
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Chronos-2,0.920312,0.487936,0.0,1.903196,0.0,0
TiRex,0.845313,0.439874,0.0,1.17966,0.0,0
TimesFM-2.5,0.822656,0.473911,0.0,11.980575,0.1,0
Toto-1.0,0.717969,0.417456,0.0,81.847402,0.075,0
Moirai-2.0,0.678906,0.411496,0.0,2.336805,0.275,0
Chronos-Bolt,0.66875,0.402993,0.0,0.92386,0.0,0
TabPFN-TS,0.667969,0.42042,0.0,295.423591,0.0,2
Stat. Ensemble,0.485938,0.23304,0.0,689.223072,0.0,11
Sundial-Base,0.476562,0.352436,0.0,30.892611,0.0,0
AutoARIMA,0.453125,0.251558,0.0,172.388484,0.0,10


In [9]:
# Compare rank and skill_score changes between the full leaderboard (lb) and
# the covid-free leaderboard (lb_no_covid). Ranks are computed from skill_score
# (1 = highest), independently of the row order of either leaderboard.
compare = (
    lb[["skill_score"]]
    .rename(columns={"skill_score": "skill_score_full"})
    .join(
        lb_no_covid[["skill_score"]].rename(
            columns={"skill_score": "skill_score_no_covid"}
        )
    )
)

# method="min" gives tied models the same whole-number rank; the default
# "average" produces fractional ranks (e.g. 2.5) that an integer cast would
# silently truncate. The nullable Int64 dtype keeps the columns integral even
# if a skill_score is NaN or a model is absent from one leaderboard, where a
# plain int cast would raise or the column would degrade to float.
compare["rank_full"] = (
    lb["skill_score"].rank(ascending=False, method="min").astype("Int64")
)
compare["rank_no_covid"] = (
    lb_no_covid["skill_score"].rank(ascending=False, method="min").astype("Int64")
)

# Positive rank_change: the model moved up (smaller rank number) once the
# covid tasks were removed. Positive skill_score_delta: its skill score rose.
compare["rank_change"] = compare["rank_full"] - compare["rank_no_covid"]
compare["skill_score_delta"] = (
    compare["skill_score_no_covid"] - compare["skill_score_full"]
)
compare.sort_values("rank_no_covid")


Unnamed: 0_level_0,skill_score_full,skill_score_no_covid,rank_full,rank_no_covid,rank_change,skill_score_delta
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Chronos-2,0.482385,0.487936,1,1,0,0.005551
TimesFM-2.5,0.468996,0.473911,2,2,0,0.004915
TiRex,0.436617,0.439874,3,3,0,0.003257
TabPFN-TS,0.416681,0.42042,5,4,1,0.003739
Toto-1.0,0.417651,0.417456,4,5,-1,-0.000195
Moirai-2.0,0.412298,0.411496,6,6,0,-0.000802
Chronos-Bolt,0.402522,0.402993,7,7,0,0.000471
Sundial-Base,0.344447,0.352436,8,8,0,0.007989
AutoARIMA,0.233493,0.251558,12,9,3,0.018065
CatBoost (Recursive),0.247776,0.239527,9,10,-1,-0.008249
