In [1]:
import pandas as pd


In [2]:
# Location of the published result files; one CSV per model is fetched from here
base_url = "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/fev_bench/results/"

# File stems of the result CSVs to download ("<base_url><stem>.csv")
models = [
    "autoarima",
    "autoets",
    "autotheta",
    "catboost",
    "chronos-2",
    "chronos-bolt",
    "drift",
    "lightgbm",
    "moirai-2_0",
    "naive",
    "seasonal_naive",
    "stat_ensemble",
    "sundial-base",
    "tabpfn-ts",
    "timesfm-2_5",
    "tirex",
    "toto-1_0",
]


# Fetch each result file; a model whose file cannot be read is reported and skipped
all_dfs = []
for model in models:
    url = f"{base_url}{model}.csv"
    try:
        model_df = pd.read_csv(url)
    except Exception as e:
        print(f"Could not load {model}: {e}")
    else:
        all_dfs.append(model_df)

# Stack the per-model frames into one long table with a fresh integer index
all_models_df = pd.concat(all_dfs, ignore_index=True)

# Optional: locally produced results can be appended here before pivoting, e.g.
# autoar_df = pd.read_csv("autoar_full.csv")
# all_models_df = pd.concat([all_models_df, autoar_df], ignore_index=True)

# Wide MASE table: one row per task_name, one column per model_name
mase_pivot = all_models_df.set_index(["task_name", "model_name"])["MASE"].unstack(
    "model_name"
)
mase_pivot


model_name,AutoARIMA,AutoETS,AutoTheta,CatBoost (Recursive),Chronos-2,Chronos-Bolt,Drift,LightGBM (Recursive),Moirai-2.0,Naive,Seasonal Naive,Stat. Ensemble,Sundial-Base,TabPFN-TS,TiRex,TimesFM-2.5,Toto-1.0
task_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
ETT_15T,,1.429346,0.802182,0.825180,0.694858,0.703345,1.415088,0.872573,0.711906,1.367085,0.916890,,0.713893,0.762542,0.718797,0.729531,0.757774
ETT_1D,1.431189,1.437554,1.448520,1.442078,1.341377,1.346105,1.493071,1.477080,1.380091,1.461523,1.491547,1.412243,1.417425,1.499178,1.326809,1.361107,1.367949
ETT_1H,1.261615,1.602095,1.284691,1.288503,1.125959,1.126722,1.782332,1.288659,1.124004,1.718414,1.322716,1.251855,1.143929,1.177375,1.117793,1.123907,1.112898
ETT_1W,2.747149,2.639318,2.836819,3.366743,2.698527,2.626054,2.781878,3.002833,2.685136,2.620119,2.620119,2.687436,2.715395,2.809512,2.620382,2.621533,2.638848
LOOP_SEATTLE_1D,0.988868,1.001671,1.022273,1.162785,0.959951,0.985932,2.373790,1.181640,0.975795,2.104690,1.175380,0.998320,0.984767,0.962222,0.969123,0.950950,1.026765
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
us_consumption_1Y,4.258744,4.757438,5.619694,4.566710,4.565160,5.042649,5.515665,4.463984,5.823779,7.359485,7.359485,4.453579,6.480739,5.055184,4.554272,4.801082,4.799032
walmart,1.247382,1.722680,1.417878,1.317811,0.816740,0.967098,1.749125,1.245685,1.055917,1.524093,1.524093,1.356389,0.984219,0.831784,0.886230,0.861482,1.125762
world_co2_emissions,3.306235,3.299070,3.229452,3.195113,3.253441,3.372539,3.198378,3.185509,3.494639,3.697822,3.697822,3.176249,4.069429,3.270270,3.218939,3.434216,3.242603
world_life_expectancy,1.497425,1.535599,1.712455,1.484711,1.450189,1.626441,1.842012,1.497442,2.147670,2.253776,2.253776,1.540353,1.798777,1.347889,1.338548,1.401101,2.074207


In [3]:
# For every task (row) take the column label with the lowest MASE, then tally wins per model
mase_pivot.idxmin(axis="columns").value_counts()

Chronos-2               31
TimesFM-2.5             21
Toto-1.0                17
TabPFN-TS                8
TiRex                    7
Stat. Ensemble           3
Sundial-Base             2
Moirai-2.0               2
CatBoost (Recursive)     2
LightGBM (Recursive)     2
AutoARIMA                2
Naive                    1
AutoETS                  1
Drift                    1
Name: count, dtype: int64

In [4]:
from sklearn.preprocessing import StandardScaler
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import umap

# Create a pivot table: tasks as rows, models as columns.
# aggfunc="first" keeps the first MASE value when a (task, model) pair occurs more than once.
task_model_matrix = all_models_df.pivot_table(
    index="task_name", columns="model_name", values="MASE", aggfunc="first"
)

# Fill missing values with the mean of each task (row-wise mean over the models
# that did report a score). A single global constant must not be used here:
# MASE scales differ a lot between tasks, so a global fill value could make a
# missing model look like the best (or worst) model on a task.
# DataFrame.fillna with a Series fills per *column*, hence the transpose round-trip.
task_mean_mase = task_model_matrix.mean(axis=1)
task_model_matrix_filled = task_model_matrix.T.fillna(task_mean_mase).T

# Filter out tasks with extremely high average MASE (outliers)
mase_threshold = 100.0  # Adjust this threshold as needed
task_avg_mase = task_model_matrix_filled.mean(axis=1)
tasks_to_keep = task_avg_mase[task_avg_mase <= mase_threshold].index
task_model_filtered = task_model_matrix_filled.loc[tasks_to_keep]

# Report how many tasks the threshold removed, and which ones
print(f"Original matrix shape: {task_model_matrix_filled.shape}")
print(f"Filtered matrix shape: {task_model_filtered.shape}")
print(
    f"Removed {len(task_model_matrix_filled) - len(task_model_filtered)} tasks with avg MASE > {mase_threshold}"
)
print(
    f"Tasks removed: {list(task_model_matrix_filled.index.difference(tasks_to_keep))}"
)

# Standardize the task x model matrix column by column before embedding
scaler = StandardScaler()
task_model_scaled = scaler.fit_transform(task_model_filtered.values)

# Reduce each task's vector of model scores to 2-D; random_state is fixed so reruns match
reducer = umap.UMAP(
    n_neighbors=15,
    min_dist=0.1,
    n_components=2,
    random_state=42,
)
umap_coords = reducer.fit_transform(task_model_scaled)

print("\nUMAP embedding complete")

# Per-task summary over the model columns:
# which model has the lowest / highest MASE, and the min / max / mean MASE values
best_models = task_model_filtered.idxmin(axis="columns")
worst_models = task_model_filtered.idxmax(axis="columns")
best_mase = task_model_filtered.min(axis="columns")
worst_mase = task_model_filtered.max(axis="columns")
avg_mase = task_model_filtered.mean(axis="columns")

# Two model groups used throughout the comparison, keyed by the display names
# that appear in the "model_name" column.

# Group 1: models treated as "Foundation"
foundation_models = {
    "Chronos-2",
    "Chronos-Bolt",
    "Moirai-2.0",
    "Sundial-Base",
    "TabPFN-TS",
    "TiRex",
    "TimesFM-2.5",
    "Toto-1.0",
}

# Group 2: models treated as "Statistical/Tree"
statistical_tree_models = {
    "AutoARIMA",
    "AutoETS",
    "AutoTheta",
    "CatBoost (Recursive)",
    "Drift",
    "LightGBM (Recursive)",
    "Naive",
    "Seasonal Naive",
    "Stat. Ensemble",
}


def categorize_model(model_name):
    """Map a model display name to its category label.

    Returns "Foundation" if the name is in ``foundation_models``,
    "Statistical/Tree" if it is in ``statistical_tree_models``,
    and "Other" for any name found in neither set.
    """
    # Checked in this order, so the foundation group takes precedence
    for label, members in (
        ("Foundation", foundation_models),
        ("Statistical/Tree", statistical_tree_models),
    ):
        if model_name in members:
            return label
    return "Other"


# For each task, find the best model from each category
def get_best_per_category(row):
    """Return the lowest score per model category for one task row.

    ``row`` is indexed by model name. Only models that are present in
    ``row.index`` are considered.

    Returns a tuple ``(best_foundation, best_stat_tree)``; a category with no
    model present in the row yields ``float("inf")``.
    """

    def lowest_score(candidates):
        # Scores of the candidate models that actually appear in this row
        present = [row[name] for name in candidates if name in row.index]
        return min(present) if present else float("inf")

    return lowest_score(foundation_models), lowest_score(statistical_tree_models)


# One (best foundation MASE, best stat/tree MASE) tuple per task
best_per_category = task_model_filtered.apply(get_best_per_category, axis=1)
best_foundation_mase = best_per_category.apply(lambda pair: pair[0])
best_stat_tree_mase = best_per_category.apply(lambda pair: pair[1])

# Competitive advantage as a log ratio of the two category-best scores:
#   > 0  -> the best foundation model has the lower MASE
#   < 0  -> the best statistical/tree model has the lower MASE
advantage = np.log(best_stat_tree_mase / best_foundation_mase)

# Overall winner per task; a tie is attributed to "Statistical/Tree"
foundation_is_better = best_foundation_mase < best_stat_tree_mase
overall_winner = np.where(foundation_is_better, "Foundation", "Statistical/Tree")

# Category of the single best model on each task
best_model_category = best_models.apply(categorize_model)


# Get top 3 and bottom 3 models for each task
def get_top_bottom_models(row):
    """Summarise the three lowest- and three highest-scoring models of a row.

    ``row`` maps model name -> score. Returns ``(top3, bottom3)`` where each
    element is a comma-separated string of ``"name (score)"`` entries with the
    score formatted to two decimals, listed in ascending score order.
    """
    ranked = row.sort_values()

    def describe(scores):
        return ", ".join(f"{name} ({score:.2f})" for name, score in scores.items())

    return describe(ranked.head(3)), describe(ranked.tail(3))


# One (top-3 summary, bottom-3 summary) tuple of strings per task
top_bottom = task_model_filtered.apply(get_top_bottom_models, axis=1)

# Assemble everything the scatter plot and its hover box need, one row per task
plot_columns = {
    "UMAP1": umap_coords[:, 0],
    "UMAP2": umap_coords[:, 1],
    "Task": task_model_filtered.index,
    "Best Model": best_models.values,
    "Winner": overall_winner,
    "Advantage": advantage.values,
    "Best Foundation MASE": best_foundation_mase.values,
    "Best Stat/Tree MASE": best_stat_tree_mase.values,
    "Best Overall MASE": best_mase.values,
    "Worst Model": worst_models.values,
    "Worst MASE": worst_mase.values,
    "Avg MASE": avg_mase.values,
    "Top 3": [summary[0] for summary in top_bottom],
    "Bottom 3": [summary[1] for summary in top_bottom],
}
plot_df = pd.DataFrame(plot_columns)

# How many tasks each category wins, with its share of all plotted tasks
winner_counts = plot_df["Winner"].value_counts()
print("\nBest model category:")
for cat, count in winner_counts.items():
    print(f"  {cat}: {count} tasks ({count / len(plot_df) * 100:.1f}%)")

# Distribution of the log-ratio advantage across tasks
# (negative values favour Stat/Tree, positive values favour Foundation)
print("\nCompetitive Advantage (log ratio):")
print(f"  Mean: {advantage.mean():.3f}")
print(f"  Median: {advantage.median():.3f}")
print(f"  Min: {advantage.min():.3f} (Stat/Tree dominates)")
print(f"  Max: {advantage.max():.3f} (Foundation dominates)")
print(f"  10th percentile: {np.percentile(advantage, 10):.3f}")
print(f"  90th percentile: {np.percentile(advantage, 90):.3f}")

# Tasks where the two categories are nearly tied
is_close = np.abs(advantage) < 0.1
print(
    f"  Close competitions (|advantage| < 0.1): {is_close.sum()} tasks"
)


# Convert log ratio to percentage improvement
# Positive advantage = Foundation wins, Negative = Stat/Tree wins
def categorize_advantage(adv):
    """Turn a log-ratio advantage into a labelled percentage band.

    ``adv > 0`` is reported as a Foundation win; anything else (including 0)
    is reported as a Stat/Tree win. The magnitude is converted to a percentage
    via ``(exp(|adv|) - 1) * 100`` and bucketed into <1%, 1-5%, 5-15%, 15-30%
    or >30%. Returns the band label, e.g. ``"Foundation 5-15%"``.
    """
    if adv > 0:  # Foundation wins
        magnitude = adv
        labels = (
            "Foundation <1%",
            "Foundation 1-5%",
            "Foundation 5-15%",
            "Foundation 15-30%",
            "Foundation >30%",
        )
    else:  # Stat/Tree wins
        magnitude = -adv
        labels = (
            "Stat/Tree <1%",
            "Stat/Tree 1-5%",
            "Stat/Tree 5-15%",
            "Stat/Tree 15-30%",
            "Stat/Tree >30%",
        )

    pct_improvement = (np.exp(magnitude) - 1) * 100

    # Walk the upper bounds in ascending order; the first band that fits wins
    for upper_bound, label in zip((1, 5, 15, 30), labels):
        if pct_improvement < upper_bound:
            return label
    return labels[-1]


# Label every task with its advantage band and attach the label to the plot table
advantage_category = advantage.apply(categorize_advantage)
plot_df["Advantage_Category"] = advantage_category.values

# Tally tasks per band, listed in sorted label order
print("\nAdvantage by category:")
category_counts = plot_df["Advantage_Category"].value_counts().sort_index()
for cat, count in category_counts.items():
    print(f"  {cat}: {count} tasks ({count / len(plot_df) * 100:.1f}%)")

# Legend order: strongest Stat/Tree win first, strongest Foundation win last
category_order = [
    "Stat/Tree >30%",
    "Stat/Tree 15-30%",
    "Stat/Tree 5-15%",
    "Stat/Tree 1-5%",
    "Stat/Tree <1%",
    "Foundation <1%",
    "Foundation 1-5%",
    "Foundation 5-15%",
    "Foundation 15-30%",
    "Foundation >30%",
]

# Diverging palette aligned position-by-position with category_order:
# blues for Stat/Tree wins, white for near-ties, reds for Foundation wins
category_colors = [
    "#08519C",  # Dark blue
    "#3182BD",  # Medium blue
    "#6BAED6",  # Light blue
    "#BDD7E7",  # Very light blue
    "#FFFFFF",  # White
    "#FFFFFF",  # White
    "#FCAE91",  # Very light red
    "#FB6A4A",  # Light red
    "#DE2D26",  # Medium red
    "#A50F15",  # Dark red
]

# Band label -> marker colour
color_discrete_map = dict(zip(category_order, category_colors))

# Create interactive scatter plot with discrete color bands.
# Each point is one task placed at its UMAP coordinates, labelled with the task
# name and coloured by its advantage band.
fig = px.scatter(
    plot_df,
    x="UMAP1",
    y="UMAP2",
    text="Task",
    color="Advantage_Category",
    hover_name="Task",
    # True/False toggles a column in the hover box; a string is a number format
    hover_data={
        "Task": False,
        "Winner": True,
        "Best Model": True,
        "Advantage": ":.3f",
        "Advantage_Category": True,
        "Best Foundation MASE": ":.2f",
        "Best Stat/Tree MASE": ":.2f",
        "Best Overall MASE": ":.2f",
        "Avg MASE": ":.2f",
        "Top 3": True,
        "Bottom 3": True,
        "UMAP1": False,
        "UMAP2": False,
    },
    # "\u2264" is the less-than-or-equal sign; written as an escape so the title
    # cannot be corrupted by a file-encoding mix-up (it previously rendered as mojibake)
    title=f"UMAP of Tasks: Foundation vs Statistical/Tree Models (filtered: avg MASE \u2264 {mase_threshold})",
    width=1400,
    height=800,
    color_discrete_map=color_discrete_map,
    category_orders={"Advantage_Category": category_order},
)

# Marker and label appearance: small task labels above semi-transparent,
# white-outlined markers
fig.update_traces(
    textposition="top center",
    textfont_size=6,
    marker={
        "size": 10,
        "opacity": 0.7,
        "line": {"width": 1, "color": "white"},
    },
)

# Axis titles, fonts, and a legend placed just right of the plot area, vertically centred
fig.update_layout(
    font={"size": 10},
    hoverlabel={"font_size": 11},
    xaxis_title="UMAP 1",
    yaxis_title="UMAP 2",
    legend={
        "title": {"text": "Percentage<br>Improvement"},
        "yanchor": "middle",
        "y": 0.5,
        "xanchor": "left",
        "x": 1.01,
    },
)

fig.show()

  from .autonotebook import tqdm as notebook_tqdm


Original matrix shape: (100, 17)
Filtered matrix shape: (100, 17)
Removed 0 tasks with avg MASE > 100.0
Tasks removed: []


  warn(



UMAP embedding complete

Best model category:
  Foundation: 88 tasks (88.0%)
  Statistical/Tree: 12 tasks (12.0%)

Competitive Advantage (log ratio):
  Mean: 0.189
  Median: 0.090
  Min: -0.224 (Stat/Tree dominates)
  Max: 2.182 (Foundation dominates)
  10th percentile: -0.014
  90th percentile: 0.409
  Close competitions (|advantage| < 0.1): 48 tasks

Advantage by category:
  Foundation 1-5%: 13 tasks (13.0%)
  Foundation 15-30%: 18 tasks (18.0%)
  Foundation 5-15%: 32 tasks (32.0%)
  Foundation <1%: 5 tasks (5.0%)
  Foundation >30%: 20 tasks (20.0%)
  Stat/Tree 1-5%: 5 tasks (5.0%)
  Stat/Tree 15-30%: 4 tasks (4.0%)
  Stat/Tree 5-15%: 2 tasks (2.0%)
  Stat/Tree <1%: 1 tasks (1.0%)


In [5]:
# Rank tasks by their average MASE across models (highest first) to spot
# tasks whose scale dwarfs the rest
task_avg_mase = task_model_matrix_filled.mean(axis="columns").sort_values(
    ascending=False
)
print("Top 10 tasks with highest average MASE:")
print(task_avg_mase.head(10))

# Median and mean of the per-task averages
print(f"\nMedian MASE: {task_avg_mase.median():.2f}")
print(f"Mean MASE: {task_avg_mase.mean():.2f}")

Top 10 tasks with highest average MASE:
task_name
uk_covid_utla_1W/cumulative      20.652829
uk_covid_nation_1D/cumulative    13.158463
fred_md_2025/macro                6.881223
uk_covid_nation_1W/new            5.865338
us_consumption_1Y                 5.263334
uk_covid_utla_1D/new              5.232381
fred_md_2025/cee                  5.208995
redset_15T                        5.063269
uk_covid_nation_1W/cumulative     4.926245
fred_qd_2025/macro                4.557331
dtype: float64

Median MASE: 1.21
Mean MASE: 1.94


Tasks that make up the "foundation models fail" benchmark (tasks where foundation models do not clearly win)

In [None]:
# Select tasks where the foundation group either loses (any "Stat/Tree" band)
# or wins by less than 1% (a "<1%" band)
is_non_dominant = plot_df["Advantage_Category"].str.contains("<1%|Stat/Tree")
non_dominant = plot_df[is_non_dominant].copy()

# Most Stat/Tree-favourable tasks first (ascending advantage)
non_dominant_sorted = non_dominant.sort_values(by="Advantage")

non_dominant_sorted["Task"].values


array(['uk_covid_nation_1W/cumulative', 'uk_covid_utla_1W/cumulative',
       'world_tourism', 'uk_covid_nation_1D/cumulative',
       'us_consumption_1Y', 'rohlik_orders_1W',
       'favorita_transactions_1W', 'gvar', 'rohlik_orders_1D',
       'us_consumption_1M', 'world_co2_emissions', 'ETT_1W', 'ecdc_ili',
       'ercot_1M', 'hospital_admissions_1W', 'hospital_admissions_1D',
       'hierarchical_sales_1D'], dtype=object)