# Run an initial analysis of results and produce aggregated results files

In [2]:
import pandas as pd
import itertools
import matplotlib.pyplot as plt

pd.options.display.max_rows = 400
from metadata_utils import get_tuned_alg_perf

In [3]:
from pathlib import Path
metadata_folder = Path("../TabSurvey")

metadataset_df = pd.read_csv(metadata_folder / "metadataset.csv")
errors_df = pd.read_csv(metadata_folder / "metadataset_errors.csv")


In [4]:
# keep only binary and classification datasets
print(metadataset_df["target_type"].value_counts())

metadataset_df = metadataset_df.loc[metadataset_df["target_type"].isin(["binary", "classification"]), :]

binary            566550
classification    417570
regression        102790
Name: target_type, dtype: int64


In [5]:
# check that all dataset-alg pairs have results for all 10 folds
print(metadataset_df.groupby(["alg_name", "dataset_name"]).agg({"dataset_fold_id": lambda x: len(set(x))}).sort_values("dataset_fold_id"))

                                              dataset_fold_id
alg_name    dataset_name                                     
CatBoost    openml__APSFailure__168868                     10
TabNet      openml__higgs__146606                          10
            openml__hill-valley__145847                    10
            openml__house_16H__3686                        10
            openml__ilpd__9971                             10
...                                                       ...
MLP         openml__MiniBooNE__168335                      10
            openml__PhishingWebsites__14952                10
            openml__Satellite__167211                      10
            openml__Devnagari-Script__167121               10
rtdl_ResNet openml__yeast__145793                          10

[3269 rows x 1 columns]


### Get dataset-alg pairs with no results

In [6]:
# first get num hparam samples for each alg-dataset pair
hparam_counts = metadataset_df.groupby(["alg_name", "dataset_name"]).agg({"hparam_source": lambda x: len(set(x))}).reset_index()
hparam_counts.columns = ["alg_name", "dataset_name", "num_hparam_samples"] 
print(hparam_counts)

         alg_name                           dataset_name  num_hparam_samples
0        CatBoost             openml__APSFailure__168868                  30
1        CatBoost  openml__Amazon_employee_access__34539                  30
2        CatBoost             openml__Australian__146818                  30
3        CatBoost              openml__Bioresponse__9910                  30
4        CatBoost          openml__Census-Income__168340                  30
...           ...                                    ...                 ...
3264  rtdl_ResNet         openml__walking-activity__9945                   7
3265  rtdl_ResNet    openml__wall-robot-navigation__9960                  30
3266  rtdl_ResNet                     openml__wdbc__9946                  30
3267  rtdl_ResNet                   openml__wilt__146820                  30
3268  rtdl_ResNet                  openml__yeast__145793                  30

[3269 rows x 3 columns]


In [7]:
# for caught exceptions, add an indicator for certain errors
# errors_df.head()
errors_df.loc[:, "has_exception"] = ~errors_df["exception"].isna()
errors_df.loc[:, "timeout_excep"] = errors_df["exception"].str.contains("TimeoutException")
errors_df.loc[:, "CUDA_memory_excep"] = errors_df["exception"].str.contains("CUDA out of memory")
errors_df.loc[:, "CUDA_config_excep"] = errors_df["exception"].str.contains("CUDA error: invalid configuration argument")
errors_df.loc[:, "allocate_memory_excep"] = errors_df["exception"].str.contains("Cannot allocate memory")

# aggregate this by alg and dataset
errors_df_agg = errors_df.groupby(["alg_name", "dataset_name"]).agg(
    {
        "has_exception": any,
        "timeout_excep": any,
        "CUDA_memory_excep": any,
        "CUDA_config_excep": any,
        "allocate_memory_excep": any,
    }
)

In [8]:
# get lists of all datasets and all algs
all_datasets = list(hparam_counts["dataset_name"].unique())
all_algs = list(hparam_counts["alg_name"].unique())

print(f"num datasets: {len(all_datasets)}")
print(f"num algs: {len(all_algs)}")

# now create a new df with each of these pairs
d_tmp = []
a_tmp = []
import itertools
for d, a in itertools.product(all_datasets, all_algs):
    d_tmp.append(d)
    a_tmp.append(a)

tmp_df = pd.DataFrame(
    {
        "alg_name": a_tmp,
        "dataset_name": d_tmp,
    }
)

hparam_counts = hparam_counts.merge(tmp_df, on=["alg_name", "dataset_name"], how="right")
hparam_counts.loc[hparam_counts["num_hparam_samples"].isna(), "num_hparam_samples"] = 0

hparam_counts = hparam_counts.merge(errors_df_agg, on=["alg_name", "dataset_name"], how="left")


num datasets: 176
num algs: 22


In [9]:
hparam_counts[hparam_counts["num_hparam_samples"] == 0]["alg_name"].value_counts()

TabPFNModel           113
NAM                    96
DeepFM                 86
TabTransformer         52
NODE                   38
SAINT                  38
rtdl_FTTransformer     37
SVM                    33
DANet                  29
VIME                   13
STG                    12
CatBoost               11
LightGBM               11
KNN                     9
TabNet                  8
LinearModel             8
RandomForest            3
rtdl_ResNet             2
XGBoost                 2
DecisionTree            1
MLP                     1
Name: alg_name, dtype: int64

In [10]:
# merge in target type

target_types = metadataset_df.loc[:, ["dataset_name", "target_type"]].drop_duplicates()

hparam_counts = hparam_counts.merge(target_types, on="dataset_name", how="left")
hparam_counts.head()

Unnamed: 0,alg_name,dataset_name,num_hparam_samples,has_exception,timeout_excep,CUDA_memory_excep,CUDA_config_excep,allocate_memory_excep,target_type
0,CatBoost,openml__APSFailure__168868,30.0,,,,,,binary
1,DANet,openml__APSFailure__168868,0.0,True,True,False,False,False,binary
2,DecisionTree,openml__APSFailure__168868,30.0,,,,,,binary
3,DeepFM,openml__APSFailure__168868,30.0,,,,,,binary
4,KNN,openml__APSFailure__168868,7.0,True,True,False,False,False,binary


In [11]:
hparam_counts.to_csv("./results/result_summary.csv", index=False)
hparam_counts.loc[hparam_counts["num_hparam_samples"] == 0, :].to_csv("./results/failed_experiments.csv", index=False)

#### Look at specific failed experiments

In [12]:
failed_expts = pd.read_csv("./results/failed_experiments.csv")

In [13]:
print(len(failed_expts[(failed_expts["alg_name"] == "SAINT")]))
failed_expts[(failed_expts["alg_name"] == "SAINT")] # & (failed_expts["timeout_excep"].astype(str) != "True")]

38


Unnamed: 0,alg_name,dataset_name,num_hparam_samples,has_exception,timeout_excep,CUDA_memory_excep,CUDA_config_excep,allocate_memory_excep,target_type
3,SAINT,openml__APSFailure__168868,0.0,True,True,False,False,False,binary
11,SAINT,openml__Bioresponse__9910,0.0,,,,,,binary
17,SAINT,openml__Census-Income__168340,0.0,True,True,False,False,False,binary
32,SAINT,openml__Fashion-MNIST__146825,0.0,True,True,False,False,False,classification
42,SAINT,openml__Internet-Advertisements__167125,0.0,,,,,,binary
60,SAINT,openml__MiniBooNE__168335,0.0,True,True,False,False,False,binary
76,SAINT,openml__airlines__189354,0.0,True,True,False,False,False,binary
90,SAINT,openml__albert__189356,0.0,True,True,False,False,False,binary
125,SAINT,openml__bank-marketing__9899,0.0,,,,,,binary
152,SAINT,openml__christine__168908,0.0,,,,,,binary


These SAINT failures all occurred on A100s, so we are done.

In [49]:
print(len(failed_expts[(failed_expts["alg_name"] == "NODE")]))
failed_expts[(failed_expts["alg_name"] == "NODE")] # & (failed_expts["timeout_excep"].astype(str) != "True")]

40


Unnamed: 0,alg_name,dataset_name,num_hparam_samples,has_exception,timeout_excep,CUDA_memory_excep,CUDA_config_excep,allocate_memory_excep,target_type
2,NODE,openml__APSFailure__168868,0.0,True,True,False,False,False,binary
6,NODE,openml__Amazon_employee_access__34539,0.0,True,True,False,False,False,binary
45,NODE,openml__Fashion-MNIST__146825,0.0,True,True,False,False,False,classification
52,NODE,openml__GesturePhaseSegmentationProcessed__14969,0.0,True,True,False,False,False,classification
62,NODE,openml__JapaneseVowels__3510,0.0,True,True,False,False,False,classification
73,NODE,openml__MiniBooNE__168335,0.0,True,True,False,False,False,binary
116,NODE,openml__albert__189356,0.0,True,True,False,False,False,binary
127,NODE,openml__aloi__12732,0.0,True,True,False,False,False,regression
152,NODE,openml__artificial-characters__14964,0.0,True,True,False,False,False,classification
196,NODE,openml__chess__3952,0.0,True,True,True,False,False,classification


In [15]:
for d in failed_expts[(failed_expts["alg_name"] == "NODE")]["dataset_name"].values:
    print(d)

openml__APSFailure__168868
openml__Amazon_employee_access__34539
openml__Fashion-MNIST__146825
openml__GesturePhaseSegmentationProcessed__14969
openml__JapaneseVowels__3510
openml__MiniBooNE__168335
openml__albert__189356
openml__artificial-characters__14964
openml__chess__3952
openml__dilbert__168909
openml__fabert__168910
openml__first-order-theorem-proving__9985
openml__gas-drift-different-concentrations__9987
openml__gas-drift__9986
openml__har__14970
openml__jannis__168330
openml__jungle_chess_2pcs_raw_endgame_complete__167119
openml__kropt__2076
openml__ldpa__9974
openml__letter__6
openml__mnist_784__3573
openml__nursery__9892
openml__one-hundred-plants-texture__9956
openml__optdigits__28
openml__poker-hand__9890
openml__satimage__2074
openml__texture__125922
openml__volkert__168331
openml__walking-activity__9945
openml__isolet__3481
openml__pendigits__32
openml__robert__168332
openml__CIFAR_10__167124
openml__Devnagari-Script__167121
openml__covertype__7593
openml__helena__16832

**Note:** it looks like most NODE experiments failed due to runtime. In other words, for these datasets NODE did not complete a single train/test cycle for a single hyperparameter set in 10 hours. There are two CUDA memory issues here for (I think) some of the largest datasets we have. This is expected.

In [50]:
print(len(failed_expts[(failed_expts["alg_name"] == "DeepFM")]))
failed_expts[(failed_expts["alg_name"] == "DeepFM")] # & (failed_expts["timeout_excep"].astype(str) != "True")]

88


Unnamed: 0,alg_name,dataset_name,num_hparam_samples,has_exception,timeout_excep,CUDA_memory_excep,CUDA_config_excep,allocate_memory_excep,target_type
23,DeepFM,openml__Census-Income__168340,0.0,True,True,False,False,False,binary
40,DeepFM,openml__Fashion-MNIST__146825,0.0,,,,,,classification
50,DeepFM,openml__GesturePhaseSegmentationProcessed__14969,0.0,,,,,,classification
54,DeepFM,openml__Internet-Advertisements__167125,0.0,True,True,False,False,False,binary
60,DeepFM,openml__JapaneseVowels__3510,0.0,,,,,,classification
64,DeepFM,openml__LED-display-domain-7digit__125921,0.0,,,,,,classification
69,DeepFM,openml__MiceProtein__146800,0.0,,,,,,classification
101,DeepFM,openml__airlines__189354,0.0,True,True,False,False,False,binary
112,DeepFM,openml__albert__189356,0.0,True,True,False,False,False,binary
137,DeepFM,openml__analcatdata_authorship__3549,0.0,,,,,,classification


These DeepFM issues look like timeout and/or CUDA memory issues. we could easily just rerun with A100s.

In [53]:
print(len(failed_expts[(failed_expts["alg_name"] == "rtdl_FTTransformer") & (failed_expts["target_type"] != "regression")]))
failed_expts[(failed_expts["alg_name"] == " ") & (failed_expts["target_type"] != "regression")]

37


Unnamed: 0,alg_name,dataset_name,num_hparam_samples,has_exception,timeout_excep,CUDA_memory_excep,CUDA_config_excep,allocate_memory_excep,target_type
5,rtdl_FTTransformer,openml__APSFailure__168868,0.0,True,False,True,False,False,binary
21,rtdl_FTTransformer,openml__Bioresponse__9910,0.0,True,False,True,False,False,binary
31,rtdl_FTTransformer,openml__Census-Income__168340,0.0,True,False,True,False,False,binary
49,rtdl_FTTransformer,openml__Fashion-MNIST__146825,0.0,True,False,True,False,False,classification
59,rtdl_FTTransformer,openml__Internet-Advertisements__167125,0.0,True,False,True,False,False,binary
78,rtdl_FTTransformer,openml__MiniBooNE__168335,0.0,True,False,True,False,False,binary
110,rtdl_FTTransformer,openml__airlines__189354,0.0,True,False,True,False,False,binary
124,rtdl_FTTransformer,openml__albert__189356,0.0,True,False,True,False,False,binary
203,rtdl_FTTransformer,openml__christine__168908,0.0,True,False,True,False,False,binary
228,rtdl_FTTransformer,openml__cnae-9__9981,0.0,True,False,True,False,False,classification


There are 37 failed experiments, all apparently due to CUDA memory issues...

In [54]:
print(len(failed_expts[(failed_expts["alg_name"] == "DANet") & (failed_expts["target_type"] != "regression")]))
failed_expts[(failed_expts["alg_name"] == "DANet") & (failed_expts["target_type"] != "regression")]

29


Unnamed: 0,alg_name,dataset_name,num_hparam_samples,has_exception,timeout_excep,CUDA_memory_excep,CUDA_config_excep,allocate_memory_excep,target_type
0,DANet,openml__APSFailure__168868,0.0,True,True,False,False,False,binary
22,DANet,openml__Census-Income__168340,0.0,True,True,False,False,False,binary
39,DANet,openml__Fashion-MNIST__146825,0.0,True,True,False,False,False,classification
71,DANet,openml__MiniBooNE__168335,0.0,True,True,False,False,False,binary
98,DANet,openml__adult__7592,0.0,True,True,False,False,False,binary
100,DANet,openml__airlines__189354,0.0,True,True,False,False,False,binary
111,DANet,openml__albert__189356,0.0,True,True,False,False,False,binary
193,DANet,openml__chess__3952,0.0,True,True,False,False,False,classification
241,DANet,openml__connect-4__146195,0.0,True,True,False,False,False,classification
281,DANet,openml__electricity__219,0.0,True,True,False,False,False,binary


DANet: there are 29 failed experiments, all due to timeout (no indication of CUDA memory error)

In [55]:
print(len(failed_expts[(failed_expts["alg_name"] == "SVM") & (failed_expts["target_type"] != "regression")]))
failed_expts[(failed_expts["alg_name"] == "SVM") & (failed_expts["target_type"] != "regression")]

33


Unnamed: 0,alg_name,dataset_name,num_hparam_samples,has_exception,timeout_excep,CUDA_memory_excep,CUDA_config_excep,allocate_memory_excep,target_type
7,SVM,openml__Amazon_employee_access__34539,0.0,,,,,,binary
27,SVM,openml__Census-Income__168340,0.0,True,True,False,False,False,binary
47,SVM,openml__Fashion-MNIST__146825,0.0,True,True,False,False,False,classification
67,SVM,openml__MagicTelescope__3954,0.0,,,,,,binary
76,SVM,openml__MiniBooNE__168335,0.0,,,,,,binary
105,SVM,openml__airlines__189354,0.0,True,True,False,False,False,binary
119,SVM,openml__albert__189356,0.0,True,True,False,False,False,binary
166,SVM,openml__bank-marketing__9899,0.0,,,,,,binary
244,SVM,openml__connect-4__146195,0.0,True,True,False,False,False,classification
279,SVM,openml__eeg-eye-state__14951,0.0,,,,,,binary


skelearn's SVM classifier does not scale well to large datasets.

In [57]:
print(len(failed_expts[(failed_expts["alg_name"] == "TabTransformer") & (failed_expts["target_type"] != "regression")]))
failed_expts[(failed_expts["alg_name"] == "TabTransformer") & (failed_expts["target_type"] != "regression")]

52


Unnamed: 0,alg_name,dataset_name,num_hparam_samples,has_exception,timeout_excep,CUDA_memory_excep,CUDA_config_excep,allocate_memory_excep,target_type
9,TabTransformer,openml__Amazon_employee_access__34539,0.0,True,False,False,True,False,binary
58,TabTransformer,openml__Internet-Advertisements__167125,0.0,True,False,True,False,False,binary
66,TabTransformer,openml__LED-display-domain-7digit__125921,0.0,True,False,False,False,False,classification
81,TabTransformer,openml__PhishingWebsites__14952,0.0,True,False,False,True,False,binary
108,TabTransformer,openml__airlines__189354,0.0,True,True,True,False,False,binary
122,TabTransformer,openml__albert__189356,0.0,True,True,True,False,False,binary
139,TabTransformer,openml__analcatdata_boxing1__3540,0.0,True,False,False,True,False,binary
140,TabTransformer,openml__analcatdata_chlamydia__3739,0.0,True,False,False,True,False,binary
143,TabTransformer,openml__analcatdata_dmft__3560,0.0,True,False,False,True,False,classification
154,TabTransformer,openml__artificial-characters__14964,0.0,True,False,False,False,False,classification


## Analyze num. results per dataset and alg

In [14]:
# for each alg, for how many datasets are there results?
print(f"for each alg: number of datasets with results (out of {len(metadataset_df['dataset_name'].unique())})")
print(metadataset_df.groupby("alg_name")["dataset_name"].apply(lambda x: len(set(x))).sort_values())

print(f"for each dataset: number of algs with results (out of {len(metadataset_df['alg_name'].unique())})")
metadataset_df.groupby("dataset_name")["alg_name"].apply(lambda x: len(set(x))).sort_values()

for each alg: number of datasets with results (out of 176)
alg_name
TabPFNModel            63
NAM                    80
DeepFM                 90
TabTransformer        124
NODE                  138
SAINT                 138
rtdl_FTTransformer    139
SVM                   143
DANet                 147
VIME                  163
STG                   164
CatBoost              165
LightGBM              165
KNN                   167
LinearModel           168
TabNet                168
RandomForest          173
XGBoost               174
rtdl_ResNet           174
MLP                   175
DecisionTree          175
rtdl_MLP              176
Name: dataset_name, dtype: int64
for each dataset: number of algs with results (out of 22)


dataset_name
openml__poker-hand__9890                                   5
openml__covertype__7593                                    7
openml__albert__189356                                     8
openml__Devnagari-Script__167121                           8
openml__helena__168329                                    10
openml__CIFAR_10__167124                                  10
openml__Fashion-MNIST__146825                             11
openml__walking-activity__9945                            11
openml__airlines__189354                                  11
openml__Census-Income__168340                             12
openml__robert__168332                                    12
openml__riccardo__168338                                  12
openml__guillermo__168337                                 12
openml__mnist_784__3573                                   12
openml__ldpa__9974                                        13
openml__jannis__168330                                    14
openml__jun

## Remove datasets with few results

In [6]:
print(f"for each dataset: number of algs with results")
alg_counts = metadataset_df.groupby("dataset_name")["alg_name"].agg(lambda x: len(set(x))).sort_values()
print(alg_counts)

for each dataset: number of algs with results
dataset_name
openml__poker-hand__9890                                   5
openml__covertype__7593                                    7
openml__albert__189356                                     8
openml__Devnagari-Script__167121                           8
openml__helena__168329                                     9
openml__CIFAR_10__167124                                  10
openml__walking-activity__9945                            11
openml__airlines__189354                                  11
openml__Fashion-MNIST__146825                             11
openml__guillermo__168337                                 12
openml__riccardo__168338                                  12
openml__mnist_784__3573                                   12
openml__robert__168332                                    12
openml__Census-Income__168340                             12
openml__ldpa__9974                                        13
openml__higgs__146606     

In [7]:
keep_datasets = list(alg_counts[alg_counts >= 10].index)
drop_datasets = alg_counts[alg_counts < 10]

print(f"dropping {len(drop_datasets)} datasets:")
print(drop_datasets)

dropping 5 datasets:
dataset_name
openml__poker-hand__9890            5
openml__covertype__7593             7
openml__albert__189356              8
openml__Devnagari-Script__167121    8
openml__helena__168329              9
Name: alg_name, dtype: int64


In [8]:
print("for each alg: number of datasets with results")
dataset_counts = metadataset_df.groupby("alg_name")["dataset_name"].agg(lambda x: len(set(x))).sort_values()
print(dataset_counts)

for each alg: number of datasets with results
alg_name
TabPFNModel            63
NAM                    80
DeepFM                 90
SAINT                 106
TabTransformer        124
NODE                  138
rtdl_FTTransformer    139
SVM                   143
DANet                 147
VIME                  163
STG                   164
CatBoost              165
LightGBM              165
KNN                   167
LinearModel           168
TabNet                168
RandomForest          173
XGBoost               174
rtdl_ResNet           174
MLP                   175
DecisionTree          175
rtdl_MLP              176
Name: dataset_name, dtype: int64


We will keep all algs, regardless of how many datasets they have results for.

In [1]:
metadataset_df = metadataset_df.loc[metadataset_df["dataset_name"].isin(keep_datasets), :]

print("after removing datasets: number of datasets with results")
dataset_counts = metadataset_df.groupby("alg_name")["dataset_name"].agg(lambda x: len(set(x))).sort_values()
print(dataset_counts)

NameError: name 'metadataset_df' is not defined

# Get tuned algorithms for a given metric

Report the average & median test performance, over all folds. Note that each alg is tuned for each fold separately.

In [10]:
metric_list = [
    "Accuracy",
    "F1",
    "Log Loss",
]

obj_type_list = [
    "maximize",
    "maximize",
    "minimize",
]
result_df_dict = {}



In [11]:
# add a copy of each "default" hparam row, to treat this as a separate alg
default_rows = metadataset_df.loc[metadataset_df["hparam_source"] == "default"].copy()
default_rows.loc[:, "alg_name"] = default_rows["alg_name"].apply(lambda x: x + "_default")

# remove TabPFN and LinearModel, since these only have one hparam set
default_rows = default_rows.loc[~(default_rows["alg_name"].str.contains("TabPFNModel") | default_rows["alg_name"].str.contains("LinearModel")), :]

# append these to the metadataset
metadataset_df = pd.concat([metadataset_df, default_rows], ignore_index=True)

In [43]:
tuned_result_dfs = {}
for drop_default in [True, False]:
    for i, (metric, objective_type) in enumerate(zip(metric_list, obj_type_list)):

        test_metric_col = metric + "__test"

        if drop_default:
            df = metadataset_df.loc[~metadataset_df["alg_name"].str.contains("_default"), :].copy()
        else:
            df = metadataset_df.copy()

        tuned_alg_perf = get_tuned_alg_perf(df, metric=metric)
        # NOTE: this "tunes" each algorithm for each training fold separately. so each of the 10 folds might use different hparams.
        tuned_result_dfs[metric] = tuned_alg_perf

        ##############################
        ### STEP 1: TREAT EACH FOLD AS SEPARATE DATASET

        result_col = test_metric_col
        
        # for each dataset, find the min and max metrics over all tuned algs
        overall_bounds = tuned_alg_perf.groupby("dataset_fold_id").agg({result_col: ["min", "max"]}).reset_index()

        # rename the multiindex cols
        new_cols = []
        for c in overall_bounds.columns:
            if c[1] == "":
                new_cols.append(c[0])
            else:
                new_cols.append("_".join(c))

        overall_bounds.columns = new_cols

        tuned_alg_perf = tuned_alg_perf.merge(overall_bounds, on="dataset_fold_id", how="left")

        # add normalized metric
        tuned_alg_perf.loc[:, "normalized_" + result_col] = (tuned_alg_perf[result_col] - tuned_alg_perf[result_col + "_min"]) / (tuned_alg_perf[result_col + "_max"] - tuned_alg_perf[result_col + "_min"])

        # rank all algs for each dataset
        ascending = False if objective_type == "maximize" else True
        
        tuned_alg_perf.loc[:, f"{metric}_rank"] = tuned_alg_perf.groupby(["dataset_fold_id"])[result_col].rank(method="min", ascending=ascending).values

        # keep these cols to merge
        merge_cols = [
            "alg_name", 
            "dataset_fold_id", 
            "normalized_" + result_col,
            f"{metric}_rank",
            result_col + "_min",
            result_col + "_max"
        ]

        if i == 0:
            fold_tuned_df = tuned_alg_perf.copy()
        else:
            fold_tuned_df = fold_tuned_df.merge(tuned_alg_perf[merge_cols], on=["alg_name", "dataset_fold_id"])

        ##############################
        ### STEP 2: AVERAGE OVER FOLDS

        if i == 0:
            agg_dict = {
                test_metric_col: ["median", "mean"],
                "time__train": ["median", "mean"],
                # "dataset_name": ["count"],
            }
        else:
            agg_dict = {
                test_metric_col: ["median", "mean"],
            }

        # aggregate over folds: take the mean & median performance over each fold
        agg_tuned_alg_perf = tuned_alg_perf.groupby(["alg_name", "dataset_name"]).agg(agg_dict).reset_index()

        # rename the multiindex cols
        new_cols = []
        for c in agg_tuned_alg_perf.columns:
            if c[1] == "":
                new_cols.append(c[0])
            else:
                new_cols.append("_".join(c))

        agg_tuned_alg_perf.columns = new_cols


        # define the target metric column, we will use this value for all plots
        result_col = test_metric_col + "_mean"

        # for each dataset, find the min and max metrics over all tuned algs
        overall_bounds = agg_tuned_alg_perf.groupby("dataset_name").agg({result_col: ["min", "max"]}).reset_index()

        # adjust the lower bound to be the metric for a tuned decision tree (this is the "baseline")
        # baseline_metric = agg_tuned_alg_perf.loc[agg_tuned_alg_perf["alg_name"] == "DecisionTree"].groupby("dataset_name").agg({result_col: "max"}).reset_index()

        # baseline_metric.columns = ["dataset_name", "baseline_metric"]
        
        # rename the multiindex cols
        new_cols = []
        for c in overall_bounds.columns:
            if c[1] == "":
                new_cols.append(c[0])
            else:
                new_cols.append("_".join(c))

        overall_bounds.columns = new_cols

        
        agg_tuned_alg_perf = agg_tuned_alg_perf.merge(overall_bounds, on="dataset_name", how="left")

        # add normalized metric
        agg_tuned_alg_perf.loc[:, "normalized_" + result_col] = (agg_tuned_alg_perf[result_col] - agg_tuned_alg_perf[result_col + "_min"]) / (agg_tuned_alg_perf[result_col + "_max"] - agg_tuned_alg_perf[result_col + "_min"])

        # rank all algs for each dataset
        ascending = False if objective_type == "maximize" else True
        
        # rank according to mean performance over all folds
        # do this both with tabpfn (only for the datasets with tabpfn) and without
        agg_method = "mean"

        # 1) with tabpfn. keep only datasets where tabpfn has a result
        tabpfn_datasets = agg_tuned_alg_perf[agg_tuned_alg_perf["alg_name"] == "TabPFNModel"]["dataset_name"].unique()
        tabpfn_df = agg_tuned_alg_perf.loc[agg_tuned_alg_perf["dataset_name"].isin(tabpfn_datasets), :]

        tabpfn_df.loc[:, f"{metric}_rank_{agg_method}"]  = \
            tabpfn_df.groupby(["dataset_name"])[test_metric_col + "_" + agg_method].rank(method="min", ascending=ascending).values
        
        # 1) without tabpfn. remove all tabpfn results
        non_tabpfn_df = agg_tuned_alg_perf.loc[agg_tuned_alg_perf["alg_name"] != "TabPFNModel", :]

        non_tabpfn_df.loc[:, f"{metric}_rank_{agg_method}"]  = \
            non_tabpfn_df.groupby(["dataset_name"])[test_metric_col + "_" + agg_method].rank(method="min", ascending=ascending).values


        # keep these cols to merge
        merge_cols = [
            "alg_name", 
            "dataset_name",
            "normalized_" + result_col,
            f"{metric}_rank_mean",
            result_col + "_min",
            result_col + "_max"
        ]

        if i == 0:
            tabpfn_tuned_agg_df = tabpfn_df.copy()
            non_tabpfn_tuned_agg_df = non_tabpfn_df.copy()
        else:
            tabpfn_tuned_agg_df = tabpfn_tuned_agg_df.merge(tabpfn_df[merge_cols], on=["alg_name", "dataset_name"])
            non_tabpfn_tuned_agg_df = non_tabpfn_tuned_agg_df.merge(non_tabpfn_df[merge_cols], on=["alg_name", "dataset_name"])

    # save results

    if drop_default:
        tabpfn_agg_df_no_default = tabpfn_tuned_agg_df.copy()
        non_tabpfn_agg_df_no_default = non_tabpfn_tuned_agg_df.copy()
        tabpfn_agg_df_no_default.to_csv("./results/tuned_aggregated_results_tabpfn.csv")
        non_tabpfn_agg_df_no_default.to_csv("./results/tuned_aggregated_results_non_tabpfn.csv")

        tuned_fold_df_no_default = fold_tuned_df.copy()
        tuned_fold_df_no_default.to_csv("./results/tuned_fold_results.csv")
       
    else:
        tabpfn_agg_df_with_default = tabpfn_tuned_agg_df.copy()
        non_tabpfn_agg_df_with_default = non_tabpfn_tuned_agg_df.copy()
        tabpfn_agg_df_with_default.to_csv("./results/tuned_aggregated_results_tabpfn_with_default.csv")
        non_tabpfn_agg_df_with_default.to_csv("./results/tuned_aggregated_results_non_tabpfn_with_default.csv")

        tuned_fold_df_with_default = fold_tuned_df.copy()
        tuned_fold_df_with_default.to_csv("./results/tuned_fold_results_with_default_hparams.csv")


In [44]:
tabpfn_agg_df_no_default.head()

Unnamed: 0,alg_name,dataset_name,Accuracy__test_median,Accuracy__test_mean,time__train_median,time__train_mean,Accuracy__test_mean_min,Accuracy__test_mean_max,normalized_Accuracy__test_mean,Accuracy_rank_mean,normalized_F1__test_mean,F1_rank_mean,F1__test_mean_min,F1__test_mean_max,normalized_Log Loss__test_mean,Log Loss_rank_mean,Log Loss__test_mean_min,Log Loss__test_mean_max
0,CatBoost,openml__Australian__146818,0.869565,0.872464,1.34765,1.393643,0.711594,0.872464,1.0,1.0,1.0,1.0,0.711594,0.872464,0.0,1.0,0.3026771,0.75592
1,CatBoost,openml__LED-display-domain-7digit__125921,0.72,0.728,0.696405,1.113755,0.698,0.736,0.789474,4.0,0.740672,5.0,0.690748,0.731609,0.0,1.0,0.8274095,2.539521
2,CatBoost,openml__MiceProtein__146800,0.981481,0.980556,2.073627,3.307141,0.661111,0.998148,0.947802,8.0,0.954088,8.0,0.614618,0.998148,0.076744,10.0,0.01198685,1.380437
3,CatBoost,openml__acute-inflammations__10089,1.0,1.0,0.202706,0.199364,0.6,1.0,1.0,1.0,1.0,1.0,0.6,1.0,0.000594,8.0,9.992007e-16,1.774646
4,CatBoost,openml__analcatdata_authorship__3549,0.976331,0.97979,1.263401,1.404955,0.936989,0.998824,0.692186,19.0,0.683698,19.0,0.933857,0.998823,0.041115,10.0,0.007473539,0.909328


In [45]:
# sanity check..
# result_df_dict["Accuracy"][result_df_dict["Accuracy"]["dataset_name"] == "openml__Amazon_employee_access__34539"]
# tuned_agg_df_with_default[(tuned_agg_df_with_default["dataset_name"] == "openml__Amazon_employee_access__34539") & tuned_agg_df_with_default["alg_name"].str.contains("CatBoost")]

non_tabpfn_agg_df_no_default[(non_tabpfn_agg_df_no_default["dataset_name"] == "openml__ada_agnostic__3896") & non_tabpfn_agg_df_no_default["alg_name"].str.contains("CatBoost")]
# fold_tuned_df[fold_tuned_df["dataset_fold_id"] == "openml__Amazon_employee_access__34539__fold_1"]

Unnamed: 0,alg_name,dataset_name,Accuracy__test_median,Accuracy__test_mean,time__train_median,time__train_mean,Accuracy__test_mean_min,Accuracy__test_mean_max,normalized_Accuracy__test_mean,Accuracy_rank_mean,normalized_F1__test_mean,F1_rank_mean,F1__test_mean_min,F1__test_mean_max,normalized_Log Loss__test_mean,Log Loss_rank_mean,Log Loss__test_mean_min,Log Loss__test_mean_max
18,CatBoost,openml__ada_agnostic__3896,0.855422,0.85423,0.40639,0.986107,0.791756,0.857521,0.94996,2.0,0.94996,2.0,0.791756,0.857521,0.000755,2.0,0.321328,0.469825


# Aggregate results

In [46]:
###### WITH default hparams treated as algs, and NO tabpfn

# best, worst, and average performance for each alg, over all datasets
for metric in metric_list:

    overall_ranks = non_tabpfn_agg_df_with_default.groupby("alg_name").agg(
        {
            f"{metric}_rank_mean": ["min", "max", "mean", "count"],
            f"normalized_{metric}__test_mean": "mean",
        }
    ).reset_index().sort_values([(f"{metric}_rank_mean", "mean")])

    # format min/max rank columns to be ints

    overall_ranks.loc[:, "count"] = overall_ranks.loc[:, (f"{metric}_rank_mean", "count")].astype(int)
    overall_ranks.drop(columns=(f"{metric}_rank_mean", "count"), inplace=True)

    # overall_ranks.loc[:, "alg_name"] = overall_ranks.loc[:, "alg_name"].apply(lambda x: "\rot{" + x + "}")
    overall_ranks.loc[:, (f"{metric}_rank_mean", "min")] = overall_ranks.loc[:, (f"{metric}_rank_mean", "min")].astype(int)
    overall_ranks.loc[:, (f"{metric}_rank_mean", "max")] = overall_ranks.loc[:, (f"{metric}_rank_mean", "max")].astype(int)

    overall_ranks.loc[:, (f"{metric}_rank_mean", "mean")] = overall_ranks.loc[:, (f"{metric}_rank_mean", "mean")].round(2)
    
    overall_ranks.loc[:, (f"normalized_{metric}__test_mean", "mean")] = overall_ranks.loc[:,(f"normalized_{metric}__test_mean", "mean")].round(2)

    print(f"metric: {metric}")
    final_table = overall_ranks.set_index("alg_name")
    print(final_table)

    # save to csv
    final_table.to_csv(f"./results/rank_tables_with_untuned_{metric}.csv", index=True)

    # save to latex
    final_table.to_latex(f"./results/rank_tables_with_untuned_{metric}.tex", index=True, escape=False)


    print("\n")

metric: Accuracy
                           Accuracy_rank_mean             \
                                          min max   mean   
alg_name                                                   
CatBoost                                    1  36   8.20   
XGBoost                                     1  36   9.25   
CatBoost_default                            1  33  11.23   
XGBoost_default                             1  39  11.26   
rtdl_ResNet                                 1  37  11.29   
LightGBM                                    1  38  11.60   
SAINT                                       1  35  12.06   
NODE                                        1  37  12.47   
LightGBM_default                            1  38  12.52   
RandomForest                                1  37  13.48   
rtdl_FTTransformer                          1  31  13.52   
rtdl_ResNet_default                         1  41  13.98   
SVM                                         1  35  13.99   
NODE_default           

In [47]:
###### NO default hparams, and NO tabpfn

# remove tabpfn due to dataset restrictions - this method has its own table

# best, worst, and average performance for each alg, over all datasets
for metric in metric_list:

    overall_ranks = non_tabpfn_agg_df_no_default.groupby("alg_name").agg(
        {
            f"{metric}_rank_mean": ["min", "max", "mean", "count"],
            f"normalized_{metric}__test_mean": "mean",
        }
    ).reset_index().sort_values([(f"{metric}_rank_mean", "mean")])

    # format min/max rank columns to be ints

    overall_ranks.loc[:, "count"] = overall_ranks.loc[:, (f"{metric}_rank_mean", "count")].astype(int)
    overall_ranks.drop(columns=(f"{metric}_rank_mean", "count"), inplace=True)

    # overall_ranks.loc[:, "alg_name"] = overall_ranks.loc[:, "alg_name"].apply(lambda x: "\rot{" + x + "}")
    overall_ranks.loc[:, (f"{metric}_rank_mean", "min")] = overall_ranks.loc[:, (f"{metric}_rank_mean", "min")].astype(int)
    overall_ranks.loc[:, (f"{metric}_rank_mean", "max")] = overall_ranks.loc[:, (f"{metric}_rank_mean", "max")].astype(int)

    overall_ranks.loc[:, (f"{metric}_rank_mean", "mean")] = overall_ranks.loc[:, (f"{metric}_rank_mean", "mean")].round(2)
    
    overall_ranks.loc[:, (f"normalized_{metric}__test_mean", "mean")] = overall_ranks.loc[:,(f"normalized_{metric}__test_mean", "mean")].round(2)

    print(f"metric: {metric}")
    final_table = overall_ranks.set_index("alg_name")
    print(final_table)

    # save to csv
    final_table.to_csv(f"./results/rank_tables_{metric}.csv", index=True)

    # save to latex
    final_table.to_latex(f"./results/rank_tables_{metric}.tex", index=True, escape=False)


    print("\n")

metric: Accuracy
                   Accuracy_rank_mean             \
                                  min max   mean   
alg_name                                           
CatBoost                            1  18   5.21   
XGBoost                             1  19   5.61   
rtdl_ResNet                         1  20   6.85   
LightGBM                            1  20   6.96   
SAINT                               1  19   7.15   
NODE                                1  20   7.48   
RandomForest                        1  19   8.09   
rtdl_FTTransformer                  1  17   8.10   
SVM                                 1  19   8.31   
DANet                               1  20   8.65   
rtdl_MLP                            1  19   9.57   
DeepFM                              1  21  10.69   
TabNet                              1  21  11.04   
MLP                                 1  20  11.37   
DecisionTree                        1  21  11.41   
TabTransformer                      1  21  11.4

In [49]:
###### NO default, WITH tabpfn

# best, worst, and average performance for each alg, over all datasets
for metric in metric_list:

    overall_ranks = tabpfn_agg_df_no_default.groupby("alg_name").agg(
        {
            f"{metric}_rank_mean": ["min", "max", "mean", "count"],
            f"normalized_{metric}__test_mean": "mean",
        }
    ).reset_index().sort_values([(f"{metric}_rank_mean", "mean")])

    # format min/max rank columns to be ints

    overall_ranks.loc[:, "count"] = overall_ranks.loc[:, (f"{metric}_rank_mean", "count")].astype(int)
    overall_ranks.drop(columns=(f"{metric}_rank_mean", "count"), inplace=True)

    # overall_ranks.loc[:, "alg_name"] = overall_ranks.loc[:, "alg_name"].apply(lambda x: "\rot{" + x + "}")
    overall_ranks.loc[:, (f"{metric}_rank_mean", "min")] = overall_ranks.loc[:, (f"{metric}_rank_mean", "min")].astype(int)
    overall_ranks.loc[:, (f"{metric}_rank_mean", "max")] = overall_ranks.loc[:, (f"{metric}_rank_mean", "max")].astype(int)

    overall_ranks.loc[:, (f"{metric}_rank_mean", "mean")] = overall_ranks.loc[:, (f"{metric}_rank_mean", "mean")].round(2)
    
    overall_ranks.loc[:, (f"normalized_{metric}__test_mean", "mean")] = overall_ranks.loc[:,(f"normalized_{metric}__test_mean", "mean")].round(2)

    print(f"metric: {metric}")
    final_table = overall_ranks.set_index("alg_name")
    print(final_table)

    # save to csv
    final_table.to_csv(f"./results/rank_tables_{metric}-tabpfn.csv", index=True)

    # save to latex
    final_table.to_latex(f"./results/rank_tables_{metric}-tabpfn.tex", index=True, escape=False)


    print("\n")

metric: Accuracy
                   Accuracy_rank_mean             \
                                  min max   mean   
alg_name                                           
TabPFNModel                         1  20   4.92   
CatBoost                            1  19   5.68   
rtdl_ResNet                         1  21   7.48   
SAINT                               1  20   7.81   
rtdl_FTTransformer                  1  18   8.32   
RandomForest                        1  19   8.37   
NODE                                1  21   8.48   
XGBoost                             1  20   8.86   
DeepFM                              1  22   9.43   
rtdl_MLP                            1  19  10.10   
SVM                                 1  20  10.24   
LinearModel                         1  21  10.65   
LightGBM                            1  21  10.89   
DANet                               1  21  11.40   
MLP                                 1  20  12.19   
TabTransformer                      1  22  12.3

## UNDER CONSTRUCTION: spaghetti plot - relative performance over different datasets.

In [59]:
tabpfn_agg_df_no_default[tabpfn_agg_df_no_default["alg_name"] == "TabPFNModel"])

Unnamed: 0,alg_name,dataset_name,Accuracy__test_median,Accuracy__test_mean,time__train_median,time__train_mean,Accuracy__test_mean_min,Accuracy__test_mean_max,normalized_Accuracy__test_mean,Accuracy_rank_mean,normalized_F1__test_mean,F1_rank_mean,F1__test_mean_min,F1__test_mean_max,normalized_Log Loss__test_mean,Log Loss_rank_mean,Log Loss__test_mean_min,Log Loss__test_mean_max
886,TabPFNModel,openml__Australian__146818,0.855072,0.866667,0.000661,0.00067,0.711594,0.872464,0.963964,2.0,0.963964,2.0,0.711594,0.872464,0.080464,3.0,0.3026771,0.75592
887,TabPFNModel,openml__LED-display-domain-7digit__125921,0.68,0.698,0.000374,0.000405,0.698,0.736,0.0,19.0,0.0,19.0,0.690748,0.731609,0.131496,11.0,0.8274095,2.539521
888,TabPFNModel,openml__MiceProtein__146800,1.0,0.998148,0.000434,0.00047,0.661111,0.998148,1.0,1.0,1.0,1.0,0.614618,0.998148,0.012288,3.0,0.01198685,1.380437
889,TabPFNModel,openml__acute-inflammations__10089,1.0,1.0,0.000584,0.000539,0.6,1.0,1.0,1.0,1.0,1.0,0.6,1.0,0.000312,6.0,9.992007e-16,1.774646
890,TabPFNModel,openml__analcatdata_authorship__3549,1.0,0.998824,0.000381,0.000415,0.936989,0.998824,1.0,1.0,1.0,1.0,0.933857,0.998823,0.002487,2.0,0.007473539,0.909328
891,TabPFNModel,openml__analcatdata_boxing1__3540,0.666667,0.675,0.000364,0.000402,0.575,0.841667,0.375,10.0,0.375,10.0,0.575,0.841667,0.069648,9.0,0.4475764,3.317489
892,TabPFNModel,openml__analcatdata_chlamydia__3739,0.9,0.89,0.000387,0.000425,0.74,0.94,0.75,7.0,0.75,7.0,0.74,0.94,0.003608,3.0,0.1642215,8.304631
893,TabPFNModel,openml__analcatdata_dmft__3560,0.207595,0.207009,0.000501,0.000529,0.171915,0.243354,0.4912514,13.0,0.0,19.0,0.134696,0.224598,0.022922,4.0,1.759146,2.0844
894,TabPFNModel,openml__anneal__2867,0.983333,0.97769,0.001344,0.001563,0.761698,0.99221,0.9370126,8.0,0.9615027,7.0,0.658539,0.990714,0.004914,4.0,0.03166911,8.094104
895,TabPFNModel,openml__autos__9,0.780952,0.76619,0.000537,0.000553,0.271667,0.815238,0.9097678,5.0,0.9002671,5.0,0.239959,0.79982,0.003839,3.0,0.6015017,18.28299


In [18]:
# which datasets to use?

result_df_dict["F1"][(result_df_dict["F1"]["alg_name"] == "CatBoost") & (result_df_dict["F1"]["metric_rank_mean"] < 2)]

KeyError: 'metric_rank_mean'

In [23]:
# openml__diabetes__37 <-- lm does well
# openml__isolet__3481
# openml__haberman__42
# openml__robert__168332

# openml__soybean__41 <-- rf does well
# openml__vowel__3022
# openml__guillermo__168337

# openml__cmc__23 <-- mlp does well
# openml__CIFAR_10__167124
# openml__Fashion-MNIST__146825
# openml__Internet-Advertisements__167125	
# openml__dilbert__168909

# openml__Australian__146818 <-- catboost
# openml__APSFailure__168868
# openml__wdbc__9946
# openml__pc1__3918
# openml__eucalyptus__2079

In [24]:
plot_datasets = [
    "openml__diabetes__37",  # <-- lm does well
    "openml__isolet__3481",
    "openml__haberman__42",
    # "openml__robert__168332", # not enough successful algs
    "openml__soybean__41", #  <-- rf does well
    "openml__vowel__3022",
    # "openml__guillermo__168337", # not enough successful algs
    "openml__cmc__23", # <-- mlp does well
    # "openml__CIFAR_10__167124",  # not enough successful algs
    # "openml__Fashion-MNIST__146825",  # not enough successful algs
    "openml__Internet-Advertisements__167125",	
    "openml__dilbert__168909",
    "openml__Australian__146818",  #<-- catboost
    "openml__APSFailure__168868",
    "openml__wdbc__9946",
    "openml__pc1__3918",
    "openml__eucalyptus__2079",
]

# names to show on the plot
plot_dataset_names = [name[len("openml__"):].split("_")[0] for name in plot_datasets]

In [25]:
# number of results for each dataset
num_alg_per_dataset = result_df_dict["F1"].groupby("dataset_name")["alg_name"].count()
num_alg_per_dataset[num_alg_per_dataset < 10].sort_values()

dataset_name
openml__Devnagari-Script__167121                          5
openml__covertype__7593                                   5
openml__helena__168329                                    5
openml__CIFAR_10__167124                                  6
openml__albert__189356                                    6
openml__guillermo__168337                                 6
openml__Fashion-MNIST__146825                             7
openml__riccardo__168338                                  7
openml__robert__168332                                    7
openml__airlines__189354                                  8
openml__mnist_784__3573                                   8
openml__higgs__146606                                     9
openml__jungle_chess_2pcs_raw_endgame_complete__167119    9
openml__numerai28.6__167120                               9
openml__skin-segmentation__9965                           9
openml__sylvine__168912                                   9
Name: alg_name, dtype: int6

In [26]:
plot_algs = agg_tuned_alg_perf["alg_name"].unique()

# gather data for the spaghetti plot
data = dict()
for i_metric, metric_name in enumerate(metric_list):
    data[metric_name] = dict()
    for alg in plot_algs:
            data[metric_name][alg] = []
            for dataset in plot_datasets:
                vals = result_df_dict[metric_name].loc[(result_df_dict[metric_name]["alg_name"] == alg) & (result_df_dict[metric_name]["dataset_name"] == dataset), f"normalized_{metric_name}__test_mean"].values
                if len(vals) != 1:
                    print(f"there's an issue with {alg}-{dataset}-{metric_name}")
                    print(vals)
                    val = None
                else:
                    val = vals[0]
                data[metric_name][alg].append(val)
        

there's an issue with CatBoost-openml__isolet__3481-Accuracy
[]
there's an issue with LightGBM-openml__dilbert__168909-Accuracy
[]
there's an issue with RandomForest-openml__haberman__42-Accuracy
[]
there's an issue with SVM-openml__soybean__41-Accuracy
[]
there's an issue with CatBoost-openml__isolet__3481-F1
[]
there's an issue with LightGBM-openml__dilbert__168909-F1
[]
there's an issue with RandomForest-openml__haberman__42-F1
[]
there's an issue with SVM-openml__soybean__41-F1
[]
there's an issue with CatBoost-openml__isolet__3481-Log Loss
[]
there's an issue with LightGBM-openml__dilbert__168909-Log Loss
[]
there's an issue with RandomForest-openml__haberman__42-Log Loss
[]
there's an issue with SVM-openml__soybean__41-Log Loss
[]


In [1]:
### plotting kwargs

plot_alg_map = {
    "XGBoost": {
        "name": "XGBoost",
        "plt-kwargs": {"marker":"x", "color":"r", "linestyle":"--"}
    },
    "CatBoost": {
        "name": "CatBoost",
        "plt-kwargs": {"marker":"+", "color":"r", "linestyle":"--"}
    },
    "LightGBM": {
        "name": "LightGBM",
        "plt-kwargs": {"marker":"d", "color":"r", "linestyle":"--"}
    },
    "SVM": {
        "name": "SVM",
        "plt-kwargs": {"marker":"v", "color":"black", "linestyle":"-"}
    },
    "KNN": {
        "name": "KNN",
        "plt-kwargs": {"marker":"^", "color":"black", "linestyle":"-"}
    },
    "DecisionTree": {
        "name": "DecisionTree",
        "plt-kwargs": {"marker":">", "color":"black", "linestyle":"-"}
    },
    "RandomForest": {
        "name": "RandomForest",
        "plt-kwargs": {"marker":"P", "color":"black", "linestyle":"-"}
    },
    "LinearModel": {
        "name": "LinearModel",
        "plt-kwargs": {"marker":"<", "color":"black", "linestyle":"-"}
    },
    "TabNet": {
        "name": "TabNet",
        "plt-kwargs": {"marker":"X", "color":"b", "linestyle":":"}
    },
    "MLP": {
        "name": "MLP",
        "plt-kwargs": {"marker":"o", "color":"b", "linestyle":":"}
    },
    "VIME": {
        "name": "VIME",
        "plt-kwargs": {"marker":"P", "color":"b", "linestyle":":"}
    },
}

plot_algs = plot_alg_map.keys()

In [2]:
import numpy as np
fig, ax = plt.subplots(len(metric_list), 1, sharex=True, figsize=(8, 5))

for i, metric in enumerate(metric_list):
    for alg in plot_algs:    
        ax[i].plot(data[metric][alg], label=alg, markersize=7, **plot_alg_map[alg]["plt-kwargs"])
    ax[i].set_ylabel(metric)

    ax[i].set_xticks(np.arange(len(plot_dataset_names)))
    ax[i].set_xticklabels(plot_dataset_names, rotation=-35, ha='left', rotation_mode='anchor')

plt.tight_layout()
plt.subplots_adjust(hspace=0.08)

plt.legend(loc="upper center", bbox_to_anchor=(0.5, 3.6), ncol=6, fontsize="small")
plt.savefig("./results/performance_spaghetti.pdf", bbox_inches='tight')
plt.show()


NameError: name 'plt' is not defined