# Run an initial analysis of results and produce aggregated results files

In [5]:
import pandas as pd
import itertools
import matplotlib.pyplot as plt
from IPython.display import display_html

pd.options.display.max_rows = 400
from metadata_utils import get_metadata, get_tuned_alg_perf, process_metafeatures, compute_feature_corrs

In [6]:
dataset_version = ""

# For choosing metafeatures
filter_families = [
    'general',
    'statistical',
    'info-theory'
]

metadataset_df, metafeatures_df = get_metadata(dataset_version)

metafeatures_processed = process_metafeatures(metafeatures_df, filter_families=filter_families)
metafeatures_df = metafeatures_processed



  metafeatures_processed = metafeatures_df.fillna(metafeatures_df.median())


In [7]:
# keep only binary and classification datasets
metadataset_df["target_type"].value_counts()

metadataset_df = metadataset_df.loc[metadataset_df["target_type"].isin(["binary", "classification"]), :]

In [8]:
# check that all dataset-alg pairs have results for all 10 folds
print(metadataset_df.groupby(["alg_name", "dataset_name"]).agg({"dataset_fold_id": lambda x: len(set(x))}))

                                                   dataset_fold_id
alg_name    dataset_name                                          
CatBoost    openml__APSFailure__168868                          10
            openml__Amazon_employee_access__34539               10
            openml__Australian__146818                          10
            openml__Bioresponse__9910                           10
            openml__Census-Income__168340                       10
...                                                            ...
rtdl_ResNet openml__sick__3021                                  10
            openml__soybean__41                                 10
            openml__splice__45                                  10
            openml__tic-tac-toe__49                             10
            openml__vowel__3022                                 10

[2624 rows x 1 columns]


## Remove datasets with few results

In [9]:
print(f"for each dataset: number of algs with results")
alg_counts = metadataset_df.groupby("dataset_name")["alg_name"].agg(lambda x: len(set(x))).sort_values()
print(alg_counts)

for each dataset: number of algs with results
dataset_name
openml__poker-hand__9890                                   4
openml__covertype__7593                                    6
openml__Devnagari-Script__167121                           6
openml__albert__189356                                     7
openml__helena__168329                                     7
openml__CIFAR_10__167124                                   8
openml__walking-activity__9945                             8
openml__Fashion-MNIST__146825                              9
openml__Census-Income__168340                             10
openml__guillermo__168337                                 10
openml__jungle_chess_2pcs_raw_endgame_complete__167119    10
openml__ldpa__9974                                        10
openml__airlines__189354                                  10
openml__riccardo__168338                                  10
openml__robert__168332                                    10
openml__mnist_784__3573   

In [10]:
keep_datasets = list(alg_counts[alg_counts >= 10].index)
drop_datasets = alg_counts[alg_counts < 10]

print(f"dropping {len(drop_datasets)} datasets:")
print(drop_datasets)

dropping 8 datasets:
dataset_name
openml__poker-hand__9890            4
openml__covertype__7593             6
openml__Devnagari-Script__167121    6
openml__albert__189356              7
openml__helena__168329              7
openml__CIFAR_10__167124            8
openml__walking-activity__9945      8
openml__Fashion-MNIST__146825       9
Name: alg_name, dtype: int64


In [11]:
print("for each alg: number of datasets with results")
dataset_counts = metadataset_df.groupby("alg_name")["dataset_name"].agg(lambda x: len(set(x))).sort_values()
print(dataset_counts)

for each alg: number of datasets with results
alg_name
rtdl_FTTransformer     15
rtdl_MLP               28
rtdl_ResNet            31
SAINT                  45
DANet                  76
NAM                    78
DeepFM                 89
TabTransformer        124
NODE                  138
SVM                   143
VIME                  163
STG                   164
CatBoost              165
LightGBM              165
KNN                   167
LinearModel           168
TabNet                168
RandomForest          173
XGBoost               174
DecisionTree          175
MLP                   175
Name: dataset_name, dtype: int64


We will keep all algs, regardless of how many datasets they have results for.

In [12]:
metadataset_df = metadataset_df.loc[metadataset_df["dataset_name"].isin(keep_datasets), :]

# Get tuned algorithms for a given metric

Report the average & median test performance, over all folds. Note that each alg is tuned for each fold separately.

In [13]:
metric_list = [
    "Accuracy",
    "F1",
    "Log Loss",
]

obj_type_list = [
    "maximize",
    "maximize",
    "minimize",
]
result_df_dict = {}



In [14]:
tuned_result_dfs = {}
for i, (metric, objective_type) in enumerate(zip(metric_list, obj_type_list)):

    test_metric_col = metric + "__test"

    tuned_alg_perf = get_tuned_alg_perf(metadataset_df, metric=metric)
    # NOTE: this "tunes" each algorithm for each training fold separately. so each of the 10 folds might use different hparams.
    tuned_result_dfs[metric] = tuned_alg_perf

    # # make sure there are at most 10 rows for each alg-dataset pair (one row per split)
    # print("max number of rows for each tuned alg-dataset pair:")
    # print(tuned_alg_perf.groupby(["alg_name", "dataset_name"])["dataset_name"].count().max())

    if i == 0:
        agg_dict = {
            test_metric_col: ["median", "mean"],
            "time__train": ["median", "mean"],
            "dataset_name": ["count"],
        }
    else:
        agg_dict = {
            test_metric_col: ["median", "mean"],
        }

    # aggregate over folds: take the mean & median performance over each fold
    agg_tuned_alg_perf = tuned_alg_perf.groupby(["alg_name", "dataset_name"]).agg(agg_dict).reset_index()

    # rename the multiindex cols
    new_cols = []
    for c in agg_tuned_alg_perf.columns:
        if c[1] == "":
            new_cols.append(c[0])
        else:
            new_cols.append("_".join(c))

    agg_tuned_alg_perf.columns = new_cols


    # define the target metric column, we will use this value for all plots
    result_col = test_metric_col + "_mean"

    # for each dataset, find the min and max metrics over all tuned algs
    overall_bounds = agg_tuned_alg_perf.groupby("dataset_name").agg({result_col: ["min", "max"]}).reset_index()

    # rename the multiindex cols
    new_cols = []
    for c in overall_bounds.columns:
        if c[1] == "":
            new_cols.append(c[0])
        else:
            new_cols.append("_".join(c))

    overall_bounds.columns = new_cols

    agg_tuned_alg_perf = agg_tuned_alg_perf.merge(overall_bounds, on="dataset_name", how="left")

    # add normalized metric
    agg_tuned_alg_perf.loc[:, "normalized_" + result_col] = (agg_tuned_alg_perf[result_col] - agg_tuned_alg_perf[result_col + "_min"]) / (agg_tuned_alg_perf[result_col + "_max"] - agg_tuned_alg_perf[result_col + "_min"])

    # rank all algs for each dataset
    ascending = False if objective_type == "maximize" else True
    
    # choose the mean or median
    for agg_method in ["mean", "median"]:
        agg_tuned_alg_perf.loc[:, f"{metric}_rank" + "_" + agg_method] = agg_tuned_alg_perf.groupby(["dataset_name"])[test_metric_col + "_" + agg_method].rank(method="min", ascending=ascending).values

    if i == 0:
        tuned_agg_df = agg_tuned_alg_perf.copy()
    else:
        tuned_agg_df = tuned_agg_df.merge(agg_tuned_alg_perf, on=["alg_name", "dataset_name"])

    result_df_dict[metric] = agg_tuned_alg_perf.copy()

In [15]:
# sanity check..
# result_df_dict["Accuracy"][result_df_dict["Accuracy"]["dataset_name"] == "openml__Amazon_employee_access__34539"]
tuned_agg_df[tuned_agg_df["dataset_name"] == "openml__Amazon_employee_access__34539"]

Unnamed: 0,alg_name,dataset_name,Accuracy__test_median,Accuracy__test_mean,time__train_median,time__train_mean,dataset_name_count,Accuracy__test_mean_min,Accuracy__test_mean_max,normalized_Accuracy__test_mean,...,normalized_F1__test_mean,F1_rank_mean,F1_rank_median,Log Loss__test_median,Log Loss__test_mean,Log Loss__test_mean_min,Log Loss__test_mean_max,normalized_Log Loss__test_mean,Log Loss_rank_mean,Log Loss_rank_median
1,CatBoost,openml__Amazon_employee_access__34539,0.946903,0.947359,1.708439,1.729567,10,0.930422,0.95157,0.800861,...,0.800861,4.0,3.0,0.156969,0.156347,0.155615,0.364447,0.003507,2.0,2.0
161,DANet,openml__Amazon_employee_access__34539,0.94202,0.942171,396.439996,607.382934,10,0.930422,0.95157,0.555551,...,0.555551,10.0,10.0,0.218345,0.212911,0.155615,0.364447,0.274366,9.0,11.0
238,DecisionTree,openml__Amazon_employee_access__34539,0.942316,0.942232,0.284152,0.344668,10,0.930422,0.95157,0.558436,...,0.558436,9.0,8.0,0.215871,0.219248,0.155615,0.364447,0.30471,10.0,9.0
406,DeepFM,openml__Amazon_employee_access__34539,0.929966,0.930422,42.264218,42.934775,10,0.930422,0.95157,0.0,...,0.0,17.0,17.0,0.332668,0.331231,0.155615,0.364447,0.840944,16.0,16.0
495,KNN,openml__Amazon_employee_access__34539,0.94202,0.941957,0.140002,0.135941,10,0.930422,0.95157,0.545448,...,0.545448,16.0,10.0,0.363927,0.364447,0.155615,0.364447,1.0,17.0,17.0
658,LightGBM,openml__Amazon_employee_access__34539,0.951022,0.95157,4.414482,3.945315,10,0.930422,0.95157,1.0,...,1.0,1.0,1.0,0.156657,0.155615,0.155615,0.364447,0.0,1.0,1.0
821,LinearModel,openml__Amazon_employee_access__34539,0.94202,0.94211,0.154202,0.170204,10,0.930422,0.95157,0.552664,...,0.552664,11.0,10.0,0.235868,0.235038,0.155615,0.364447,0.380322,15.0,15.0
986,MLP,openml__Amazon_employee_access__34539,0.94202,0.942049,64.32021,63.266802,10,0.930422,0.95157,0.549778,...,0.549778,15.0,10.0,0.2178,0.220588,0.155615,0.364447,0.311127,12.0,10.0
1152,NAM,openml__Amazon_employee_access__34539,0.94202,0.94211,264.750448,261.523752,10,0.930422,0.95157,0.552664,...,0.552664,11.0,10.0,0.222842,0.224779,0.155615,0.364447,0.331194,14.0,14.0
1369,RandomForest,openml__Amazon_employee_access__34539,0.943851,0.944002,2.977825,2.553213,10,0.930422,0.95157,0.642128,...,0.642128,7.0,6.0,0.172221,0.171875,0.155615,0.364447,0.077862,7.0,7.0


In [16]:
# write tuned df to file
tuned_agg_df.to_csv("./tuned_aggregated_results.csv")

## Difference between best neural and best non-neural method

In [17]:
# now tune by algorithm type. first define the type as "neural" or "non-neural"
neural_algs = [
    "MLP",
    "TabNet",
    "VIME",
    "TabTransformer",
    "NODE",
    "STG",
    "NAM",
    "DeepFM",
    "SAINT",
    "DANet",
    "rtdl_MLP",
    "rtdl_ResNet",
    "rtdl_FTTransformer",
]

metadataset_df.loc[:, "alg_type"] = "non-neural"
metadataset_df.loc[metadataset_df["alg_name"].isin(neural_algs), "alg_type"] = "neural"

tuned_df = get_tuned_alg_perf(metadataset_df, metric=metric, group_col="alg_type")



In [18]:
# for each dataset fold, get difference between tuned neural and non-neural method (neural - non-neural)
neural_non_neural_comparison = pd.pivot(tuned_df, index="dataset_fold_id", columns=["alg_type"], values=["Accuracy__test", "F1__test", "MSE__test", "Log Loss__test", "alg_name", "time__train", "time__test"])
print(neural_non_neural_comparison.head())

                                   Accuracy__test             F1__test  \
alg_type                                   neural non-neural    neural   
dataset_fold_id                                                          
openml__APSFailure__168868__fold_0       0.992763   0.995263  0.992763   
openml__APSFailure__168868__fold_1       0.988684   0.992237  0.988684   
openml__APSFailure__168868__fold_2       0.990395   0.993947  0.990395   
openml__APSFailure__168868__fold_3       0.992368   0.995526  0.992368   
openml__APSFailure__168868__fold_4       0.991184   0.995789  0.991184   

                                              MSE__test             \
alg_type                           non-neural    neural non-neural   
dataset_fold_id                                                      
openml__APSFailure__168868__fold_0   0.995263       NaN        NaN   
openml__APSFailure__168868__fold_1   0.992237       NaN        NaN   
openml__APSFailure__168868__fold_2   0.993947       NaN  

In [24]:
# save the differences between neural and non-neural algs

# first rename the multiindex cols
new_cols = []
for c in overall_bounds.columns:
    if c[1] == "":
        new_cols.append(c[0])
    else:
        new_cols.append("_".join(c))

neural_non_neural_comparison.to_csv("./neural_non_neural_comparison.csv")

In [22]:
neural_non_neural_comparison.reset_index()

Unnamed: 0_level_0,dataset_fold_id,Accuracy__test,Accuracy__test,F1__test,F1__test,MSE__test,MSE__test,Log Loss__test,Log Loss__test,alg_name,alg_name,time__train,time__train,time__test,time__test
alg_type,Unnamed: 1_level_1,neural,non-neural,neural,non-neural,neural,non-neural,neural,non-neural,neural,non-neural,neural,non-neural,neural,non-neural
0,openml__APSFailure__168868__fold_0,0.992763,0.995263,0.992763,0.995263,,,0.019384,0.01648,TabTransformer,LightGBM,156.269556,25.718933,0.123314,0.100582
1,openml__APSFailure__168868__fold_1,0.988684,0.992237,0.988684,0.992237,,,0.030806,0.022713,TabTransformer,XGBoost,336.442136,0.940423,0.112383,0.056885
2,openml__APSFailure__168868__fold_2,0.990395,0.993947,0.990395,0.993947,,,0.026238,0.020097,TabTransformer,XGBoost,380.952997,1.18313,0.117252,0.066141
3,openml__APSFailure__168868__fold_3,0.992368,0.995526,0.992368,0.995526,,,0.024933,0.018053,TabTransformer,XGBoost,381.007164,1.730255,0.118603,0.082695
4,openml__APSFailure__168868__fold_4,0.991184,0.995789,0.991184,0.995789,,,0.030389,0.013555,TabTransformer,XGBoost,178.289103,2.699572,0.142411,0.085889
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1675,openml__yeast__145793__fold_5,0.653543,0.669291,0.651901,0.672491,,,0.85606,0.772932,TabNet,LightGBM,6.810951,0.212742,0.005667,0.000287
1676,openml__yeast__145793__fold_6,0.614173,0.669291,0.613821,0.669725,,,1.062902,0.822254,TabNet,CatBoost,19.589545,0.914298,0.019241,0.002471
1677,openml__yeast__145793__fold_7,0.637795,0.685039,0.632844,0.67822,,,0.983789,0.854731,STG,CatBoost,15.905001,0.625206,0.001296,0.000693
1678,openml__yeast__145793__fold_8,0.622047,0.677165,0.620988,0.672749,,,0.967557,0.879122,TabNet,LightGBM,17.10107,0.611468,0.020197,0.005634


# Aggregate results

In [44]:
# best, worst, and average performance for each alg, over all datasets
for metric in metric_list:

    overall_ranks = tuned_agg_df.groupby("alg_name").agg(
        {
            f"{metric}_rank_mean": ["min", "max", "mean", "count"],
        }
    ).reset_index().sort_values([(f"{metric}_rank_mean", "mean")])
    print(f"metric: {metric}")
    print(overall_ranks)

    # save to csv
    overall_ranks.to_csv(f"./results/rank_tables_{metric}.csv", index=False)

    print("\n")

metric: Accuracy
              alg_name Accuracy_rank_mean                       
                                      min   max       mean count
0             CatBoost                1.0  15.0   4.273292   161
17             XGBoost                1.0  17.0   4.601190   168
5             LightGBM                1.0  19.0   5.539877   163
9                 NODE                1.0  16.0   6.065217   138
10        RandomForest                1.0  18.0   6.574850   167
1                DANet                1.0  20.0   6.618421    76
20         rtdl_ResNet                1.0  18.0   6.655172    29
18  rtdl_FTTransformer                2.0  12.0   6.733333    15
13                 SVM                1.0  16.0   6.762238   143
11               SAINT                1.0  17.0   7.288889    45
3               DeepFM                1.0  20.0   8.752809    89
19            rtdl_MLP                3.0  17.0   9.000000    28
14              TabNet                1.0  21.0   9.018293   164
7       

## spaghetti plot - relative performance over different datasets.

In [22]:
# which datasets to use?

result_df_dict["F1"][(result_df_dict["F1"]["alg_name"] == "CatBoost") & (result_df_dict["F1"]["metric_rank_mean"] < 2)]

Unnamed: 0,alg_name,dataset_name,F1__test_median,F1__test_mean,time__train_median,time__train_mean,F1__test_mean_min,F1__test_mean_max,normalized_F1__test_mean,metric_rank_mean,metric_rank_median
2,CatBoost,openml__Australian__146818,0.869565,0.872464,1.34765,1.393643,0.711594,0.872464,1.0,1.0,1.0
13,CatBoost,openml__albert__189356,0.703379,0.704762,33.092527,35.15392,0.56336,0.704762,1.0,1.0,1.0
19,CatBoost,openml__bank-marketing__9899,0.890609,0.896706,1.097164,1.289392,0.883434,0.896706,1.0,1.0,4.0
23,CatBoost,openml__breast-cancer__145799,0.75,0.765764,0.860736,0.948556,0.671429,0.765764,1.0,1.0,2.0
24,CatBoost,openml__breast-w__15,0.971429,0.971366,0.67026,0.687874,0.951346,0.971366,1.0,1.0,1.0
27,CatBoost,openml__churn__167141,0.956,0.958,2.666251,2.517326,0.8596,0.958,1.0,1.0,2.0
32,CatBoost,openml__credit-approval__29,0.884058,0.875362,1.388361,1.51731,0.681159,0.875362,1.0,1.0,1.0
38,CatBoost,openml__dresses-sales__125920,0.63,0.62,0.77495,1.060216,0.542,0.62,1.0,1.0,1.0
42,CatBoost,openml__eucalyptus__2079,0.666711,0.660875,1.438742,3.717244,0.359997,0.660875,1.0,1.0,1.0
53,CatBoost,openml__jasmine__168911,0.817423,0.816006,1.930358,2.099163,0.780486,0.816006,1.0,1.0,1.0


In [23]:
# openml__diabetes__37 <-- lm does well
# openml__isolet__3481
# openml__haberman__42
# openml__robert__168332

# openml__soybean__41 <-- rf does well
# openml__vowel__3022
# openml__guillermo__168337

# openml__cmc__23 <-- mlp does well
# openml__CIFAR_10__167124
# openml__Fashion-MNIST__146825
# openml__Internet-Advertisements__167125	
# openml__dilbert__168909

# openml__Australian__146818 <-- catboost
# openml__APSFailure__168868
# openml__wdbc__9946
# openml__pc1__3918
# openml__eucalyptus__2079

In [24]:
plot_datasets = [
    "openml__diabetes__37",  # <-- lm does well
    "openml__isolet__3481",
    "openml__haberman__42",
    # "openml__robert__168332", # not enough successful algs
    "openml__soybean__41", #  <-- rf does well
    "openml__vowel__3022",
    # "openml__guillermo__168337", # not enough successful algs
    "openml__cmc__23", # <-- mlp does well
    # "openml__CIFAR_10__167124",  # not enough successful algs
    # "openml__Fashion-MNIST__146825",  # not enough successful algs
    "openml__Internet-Advertisements__167125",	
    "openml__dilbert__168909",
    "openml__Australian__146818",  #<-- catboost
    "openml__APSFailure__168868",
    "openml__wdbc__9946",
    "openml__pc1__3918",
    "openml__eucalyptus__2079",
]

# names to show on the plot
plot_dataset_names = [name[len("openml__"):].split("_")[0] for name in plot_datasets]

In [25]:
# number of results for each dataset
num_alg_per_dataset = result_df_dict["F1"].groupby("dataset_name")["alg_name"].count()
num_alg_per_dataset[num_alg_per_dataset < 10].sort_values()

dataset_name
openml__Devnagari-Script__167121                          5
openml__covertype__7593                                   5
openml__helena__168329                                    5
openml__CIFAR_10__167124                                  6
openml__albert__189356                                    6
openml__guillermo__168337                                 6
openml__Fashion-MNIST__146825                             7
openml__riccardo__168338                                  7
openml__robert__168332                                    7
openml__airlines__189354                                  8
openml__mnist_784__3573                                   8
openml__higgs__146606                                     9
openml__jungle_chess_2pcs_raw_endgame_complete__167119    9
openml__numerai28.6__167120                               9
openml__skin-segmentation__9965                           9
openml__sylvine__168912                                   9
Name: alg_name, dtype: int6

In [26]:
plot_algs = agg_tuned_alg_perf["alg_name"].unique()

# gather data for the spaghetti plot
data = dict()
for i_metric, metric_name in enumerate(metric_list):
    data[metric_name] = dict()
    for alg in plot_algs:
            data[metric_name][alg] = []
            for dataset in plot_datasets:
                vals = result_df_dict[metric_name].loc[(result_df_dict[metric_name]["alg_name"] == alg) & (result_df_dict[metric_name]["dataset_name"] == dataset), f"normalized_{metric_name}__test_mean"].values
                if len(vals) != 1:
                    print(f"there's an issue with {alg}-{dataset}-{metric_name}")
                    print(vals)
                    val = None
                else:
                    val = vals[0]
                data[metric_name][alg].append(val)
        

there's an issue with CatBoost-openml__isolet__3481-Accuracy
[]
there's an issue with LightGBM-openml__dilbert__168909-Accuracy
[]
there's an issue with RandomForest-openml__haberman__42-Accuracy
[]
there's an issue with SVM-openml__soybean__41-Accuracy
[]
there's an issue with CatBoost-openml__isolet__3481-F1
[]
there's an issue with LightGBM-openml__dilbert__168909-F1
[]
there's an issue with RandomForest-openml__haberman__42-F1
[]
there's an issue with SVM-openml__soybean__41-F1
[]
there's an issue with CatBoost-openml__isolet__3481-Log Loss
[]
there's an issue with LightGBM-openml__dilbert__168909-Log Loss
[]
there's an issue with RandomForest-openml__haberman__42-Log Loss
[]
there's an issue with SVM-openml__soybean__41-Log Loss
[]


In [1]:
### plotting kwargs

plot_alg_map = {
    "XGBoost": {
        "name": "XGBoost",
        "plt-kwargs": {"marker":"x", "color":"r", "linestyle":"--"}
    },
    "CatBoost": {
        "name": "CatBoost",
        "plt-kwargs": {"marker":"+", "color":"r", "linestyle":"--"}
    },
    "LightGBM": {
        "name": "LightGBM",
        "plt-kwargs": {"marker":"d", "color":"r", "linestyle":"--"}
    },
    "SVM": {
        "name": "SVM",
        "plt-kwargs": {"marker":"v", "color":"black", "linestyle":"-"}
    },
    "KNN": {
        "name": "KNN",
        "plt-kwargs": {"marker":"^", "color":"black", "linestyle":"-"}
    },
    "DecisionTree": {
        "name": "DecisionTree",
        "plt-kwargs": {"marker":">", "color":"black", "linestyle":"-"}
    },
    "RandomForest": {
        "name": "RandomForest",
        "plt-kwargs": {"marker":"P", "color":"black", "linestyle":"-"}
    },
    "LinearModel": {
        "name": "LinearModel",
        "plt-kwargs": {"marker":"<", "color":"black", "linestyle":"-"}
    },
    "TabNet": {
        "name": "TabNet",
        "plt-kwargs": {"marker":"X", "color":"b", "linestyle":":"}
    },
    "MLP": {
        "name": "MLP",
        "plt-kwargs": {"marker":"o", "color":"b", "linestyle":":"}
    },
    "VIME": {
        "name": "VIME",
        "plt-kwargs": {"marker":"P", "color":"b", "linestyle":":"}
    },
}

plot_algs = plot_alg_map.keys()

In [2]:
import numpy as np
fig, ax = plt.subplots(len(metric_list), 1, sharex=True, figsize=(8, 5))

for i, metric in enumerate(metric_list):
    for alg in plot_algs:    
        ax[i].plot(data[metric][alg], label=alg, markersize=7, **plot_alg_map[alg]["plt-kwargs"])
    ax[i].set_ylabel(metric)

    ax[i].set_xticks(np.arange(len(plot_dataset_names)))
    ax[i].set_xticklabels(plot_dataset_names, rotation=-35, ha='left', rotation_mode='anchor')

plt.tight_layout()
plt.subplots_adjust(hspace=0.08)

plt.legend(loc="upper center", bbox_to_anchor=(0.5, 3.6), ncol=6, fontsize="small")
plt.savefig("./results/performance_spaghetti.pdf", bbox_inches='tight')
plt.show()


NameError: name 'plt' is not defined