# Critical difference diagrams for average performance of each alg

**Notes:**
- since this uses a pairwise statistical test across algorithms, **we only use datasets where there is a result for every algorithm**
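- by default the cells below compare performance on each individual fold of a dataset (each `dataset_fold_id` is treated as its own "dataset"); each cell also contains commented-out code that switches to the average performance over all folds of a dataset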

In [1]:
from critical_difference import draw_cd_diagram
import pandas as pd
from pathlib import Path

# Create the directory for the CD plots up front; this is a no-op when it already exists.
output_folder = Path("./cd_plots")
output_folder.mkdir(parents=False, exist_ok=True)

## CD plot for accuracy - include all algs

In [2]:
metric_name = "Accuracy"

# --- assemble the input table for the CD plot ---

# Alternative input: the metric averaged over all folds of each dataset.
# To use it, uncomment these three lines and comment out the three per-fold lines below.
# agg_metadataset_df = pd.read_csv("./cleaned_results/tuned_aggregated_results.csv")
# metric_col = "Accuracy__test_mean"
# cd_df = agg_metadataset_df.loc[:, ["dataset_name", "alg_name", metric_col]].copy()

# Default input: per-fold results, so every dataset fold is treated as its own "dataset".
fold_metadataset_df = pd.read_csv("./cleaned_results/tuned_fold_results.csv")
metric_col = f"{metric_name}__test"
cd_df = fold_metadataset_df[["dataset_fold_id", "alg_name", metric_col]].copy()

# rename the three columns positionally to the generic names used by the rest of this cell
cd_df.columns = ["dataset_name", "classifier_name", "accuracy"]

# --- keep only datasets that have a result for every alg ---
# a dataset is incomplete when it has fewer distinct algs than the table as a whole
num_datasets = cd_df["dataset_name"].nunique(dropna=False)
num_algs = cd_df["classifier_name"].nunique(dropna=False)

num_algs_per_dataset = cd_df.groupby("dataset_name")["classifier_name"].apply(
    lambda names: len(set(names))
)
incomplete_datasets = [
    dataset
    for dataset, alg_count in num_algs_per_dataset.items()
    if alg_count < num_algs
]

print(f"total number of datasets: {num_datasets}")
print(f"num. datasets with incomplete results: {len(incomplete_datasets)}")

cd_df = cd_df[~cd_df["dataset_name"].isin(incomplete_datasets)]

print(f"after removing incomplete datasets, num datasets remaining: {cd_df['dataset_name'].nunique(dropna=False)}")

# --- draw the CD diagram and save it as a PDF named after the metric column ---
draw_cd_diagram(
    cd_df,
    alpha=0.05,
    fig_filename=f"./cd_plots/{metric_col}.pdf",
    title=metric_name,
)

total number of datasets: 1590
num. datasets with incomplete results: 550
after removing incomplete datasets, num datasets remaining: 1040
unique classifiers: ['CatBoost' 'DANet' 'DecisionTree' 'FTTransformer' 'KNN' 'LightGBM'
 'LinearModel' 'MLP' 'MLP-rtdl' 'NODE' 'RandomForest' 'ResNet' 'SAINT'
 'STG' 'SVM' 'TabNet' 'VIME' 'XGBoost']
classifiers tested on nb_max_datasets: ['CatBoost', 'DANet', 'DecisionTree', 'FTTransformer', 'KNN', 'LightGBM', 'LinearModel', 'MLP', 'MLP-rtdl', 'NODE', 'RandomForest', 'ResNet', 'SAINT', 'STG', 'SVM', 'TabNet', 'VIME', 'XGBoost']
CatBoost         51.0
DANet            23.0
DecisionTree     12.0
FTTransformer    28.0
KNN              10.0
LightGBM         66.0
LinearModel      23.0
MLP               7.0
MLP-rtdl          4.0
NODE             22.0
RandomForest     27.0
ResNet           24.0
SAINT            36.0
STG              19.0
SVM              45.0
TabNet           30.0
VIME              6.0
XGBoost          65.0
dtype: float64
average ranks: VIM

## CD Plot for Log Loss - include all algs

In [4]:
metric_name = "Log Loss"

# --- assemble the input table for the CD plot ---

# Alternative input: the metric averaged over all folds of each dataset.
# To use it, uncomment these three lines and comment out the three per-fold lines below.
# NOTE(review): this alternative used to hard-code "Accuracy__test_mean" (copy-pasted from the
# accuracy cell), which would have plotted accuracy under a "Log Loss" title. It now derives the
# column from metric_name; this assumes the aggregated file names its columns
# "<metric>__test_mean" -- verify against tuned_aggregated_results.csv before relying on it.
# agg_metadataset_df = pd.read_csv("./cleaned_results/tuned_aggregated_results.csv")
# metric_col = f"{metric_name}__test_mean"
# cd_df = agg_metadataset_df.loc[:, ["dataset_name", "alg_name", metric_col]].copy()

# Default input: per-fold results, so every dataset fold is treated as its own "dataset".
fold_metadataset_df = pd.read_csv("./cleaned_results/tuned_fold_results.csv")
metric_col = f"{metric_name}__test"
cd_df = fold_metadataset_df[["dataset_fold_id", "alg_name", metric_col]].copy()

# Log loss is lower-is-better, while the value column is handed over under the name "accuracy";
# negate it so that larger values mean better performance.
cd_df.loc[:, metric_col] = -1.0 * cd_df[metric_col]

# rename the three columns positionally to the generic names used by the rest of this cell
cd_df.columns = ["dataset_name", "classifier_name", "accuracy"]

# --- keep only datasets that have a result for every alg ---
# a dataset is incomplete when it has fewer distinct algs than the table as a whole
num_datasets = cd_df["dataset_name"].nunique(dropna=False)
num_algs = cd_df["classifier_name"].nunique(dropna=False)

num_algs_per_dataset = cd_df.groupby("dataset_name")["classifier_name"].apply(
    lambda names: len(set(names))
)
incomplete_datasets = [
    dataset
    for dataset, alg_count in num_algs_per_dataset.items()
    if alg_count < num_algs
]

print(f"total number of datasets: {num_datasets}")
print(f"num. datasets with incomplete results: {len(incomplete_datasets)}")

cd_df = cd_df[~cd_df["dataset_name"].isin(incomplete_datasets)]

print(f"after removing incomplete datasets, num datasets remaining: {cd_df['dataset_name'].nunique(dropna=False)}")

# --- draw the CD diagram and save it as a PDF named after the metric column ---
draw_cd_diagram(
    cd_df,
    alpha=0.05,
    fig_filename=f"./cd_plots/{metric_col}.pdf",
    title=metric_name,
)

total number of datasets: 1590
num. datasets with incomplete results: 550
after removing incomplete datasets, num datasets remaining: 1040
unique classifiers: ['CatBoost' 'DANet' 'DecisionTree' 'FTTransformer' 'KNN' 'LightGBM'
 'LinearModel' 'MLP' 'MLP-rtdl' 'NODE' 'RandomForest' 'ResNet' 'SAINT'
 'STG' 'SVM' 'TabNet' 'VIME' 'XGBoost']
classifiers tested on nb_max_datasets: ['CatBoost', 'DANet', 'DecisionTree', 'FTTransformer', 'KNN', 'LightGBM', 'LinearModel', 'MLP', 'MLP-rtdl', 'NODE', 'RandomForest', 'ResNet', 'SAINT', 'STG', 'SVM', 'TabNet', 'VIME', 'XGBoost']
CatBoost         127.0
DANet             36.0
DecisionTree     100.0
FTTransformer     64.0
KNN               14.0
LightGBM          78.0
LinearModel       41.0
MLP               14.0
MLP-rtdl          55.0
NODE              11.0
RandomForest      38.0
ResNet            76.0
SAINT            104.0
STG               24.0
SVM               65.0
TabNet            54.0
VIME               5.0
XGBoost          128.0
dtype: float64


## TBD: CD Plot comparing best alg in each class by log loss, using all datasets 

In [None]:
## WIP: CD plot comparing the best alg in each class by Log Loss.
## Only the data-loading statements are active so far; the rest of the cell is a commented-out
## draft copied from the "include all algs" Log Loss cell and still has to be adapted.
metric_name = "Log Loss"

# prepare df for CD plot

# uncomment the three lines below to use averaged performance over each dataset, rather than each fold
# NOTE(review): the draft previously hard-coded "Accuracy__test_mean" here (copy-pasted from the
# accuracy cell); the line below assumes the aggregated file names its columns
# "<metric>__test_mean" -- verify before use.
# agg_metadataset_df = pd.read_csv("./cleaned_results/tuned_aggregated_results.csv")
# metric_col = f"{metric_name}__test_mean"
# cd_df = agg_metadataset_df.loc[:, ["dataset_name", "alg_name", metric_col]].copy()

# to run this test using average performance over each dataset (rather than each fold), comment out the following three lines and uncomment the three above
# (the third per-fold line, the cd_df selection, is still commented out in the draft below)
fold_metadataset_df = pd.read_csv("./cleaned_results/tuned_fold_results.csv")
metric_col = f"{metric_name}__test"


# --- draft of the remaining steps (disabled until the per-class selection is implemented) ---
# cd_df = fold_metadataset_df.loc[:, ["dataset_fold_id", "alg_name", metric_col]].copy()

# # for log loss - use negative metric, so that larger values mean better performance
# cd_df.loc[:, metric_col] = -1.0 * cd_df[metric_col]

# cd_df.columns = ["dataset_name", "classifier_name", "accuracy"]

# # drop all datasets where we don't have results for all methods
# num_datasets = len(cd_df["dataset_name"].unique())
# num_algs = len(cd_df["classifier_name"].unique())

# num_algs_per_dataset = cd_df.groupby("dataset_name")["classifier_name"].apply(lambda x: len(set(x)))
# incomplete_datasets = list(num_algs_per_dataset[num_algs_per_dataset < num_algs].index)

# print(f"total number of datasets: {num_datasets}")
# print(f"num. datasets with incomplete results: {len(incomplete_datasets)}")

# cd_df = cd_df.loc[~cd_df["dataset_name"].isin(incomplete_datasets), :]

# print(f"after removing incomplete datasets, num datasets remaining: {len(cd_df['dataset_name'].unique())}")

# # make CD plot
# draw_cd_diagram(cd_df, alpha=0.05, fig_filename=f"./cd_plots/{metric_col}.pdf", title=metric_name)

# TODO: create plots for subsets of algs
# TODO: create plots comparing best performance for each class of algs (trees, neural, baselines)