# Create runtime tables

In [1]:
# read cleaned results
import pandas as pd
from pathlib import Path

fold_df_with_default = pd.read_csv(Path("./cleaned_results/tuned_fold_results_with_default.csv"))

# read metafeatures
metafeatures_df = pd.read_csv(Path("../../TabSurvey/metafeatures.csv"))

# make sure that the output folder exists
output_folder = Path("./runtime_tables")
output_folder.mkdir(exist_ok=True)    

In [2]:
# get the default hparams, and include TabPFN & LinearModel (which only have default params)
default_rows = fold_df_with_default.loc[(fold_df_with_default["alg_name"].str.contains("(default)") | fold_df_with_default["alg_name"].str.contains("TabPFNModel") | fold_df_with_default["alg_name"].str.contains("LinearModel")), :]

  default_rows = fold_df_with_default.loc[(fold_df_with_default["alg_name"].str.contains("(default)") | fold_df_with_default["alg_name"].str.contains("TabPFNModel") | fold_df_with_default["alg_name"].str.contains("LinearModel")), :]


In [3]:
# get the number of instances for each dataset, we will use these later
num_instances = metafeatures_df.loc[:, ["dataset_name", "f__pymfe.general.nr_inst"]]
num_instances.columns = ["dataset_fold_id", "num_inst"]

In [4]:
# merge in num instances
time_df = default_rows.merge(num_instances, on="dataset_fold_id", how="left")

# calculate runtime
time_col = "time__train"

time_per_inst_col = "train_per_1000_inst"

time_df.loc[:, time_per_inst_col] = 1000. * time_df[time_col] / time_df["num_inst"]

# average over all folds
time_df_agg = time_df.groupby(["dataset_name", "alg_name"]).agg({time_per_inst_col: "mean"}).reset_index()

time_df_agg.loc[:, f"{time_per_inst_col}_rank"] = time_df_agg.groupby(["dataset_name"])[time_per_inst_col].rank(method="min", ascending=True).values

# Calculate average runtime per samples, over all folds

- only include algs parameterized with default hparams
- also calculate normalized accuracy for comparing runtime with performance

In [5]:
# create a table of runtime rankings, sorted by avg. rank
# also show mean time-per-1000-instances

metric = time_per_inst_col

# overall_ranks = non_tabpfn_agg_df_no_default.groupby("alg_name").agg(
overall_ranks = time_df_agg.groupby("alg_name").agg(
    {
        f"{metric}_rank": ["min", "max", "mean", "count"],
        f"{metric}": "mean",
        # "Accuracy_rank": "mean",
    }
).reset_index().sort_values([(f"{metric}_rank", "mean")])

# format min/max rank columns to be ints, and floats to 2 decimal places

overall_ranks.loc[:, "count"] = overall_ranks.loc[:, (f"{metric}_rank", "count")].astype(int)
overall_ranks.drop(columns=(f"{metric}_rank", "count"), inplace=True)

overall_ranks.loc[:, (f"{metric}_rank", "min")] = overall_ranks.loc[:, (f"{metric}_rank", "min")].astype(int)
overall_ranks.loc[:, (f"{metric}_rank", "max")] = overall_ranks.loc[:, (f"{metric}_rank", "max")].astype(int)

overall_ranks.loc[:, (f"{metric}_rank", "mean")] = overall_ranks.loc[:, (f"{metric}_rank", "mean")].round(2)
overall_ranks.loc[:, (f"{metric}", "mean")] = overall_ranks.loc[:,(f"{metric}", "mean")].round(2)

print(f"metric: train-time")
final_table = overall_ranks.set_index("alg_name")
print(final_table)

# save to csv
final_table.to_csv(output_folder / "time_per_1000_rank_tables.csv", index=True)

# save to latex
final_table.to_latex(output_folder / "time_per_1000_rank_tables.tex", index=True, escape=False)


print("\n")

metric: train-time
                        train_per_1000_inst_rank             \
                                             min max   mean   
alg_name                                                      
KNN (default)                                  1   4   1.35   
DecisionTree (default)                         1   3   1.98   
LinearModel                                    1   4   2.84   
RandomForest (default)                         4   8   5.30   
SVM (default)                                  3  17   5.72   
XGBoost (default)                              3   9   5.92   
LightGBM (default)                             4  11   6.11   
CatBoost (default)                             5  18   8.22   
MLP-rtdl (default)                             6  16   9.74   
ResNet (default)                               8  15  10.65   
MLP (default)                                  7  16  11.29   
VIME (default)                                 8  16  12.02   
STG (default)                       

  final_table.to_latex(output_folder / "time_per_1000_rank_tables.tex", index=True, escape=False)
