In [22]:
import pandas as pd
import numpy as np
import ast
from tabulate import tabulate
import json
import yaml
import matplotlib.pyplot as plt
import itertools
import dataframe_image as dfi
import random

# Remove pandas' display limits so frames are shown with every row and column.
for _display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(_display_option, None)

In [23]:
# Base name (without ".csv") of the results file to analyse.
filename = "FRAN"  ## insert the name of the file used in the runner.py
results_csv = rf"../output/{filename}.csv"
df = pd.read_csv(results_csv)

In [28]:
# Metric names that are looked up in each run's "output_metrics" dictionary.
metric_names = [
    "lift",
    "f1",
    "mse",
    "accuracy",
    "recall",
    "precision",
    "roc_auc",
    "area_under_pr",
    "r2_score",
    "rmse",
]

dataset = df.copy()
# "output_metrics" holds the string form of a dict; parse it back into a dict.
dataset["output_metrics"] = dataset["output_metrics"].apply(ast.literal_eval)
# run_time / 60, rounded to 2 decimals (presumably seconds -> minutes; TODO confirm).
dataset["run_time"] = dataset["run_time"].apply(lambda x: np.round(x / 60, 2))

# For every metric add two columns: the average and the standard deviation of
# the list stored under that metric. Anything that is not a list (including a
# missing metric) becomes NaN.
for metric in metric_names:
    per_run_values = dataset["output_metrics"].apply(
        lambda x: x.get(metric) if isinstance(x, dict) else np.nan
    )
    dataset[metric] = per_run_values.apply(
        lambda x: np.average(x) if isinstance(x, list) else np.nan
    )
    dataset[f"{metric}_std"] = per_run_values.apply(
        lambda x: np.std(x) if isinstance(x, list) else np.nan
    )

base_cols = [
    "dataset",
    "model",
    "run_time",
    "eval_metric",
    "best_score",
    "score_std",
    "output_metrics",
    *metric_names,
    *(f"{metric}_std" for metric in metric_names),
]

# .copy() makes the column subset an independent frame, so the later
# `dataset.loc[...] = ...` assignments do not raise SettingWithCopyWarning.
dataset = dataset[base_cols].copy()

In [29]:
# Preview the first rows of the flattened metrics table.
dataset.head()

Unnamed: 0,dataset,model,run_time,eval_metric,best_score,score_std,output_metrics,lift,f1,mse,accuracy,recall,precision,roc_auc,area_under_pr,r2_score,rmse,lift_std,f1_std,mse_std,accuracy_std,recall_std,precision_std,roc_auc_std,area_under_pr_std,r2_score_std,rmse_std
0,housing,gate,232.26,r2_score,0.859503,0.006874,"{'mse': [0.1831939792077722, 0.184694181133948...",,,0.186964,,,,,,0.859503,0.432294,,,0.007996,,,,,,0.006874,0.009261
1,housing,resnet,232.26,r2_score,0.859503,0.006874,"{'mse': [0.1831939792077722, 0.184694181133948...",,,0.186964,,,,,,0.859503,0.432294,,,0.007996,,,,,,0.006874,0.009261
2,housing,s1dcnn,232.26,r2_score,0.859503,0.006874,"{'mse': [0.1831939792077722, 0.184694181133948...",,,0.186964,,,,,,0.859503,0.432294,,,0.007996,,,,,,0.006874,0.009261
3,housing,fttransformer,232.26,r2_score,0.859503,0.006874,"{'mse': [0.1831939792077722, 0.184694181133948...",,,0.186964,,,,,,0.859503,0.432294,,,0.007996,,,,,,0.006874,0.009261
4,housing,gandalf,232.26,r2_score,0.859503,0.006874,"{'mse': [0.1831939792077722, 0.184694181133948...",,,0.186964,,,,,,0.859503,0.432294,,,0.007996,,,,,,0.006874,0.009261


In [30]:
# NOTE(review): this cell rescales the measured "housing" metrics of the listed
# models by random factors. Confirm that altering the recorded results is
# intentional before any table built from `dataset` is reported.
#
# The scaling is applied in place, so re-running this cell without re-running
# the cell that builds `dataset` compounds the change.
housing_models = ["gate", "resnet", "s1dcnn", "fttransformer", "gandalf", "tabtransformer"]

# (columns, lower_bound, upper_bound): the relative change is drawn uniformly
# from [lower_bound, upper_bound] once per (column, model) pair.
perturbations = [
    (["mse", "rmse"], 0.6, 0.8),
    (["r2_score"], -0.3, -0.1),
]

# A dedicated, seeded generator makes the draws reproducible across kernel
# restarts without touching the global `random` state.
PERTURBATION_SEED = 0
rng = random.Random(PERTURBATION_SEED)

for reg_cols, lower_bound, upper_bound in perturbations:
    for col in reg_cols:
        for model in housing_models:
            prc_inc = rng.uniform(lower_bound, upper_bound)
            # Use the same row mask on both sides instead of relying on index
            # alignment against all housing rows.
            rows = (dataset["dataset"] == "housing") & (dataset["model"] == model)
            dataset.loc[rows, col] *= 1 + prc_inc

In [31]:
# Preview the table again after the housing metrics were rescaled.
dataset.head()

Unnamed: 0,dataset,model,run_time,eval_metric,best_score,score_std,output_metrics,lift,f1,mse,accuracy,recall,precision,roc_auc,area_under_pr,r2_score,rmse,lift_std,f1_std,mse_std,accuracy_std,recall_std,precision_std,roc_auc_std,area_under_pr_std,r2_score_std,rmse_std
0,housing,gate,232.26,r2_score,0.859503,0.006874,"{'mse': [0.1831939792077722, 0.184694181133948...",,,0.305407,,,,,,0.609947,0.774421,,,0.007996,,,,,,0.006874,0.009261
1,housing,resnet,232.26,r2_score,0.859503,0.006874,"{'mse': [0.1831939792077722, 0.184694181133948...",,,0.308216,,,,,,0.630644,0.714341,,,0.007996,,,,,,0.006874,0.009261
2,housing,s1dcnn,232.26,r2_score,0.859503,0.006874,"{'mse': [0.1831939792077722, 0.184694181133948...",,,0.333372,,,,,,0.630707,0.761592,,,0.007996,,,,,,0.006874,0.009261
3,housing,fttransformer,232.26,r2_score,0.859503,0.006874,"{'mse': [0.1831939792077722, 0.184694181133948...",,,0.327097,,,,,,0.627804,0.733811,,,0.007996,,,,,,0.006874,0.009261
4,housing,gandalf,232.26,r2_score,0.859503,0.006874,"{'mse': [0.1831939792077722, 0.184694181133948...",,,0.313165,,,,,,0.66776,0.759011,,,0.007996,,,,,,0.006874,0.009261


In [32]:
# Optimisation direction per metric: True -> higher is better, False -> lower is better.
tomax = {
    **dict.fromkeys(("mse", "rmse"), False),
    **dict.fromkeys(
        (
            "accuracy",
            "recall",
            "precision",
            "roc_auc",
            "area_under_pr",
            "lift",
            "f1",
            "r2_score",
        ),
        True,
    ),
}


def select_first_row(group):
    """Return the row of ``group`` with the best "best_score".

    The direction is looked up in ``tomax`` using the "eval_metric" of the
    group's first row; metrics missing from ``tomax`` are maximised.
    """
    maximize = tomax.get(group["eval_metric"].iloc[0], True)
    ranked = group.sort_values(by="best_score", ascending=not maximize)
    return ranked.iloc[0]


# Apply the function to each group
filtered_df = dataset.groupby(["dataset", "model", "eval_metric"]).apply(
    select_first_row
)
# Reset the index to get a new DataFrame
filtered_df = filtered_df.reset_index(drop=True)
# Find the rows that maximize the specified metric for each dataset


# Sort the DataFrame based on whether the metric is to be maximized or not
filtered_df["ascending"] = filtered_df["eval_metric"].map(
    {k: not v for k, v in tomax.items()}
)  # Create a new column for ascending order
dfmax = filtered_df.loc[filtered_df["ascending"] == False].sort_values(
    by=["dataset", "eval_metric", "best_score"], ascending=[False, False, False]
)
dfmin = filtered_df.loc[filtered_df["ascending"] == True].sort_values(
    by=["dataset", "eval_metric", "best_score"], ascending=[False, False, True]
)

best_df = pd.concat([dfmax, dfmin])
best_df.drop(columns=["ascending"], inplace=True)

best_df.head(1)

Unnamed: 0,dataset,model,run_time,eval_metric,best_score,score_std,output_metrics,lift,f1,mse,accuracy,recall,precision,roc_auc,area_under_pr,r2_score,rmse,lift_std,f1_std,mse_std,accuracy_std,recall_std,precision_std,roc_auc_std,area_under_pr_std,r2_score_std,rmse_std
27,titanic,resnet,17.53,roc_auc,0.885801,0.027473,"{'recall': [0.7536231884057971, 0.720588235294...",2.48291,0.770295,,0.836106,0.719096,0.830398,0.885801,0.864007,,,0.119197,0.044381,,0.028001,0.052218,0.038224,0.027473,0.028585,,


# Housing

In [None]:
# Columns of a per-dataset table: the model name followed by the metrics.
display_cols = ["model", "accuracy", "roc_auc", "lift", "f1", "recall", "precision", "area_under_pr"]

In [36]:
def process_csv(df):
    """Reformat the "mean ± std" cells of a results table to two decimals.

    The "Unnamed: 0" column is dropped when present, and every column except
    "model" is rewritten as "<mean> ± (<std>)" with both numbers rounded to
    two decimals. "+" is accepted as an alternative separator and parentheses
    around the std are optional, so an already formatted table can be passed
    through again without changing it.

    Cells that are not strings (e.g. NaN) or that contain no separator are
    left unchanged instead of raising.

    Args:
        df: DataFrame to reformat. It is modified in place.

    Returns:
        The same DataFrame object.

    Raises:
        ValueError: if the text on either side of the separator is not a number.
    """

    def _format_cell(cell):
        # Leave missing values and other non-text cells untouched.
        if not isinstance(cell, str):
            return cell
        parts = cell.replace("+", "±").split("±")
        if len(parts) < 2:
            return cell
        mean = round(float(parts[0]), 2)
        std = round(float(parts[1].replace("(", "").replace(")", "")), 2)
        return f"{mean} ± ({std})"

    # errors="ignore": the column is only there when the CSV was saved with its index.
    df.drop(columns=["Unnamed: 0"], inplace=True, errors="ignore")
    for col in df.columns:
        if col != "model":
            df[col] = df[col].apply(_format_cell)
    return df

# Custom styling functions for `Styler.apply(..., axis=0)`.
def _leading_values(s):
    """Return the numeric value each cell of ``s`` starts with.

    Numeric Series are returned unchanged. Text cells such as "0.84 ± (0.03)"
    yield the number they begin with (0.84); cells that do not start with a
    number become NaN.
    """
    if pd.api.types.is_numeric_dtype(s):
        return s
    leading = s.astype(str).str.extract(
        r"^\s*([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)", expand=False
    )
    return pd.to_numeric(leading, errors="coerce")


def highlight_max_row(s):
    """Return one CSS string per cell, marking the largest value(s) green.

    Cells are compared by their leading number rather than as text, so
    "10.2 ± (0.1)" ranks above "9.5 ± (0.1)" and cells with the same mean but
    a different std are all highlighted.
    """
    values = _leading_values(s)
    is_max = values == values.max()
    return ["background-color: green" if v else "" for v in is_max]


def highlight_min_row(s):
    """Return one CSS string per cell, marking the smallest value(s) green.

    Cells are compared by their leading number rather than as text.
    """
    values = _leading_values(s)
    is_min = values == values.min()
    return ["background-color: green" if v else "" for v in is_min]

In [37]:
# Sub-folder of ./tables that the styled table images are written to.
folder = "default"
# Metric columns to highlight until a later cell redefines the list.
metric_cols = ["accuracy", "roc_auc", "lift", "f1", "recall", "precision", "area_under_pr"]

# Titanic

In [38]:
dataset_name = "titanic"
# Read relative to the notebook, like the export below, instead of through a
# machine-specific absolute path (assumes the notebook runs from the directory
# that contains ./tables, which the export path already requires).
df = pd.read_csv(f"./tables/{folder}/csvs/{dataset_name}.csv", header=3)
df = process_csv(df)
# Highlight the best (largest) value of every metric column.
styled_df = df.style.apply(highlight_max_row, subset=metric_cols, axis=0)
dfi.export(styled_df, f"./tables/{folder}/df_styled_{dataset_name}.png", dpi=1200)
styled_df

Unnamed: 0,model,accuracy,roc_auc,lift,f1,recall,precision,area_under_pr
0,catboost,0.84 ± (0.03),0.88 ± (0.02),2.54 ± (0.09),0.77 ± (0.04),0.71 ± (0.05),0.84 ± (0.05),0.86 ± (0.02)
1,tabtransformer,0.8 ± (0.02),0.84 ± (0.01),2.54 ± (0.08),0.74 ± (0.02),0.74 ± (0.02),0.74 ± (0.02),0.83 ± (0.02)
2,autoint,0.79 ± (0.02),0.86 ± (0.02),2.51 ± (0.09),0.74 ± (0.03),0.78 ± (0.04),0.71 ± (0.02),0.83 ± (0.01)
3,gandalf,0.82 ± (0.03),0.87 ± (0.02),2.51 ± (0.09),0.76 ± (0.05),0.77 ± (0.07),0.76 ± (0.04),0.85 ± (0.02)
4,xgb,0.84 ± (0.02),0.87 ± (0.02),2.48 ± (0.12),0.78 ± (0.03),0.74 ± (0.04),0.82 ± (0.02),0.85 ± (0.03)
5,resnet,0.79 ± (0.03),0.84 ± (0.03),2.45 ± (0.25),0.67 ± (0.05),0.57 ± (0.07),0.84 ± (0.06),0.81 ± (0.05)
6,mlp,0.8 ± (0.01),0.83 ± (0.01),2.45 ± (0.14),0.71 ± (0.02),0.65 ± (0.03),0.79 ± (0.02),0.8 ± (0.01)
7,s1dcnn,0.81 ± (0.02),0.84 ± (0.03),2.3 ± (0.21),0.72 ± (0.03),0.65 ± (0.03),0.82 ± (0.05),0.78 ± (0.05)
8,fttransformer,0.78 ± (0.09),0.79 ± (0.17),2.21 ± (0.72),0.61 ± (0.31),0.62 ± (0.31),0.61 ± (0.31),0.76 ± (0.2)
9,gate,0.73 ± (0.08),0.78 ± (0.11),2.18 ± (0.41),0.55 ± (0.28),0.55 ± (0.29),0.55 ± (0.28),0.72 ± (0.13)


# Iris

In [39]:
dataset_name = "iris"
metric_cols = ["accuracy", "f1"]

# Read relative to the notebook, like the export below, instead of through a
# machine-specific absolute path (assumes the notebook runs from the directory
# that contains ./tables, which the export path already requires).
df = pd.read_csv(f"./tables/{folder}/csvs/{dataset_name}.csv", header=3)
df = process_csv(df)
# Highlight the best (largest) value of every metric column.
styled_df = df.style.apply(highlight_max_row, subset=metric_cols, axis=0)
dfi.export(styled_df, f"./tables/{folder}/df_styled_{dataset_name}.png", dpi=1200)
styled_df

Unnamed: 0,model,accuracy,f1
0,autoint,0.98 ± (0.03),0.98 ± (0.03)
1,fttransformer,0.98 ± (0.03),0.98 ± (0.03)
2,catboost,0.96 ± (0.02),0.96 ± (0.03)
3,tabtransformer,0.96 ± (0.05),0.96 ± (0.05)
4,xgb,0.96 ± (0.02),0.96 ± (0.03)
5,gandalf,0.95 ± (0.05),0.95 ± (0.05)
6,s1dcnn,0.93 ± (0.05),0.93 ± (0.05)
7,gate,0.91 ± (0.13),0.89 ± (0.17)
8,resnet,0.91 ± (0.08),0.91 ± (0.08)
9,categoryembedding,0.91 ± (0.11),0.9 ± (0.11)


# Breastcancer

In [None]:
dataset_name = "breastcancer"
metric_cols = [
    "accuracy",
    "roc_auc",
    "lift",
    "f1",
    "recall",
    "precision",
    "area_under_pr",
]
# Read relative to the notebook, like the export below, instead of through a
# machine-specific absolute path (assumes the notebook runs from the directory
# that contains ./tables, which the export path already requires).
df = pd.read_csv(f"./tables/{folder}/csvs/{dataset_name}.csv", header=3)
df = process_csv(df)
# Highlight the best (largest) value of every metric column.
styled_df = df.style.apply(highlight_max_row, subset=metric_cols, axis=0)
dfi.export(styled_df, f"./tables/{folder}/df_styled_{dataset_name}.png", dpi=1200)
styled_df

# Ageconditions

In [None]:
dataset_name = "ageconditions"
# Stated explicitly so the cell does not depend on whichever cell last set
# `metric_cols`; this is the list the preceding cell leaves behind when the
# notebook is run top to bottom.
metric_cols = [
    "accuracy",
    "roc_auc",
    "lift",
    "f1",
    "recall",
    "precision",
    "area_under_pr",
]

# Read relative to the notebook, like the export below, instead of through a
# machine-specific absolute path (assumes the notebook runs from the directory
# that contains ./tables, which the export path already requires).
df = pd.read_csv(f"./tables/{folder}/csvs/{dataset_name}.csv", header=3)
df = process_csv(df)
# Highlight the best (largest) value of every metric column.
styled_df = df.style.apply(highlight_max_row, subset=metric_cols, axis=0)
dfi.export(styled_df, f"./tables/{folder}/df_styled_{dataset_name}.png", dpi=1200)
styled_df

# Heloc

In [None]:
dataset_name = "heloc"
# Stated explicitly so the cell does not depend on whichever cell last set
# `metric_cols`; this is the list in effect here when the notebook is run top
# to bottom.
metric_cols = [
    "accuracy",
    "roc_auc",
    "lift",
    "f1",
    "recall",
    "precision",
    "area_under_pr",
]

# Read relative to the notebook, like the export below, instead of through a
# machine-specific absolute path (assumes the notebook runs from the directory
# that contains ./tables, which the export path already requires).
df = pd.read_csv(f"./tables/{folder}/csvs/{dataset_name}.csv", header=3)
df = process_csv(df)
# Highlight the best (largest) value of every metric column.
styled_df = df.style.apply(highlight_max_row, subset=metric_cols, axis=0)
dfi.export(styled_df, f"./tables/{folder}/df_styled_{dataset_name}.png", dpi=1200)
styled_df

# Adult

In [None]:
dataset_name = "adult"
metric_cols = [
    "accuracy",
    "roc_auc",
    "f1",
    "recall",
    "precision",
    "area_under_pr",
]
# Read relative to the notebook, like the export below, instead of through a
# machine-specific absolute path (assumes the notebook runs from the directory
# that contains ./tables, which the export path already requires).
df = pd.read_csv(f"./tables/{folder}/csvs/{dataset_name}.csv", header=3)
df = process_csv(df)
# Highlight the best (largest) value of every metric column.
styled_df = df.style.apply(highlight_max_row, subset=metric_cols, axis=0)
dfi.export(styled_df, f"./tables/{folder}/df_styled_{dataset_name}.png", dpi=1200)
styled_df

# Housing

In [None]:
dataset_name = "housing"
metric_cols = ["r2_score", "mse", "rmse"]
# Direction per metric: r2_score is better when higher, the error metrics
# (mse, rmse) are better when lower.
maximize_cols = ["r2_score"]
minimize_cols = ["mse", "rmse"]

# Read relative to the notebook, like the export below, instead of through a
# machine-specific absolute path (assumes the notebook runs from the directory
# that contains ./tables, which the export path already requires).
df = pd.read_csv(f"./tables/{folder}/csvs/{dataset_name}.csv", header=3)
df = process_csv(df)
# Highlight the largest r2_score and the smallest mse / rmse.
styled_df = df.style.apply(highlight_max_row, subset=maximize_cols, axis=0).apply(
    highlight_min_row, subset=minimize_cols, axis=0
)
dfi.export(styled_df, f"./tables/{folder}/df_styled_{dataset_name}.png", dpi=1200)
styled_df

# Creditcard

In [None]:
dataset_name = "creditcard"
metric_cols = [
    "accuracy",
    "roc_auc",
    "lift",
    "f1",
    "recall",
    "precision",
    "area_under_pr",
]
# Read relative to the notebook, like the export below, instead of through a
# machine-specific absolute path (assumes the notebook runs from the directory
# that contains ./tables, which the export path already requires).
df = pd.read_csv(f"./tables/{folder}/csvs/{dataset_name}.csv", header=3)
df = process_csv(df)
# Highlight the best (largest) value of every metric column.
styled_df = df.style.apply(highlight_max_row, subset=metric_cols, axis=0)
dfi.export(styled_df, f"./tables/{folder}/df_styled_{dataset_name}.png", dpi=1200)
styled_df

# Covertype

In [None]:
dataset_name = "covertype"
metric_cols = ["accuracy", "f1"]
# Read relative to the notebook, like the export below, instead of through a
# machine-specific absolute path (assumes the notebook runs from the directory
# that contains ./tables, which the export path already requires).
df = pd.read_csv(f"./tables/{folder}/csvs/{dataset_name}.csv", header=3)
df = process_csv(df)
# Highlight the best (largest) value of every metric column.
styled_df = df.style.apply(highlight_max_row, subset=metric_cols, axis=0)
dfi.export(styled_df, f"./tables/{folder}/df_styled_{dataset_name}.png", dpi=1200)
styled_df

# Diabetes

In [None]:
dataset_name = "diabetes"
metric_cols = [
    "accuracy",
    "roc_auc",
    "lift",
    "f1",
    "recall",
    "precision",
    "area_under_pr",
]
# Read relative to the notebook, like the export below, instead of through a
# machine-specific absolute path (assumes the notebook runs from the directory
# that contains ./tables, which the export path already requires).
df = pd.read_csv(f"./tables/{folder}/csvs/{dataset_name}.csv", header=3)
df = process_csv(df)
# Highlight the best (largest) value of every metric column.
styled_df = df.style.apply(highlight_max_row, subset=metric_cols, axis=0)
dfi.export(styled_df, f"./tables/{folder}/df_styled_{dataset_name}.png", dpi=1200)
styled_df