In [1]:
import os
import numpy as np
import pandas as pd

In [5]:
# --- Experiment configuration -------------------------------------------
dataset_name = "MoonsDataset"
experiment_output_folder = "../models"
METHOD = "ppcef"
# Other model names listed in the original note:
# MultilayerPerceptron, MultinomialLogisticRegression, LogisticRegression, NODE
disc_model = "NNRegression"

# Raw metric column -> table header. The pair order is kept by the dict and
# fixes the column order wherever `columns.values()` is used as a selector.
columns = dict(
    [
        ("dataset", "dataset"),
        ("method", "method"),
        # ("model_returned_smth", "Coverage"),
        ("valid_mae", "MAE"),
        ("flow_prob_condition_acc", "Prob. Plaus."),
        ("lof_scores_cfs", "LOF"),
        # ("lof_scores_xs", "LOF_x"),
        ("isolation_forest_scores_cfs", "IsoForest"),
        # ("isolation_forest_scores_xs", "IsoForest_x"),
        ("flow_log_density_cfs", "Log Dens."),
        ("dissimilarity_proximity_continuous_manhatan", "L1"),
        ("dissimilarity_proximity_continuous_euclidean", "L2"),
        ("time", "Time"),
    ]
)

In [6]:
# First batch
# Load the per-run metric tables of every method for the listed datasets and
# stack them into one long frame tagged with `dataset` and `method`.
datasets = [
    "ToyRegressionDataset",
    "ConcreteDataset",
    "DiabetesDataset",
    "YachtDataset",
    "Scm20dDataset",
    # "DigitsDataset",
    # "WineDataset",
]


def _read_method_metrics(folder, file_name, dataset, method):
    """Read one metrics CSV and tag every row with its dataset and method.

    Args:
        folder: Directory that contains the CSV.
        file_name: Name of the metrics file inside ``folder``.
        dataset: Value written to the ``dataset`` column.
        method: Value written to the ``method`` column.

    Returns:
        The CSV content as a DataFrame with the two tag columns set.

    Raises:
        FileNotFoundError: If the file does not exist.
    """
    frame = pd.read_csv(os.path.join(folder, file_name))
    frame["dataset"] = dataset
    frame["method"] = method
    return frame


# Parts are gathered in a list and concatenated once at the end instead of
# re-copying the accumulated frame on every iteration.
result_parts = []
for dataset_name in datasets:
    # Read-only cell: the folder is not created here, so a wrong path fails
    # in read_csv instead of leaving an empty directory behind.
    output_folder = os.path.join(experiment_output_folder, dataset_name)

    df_part_results = _read_method_metrics(
        output_folder, f"metrics_{disc_model}.csv", dataset_name, "PPCEFR"
    )
    df_part_results_wach = _read_method_metrics(
        output_folder, f"metrics_wach_{disc_model}.csv", dataset_name, "WACH"
    )
    result_parts.extend([df_part_results, df_part_results_wach])

    # No CEARM metrics file is read for Scm20dDataset.
    if dataset_name != "Scm20dDataset":
        df_part_results_cearm = _read_method_metrics(
            output_folder, f"metrics_cearm_{disc_model}.csv", dataset_name, "CEARM"
        )
        result_parts.append(df_part_results_cearm)

# pd.concat rejects an empty list; fall back to an empty frame in that case.
df_results = pd.concat(result_parts) if result_parts else pd.DataFrame()

In [7]:
# Keep only the reported metrics, under their table headers. The values are
# left unrounded here: rounding each run before averaging would bias the
# group means, so rounding happens once, when the LaTeX is formatted.
df_results = df_results.rename(columns=columns)[list(columns.values())]

# Mean of every metric per (dataset, method), printed as LaTeX source with
# two decimals.
print(
    df_results.groupby(["dataset", "method"])
    .mean()
    .to_latex(float_format="%.2f", multicolumn=True, multirow=True, index=True)
)

\begin{tabular}{llrrrrrrrr}
\toprule
 &  & MAE & Prob. Plaus. & LOF & IsoForest & Log Dens. & L1 & L2 & Time \\
dataset & method &  &  &  &  &  &  &  &  \\
\midrule
\multirow[t]{3}{*}{ConcreteDataset} & CEARM & 0.11 & 0.00 & 2.20 & -0.11 & -4197.69 & 2.66 & 1.14 & 130.21 \\
 & PPCEFR & 0.02 & 0.99 & 1.09 & 0.03 & 7.46 & 0.37 & 0.16 & 4.53 \\
 & WACH & 0.03 & 0.51 & 1.12 & 0.01 & 2.83 & 0.28 & 0.13 & 3.78 \\
\cline{1-10}
\multirow[t]{3}{*}{DiabetesDataset} & CEARM & 0.10 & 0.00 & 1.85 & -0.09 & -166.15 & 3.25 & 1.23 & 64.41 \\
 & PPCEFR & 0.02 & 1.00 & 1.05 & 0.04 & 8.48 & 0.78 & 0.31 & 5.64 \\
 & WACH & 0.03 & 0.27 & 1.12 & 0.02 & 3.67 & 0.66 & 0.26 & 2.96 \\
\cline{1-10}
\multirow[t]{2}{*}{Scm20dDataset} & PPCEFR & 0.06 & 0.88 & 1.08 & 0.05 & 85.42 & 3.93 & 0.66 & 56.83 \\
 & WACH & 0.03 & 0.04 & 1.08 & 0.01 & 29.01 & 2.51 & 0.41 & 21.19 \\
\cline{1-10}
\multirow[t]{3}{*}{ToyRegressionDataset} & CEARM & 0.12 & 0.20 & 1.54 & -0.08 & -4.61 & 0.52 & 0.40 & 107.84 \\
 & PPCEFR & 0.00 & 1.

In [None]:
# First batch
# Build one pre-formatted table row per (dataset, method) from the
# cross-validation metrics files.
datasets = [
    "ToyRegressionDataset",
    "ConcreteDataset",
    "DiabetesDataset",
    "YachtDataset",
    "Scm20dDataset",
    # "DigitsDataset",
    # "WineDataset",
]
methods = [
    # "cbce",
    # "CEGP",
    # "CEM",
    # "wach",
    "ppcef",
    # "artelth20"
]  #  ,


def _format_metric(header, value):
    """Render one metric value as table text.

    Args:
        header: Table header of the metric (a value of ``columns``).
        value: Numeric metric value; NaN when the metric is missing.

    Returns:
        Three decimals for ``IsoForest``; for ``LOF`` two decimals below 10
        and scientific notation otherwise (NaN also takes that branch and
        prints as ``nan``); two decimals for every other metric.
    """
    if header == "IsoForest":
        return f"{value:.3f}"
    if header == "LOF" and not value < 10:
        return f"{value:.2e}"
    return f"{value:.2f}"


# Rows are collected as dicts and turned into a frame once, instead of
# concatenating a one-row frame onto the result on every iteration.
records = []
for dataset_name in datasets:
    for method in methods:
        # Read-only cell: no folders are created here, so a wrong path fails
        # in read_csv instead of leaving empty directories behind.
        output_folder = os.path.join(experiment_output_folder, dataset_name)

        # NOTE(review): the file is read from the dataset folder and its name
        # does not contain `method`, so every method would load the same
        # file - confirm this is intended.
        df_part_results = pd.read_csv(
            os.path.join(output_folder, f"metrics_{disc_model}_cv.csv")
        )
        # NOTE(review): only the first row is used - presumably it already
        # holds the aggregated values; confirm against the file's producer.
        means = df_part_results.iloc[0]

        output = {
            "dataset": dataset_name.removesuffix("Dataset"),
            "method": method,
        }
        for key, value in columns.items():
            # The two label columns are set above and are not numeric, so
            # they are not passed through the float formatting.
            if value in ("dataset", "method"):
                continue
            output[value] = _format_metric(value, means.get(key, np.nan))
        records.append(output)

# `columns=` fixes the column order and drops keys that are not table headers.
df_results = pd.DataFrame(records, columns=list(columns.values()))

In [None]:
# Display the current `df_results` table.
df_results

In [None]:
# Print `df_results` as LaTeX source, without the row index.
print(df_results.to_latex(index=False))

In [None]:
# All batches
# Build one pre-formatted "mean$\pm$std" table row per (dataset, method) from
# the cross-validation metrics files stored in each method's folder.
datasets = [
    # "MoonsDataset",
    # "LawDataset",
    # "AuditDataset",
    # "HelocDataset",
    "BlobsDataset",
    "DigitsDataset",
    "WineDataset",
]
methods = ["ppcef"]  # "artelth20","cbce", "CEM","CEGP","wach",


def _format_mean_std(header, mean, std):
    """Render one metric as ``mean$\\pm$std`` table text.

    Args:
        header: Table header of the metric (a value of ``columns``).
        mean: Mean of the metric; NaN when the metric is missing.
        std: Standard deviation of the metric; NaN when missing.

    Returns:
        Both numbers in the same format: three decimals for ``IsoForest``;
        for ``LOF`` two decimals when the mean is below 10 and scientific
        notation otherwise (a NaN mean also takes that branch); two decimals
        for every other metric.
    """
    if header == "IsoForest":
        spec = ".3f"
    elif header == "LOF" and not mean < 10:
        spec = ".2e"
    else:
        spec = ".2f"
    # The backslash is escaped explicitly; a bare "\p" is an invalid escape
    # sequence in a normal string literal.
    return f"{mean:{spec}}$\\pm${std:{spec}}"


# Rows are collected as dicts and turned into a frame once, instead of
# concatenating a one-row frame onto the result on every iteration.
records = []
for dataset_name in datasets:
    for method in methods:
        # Read-only cell: no folders are created here, so a wrong path fails
        # in read_csv instead of leaving empty directories behind.
        output_folder = os.path.join(experiment_output_folder, dataset_name)
        save_folder = os.path.join(output_folder, method)

        df_part_results = pd.read_csv(
            os.path.join(save_folder, f"metrics_{disc_model}_CE_cv.csv")
        )
        # Unrounded statistics: the format spec does the only rounding, so
        # values are not rounded twice and small stds survive the `.2e` form.
        means = df_part_results.mean().to_dict()
        stds = df_part_results.std().to_dict()

        output = {
            "dataset": dataset_name.removesuffix("Dataset"),
            "method": method,
        }
        for key, value in columns.items():
            # The two label columns are set above and are not numeric, so
            # they are not passed through the float formatting.
            if value in ("dataset", "method"):
                continue
            output[value] = _format_mean_std(
                value, means.get(key, np.nan), stds.get(key, np.nan)
            )
        records.append(output)

# `columns=` fixes the column order and drops keys that are not table headers.
df_results = pd.DataFrame(records, columns=list(columns.values()))

In [None]:
# Display the current `df_results` table.
df_results

In [None]:
# Display `df_results` again (this cell repeats the display of the cell before it).
df_results

In [None]:
# Print `df_results` as LaTeX source, without the row index.
print(df_results.to_latex(index=False))

In [None]:
# Column-wise mean of the metrics file of each lambda value, one column per
# lambda, with the lambda itself stored as an extra "lambda" row.
# The one-column frames are gathered in a list and joined once, instead of
# re-copying the growing frame on every iteration.
per_lambda_frames = []
for i in [1, 2, 5, 10, 100, 1000]:
    lambda_means = pd.read_csv(
        f"../models/LawDataset/ppcef/metrics_LogisticRegression_lambda_{i}_cv.csv"
    ).mean()
    lambda_means["lambda"] = i
    # to_frame() keeps the column labels identical to the incremental build.
    per_lambda_frames.append(lambda_means.to_frame())

df_res = pd.concat(per_lambda_frames, axis=1)

In [None]:
# Header mapping for the lambda sweep: raw metric name -> table header.
# NOTE: this rebinds `columns`, replacing the mapping assigned earlier in the
# notebook; cells that select with `columns` see this version afterwards.
columns = dict(
    [
        ("lambda", "lambda"),
        # ("dataset", "dataset"),
        # ("method", "method"),
        ("model_returned_smth", "Coverage"),
        ("valid_cf_disc", "validity"),
        ("flow_prob_condition_acc", "Prob. Plaus."),
        ("lof_scores_cfs", "LOF"),
        # ("lof_scores_xs", "LOF_x"),
        ("isolation_forest_scores_cfs", "IsoForest"),
        # ("isolation_forest_scores_xs", "IsoForest_x"),
        ("flow_log_density_cfs", "Log Dens."),
        ("dissimilarity_proximity_continuous_manhatan", "L1"),
        ("dissimilarity_proximity_continuous_euclidean", "L2"),
        ("time", "Time"),
    ]
)

# Transpose `df_res`, keep the mapped columns under their table headers, and
# show them rounded to two decimals.
df_res.transpose().rename(columns=columns)[columns.values()].round(decimals=2)