In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys

%store -r DISCO_ROOT_FOLDER
if "DISCO_ROOT_FOLDER" in globals():
    os.chdir(DISCO_ROOT_FOLDER)
    sys.path.append(DISCO_ROOT_FOLDER)

In [2]:
from datasets.real_world_datasets import Datasets as RealWorldDatasets
from datasets.density_datasets import Datasets as DensityDatasets
# from src.utils.metrics import METRICS, SELECTED_METRICS
from src.utils.metrics import METRIC_ABBREV_PLAIN

# Metrics used by the table configs in this notebook. Entries that are
# commented out are currently not part of the selection.
METRICS = [
    "DISCO",
    # "DC_DUNN",
    # --- Competitors ---
    "DBCV",
    # "DBCV_eucl",
    "DCSI",
    "LCCV",
    "VIASCKDE",
    "CVDD",
    "CDBW",
    "CVNN",
    # "DSI",
    # --- Gauss ---
    "SILHOUETTE",
    "S_DBW",
    # "DUNN",
    # "DB",
    # "CH",
]

# Independent list holding every selected metric except CVDD.
RUNTIME_METRICS = [metric for metric in METRICS if metric != "CVDD"]

def _real_world_categories():
    """Return a fresh list of the (name, count, label) rows used by the real-world tables."""
    return [
        ("Synth_low", 8, "Tabular data"),
        ("Weizmann", 2, "Video"),
        ("COIL20", 3, "Image"),
        ("Optdigits", 5, "MNIST"),
    ]


def _value_table_config(results_path, latex_path, datasets, aggregation_funcs, caption, categories):
    """Assemble one "value" table config that is coloured along axis 0.

    Args:
        results_path: Single entry stored in the "paths" list.
        latex_path: Stored under "latex_path".
        datasets: Iterable of enum members; their ``.name`` values become "dataset_names".
        aggregation_funcs: Stored under "aggregation_funcs".
        caption: Stored under "caption".
        categories: Stored under "categories".

    Returns:
        dict: Config with the same keys and key order as the hand-written entries had.
    """
    return {
        "paths": [results_path],
        "latex_path": latex_path,
        "dataset_names": [dataset.name for dataset in datasets],
        "aggregation_funcs": aggregation_funcs,
        "metrics": METRICS,
        "lower_is_better": ["CVNN", "DCVI", "S_DBW"],
        "selection": ["value"],
        "caption": caption,
        "categories": categories,
        "latex_coloring_axis": 0,
        "latex_coloring_selection": None,
    }


# Disabled row-wise variants, kept for reference. Each one equals the column-wise
# entry with the same "paths" below, except for "latex_path" and "caption"
# (listed per key) and these two settings:
#     "latex_coloring_axis": 1,
#     "latex_coloring_selection": ["DISCO", "DBCV", "DCSI", "S_DBW", "DSI", "SILHOUETTE", "DUNN"],
#
# "real_world_colored_row_wise_selected" (paths: ["results/real_world/"])
#     "latex_path": "latex/real_world_experiments (selected).tex",
#     "caption": "Evaluating on real-world datasets. Row-wise Green of selected ones.",
# "real_world_standardized_colored_row_wise_selected" (paths: ["results/real_world_standardized/"])
#     "latex_path": "latex/real_world_experiments_standardized (selected).tex",
#     "caption": "Evaluating on real-world datasets (standardized). Row-wise Green of selected ones.",
# "density_colored_row_wise_selected" (paths: ["results/density/"])
#     "latex_path": "latex/density_experiments (selected).tex",
#     "caption": "Evaluating on density datasets. Row-wise Green of selected ones.",
# "density_standardized_colored_row_wise_selected" (paths: ["results/density_standardized/"])
#     "latex_path": "latex/density_experiments_standardized (selected).tex",
#     "caption": "Evaluating on density datasets (standardized). Row-wise Green of selected ones.",
configs = {
    # Real World Datasets
    "real_world_colored_column_wise": _value_table_config(
        results_path="results/real_world/",
        latex_path="latex/real_world_experiments.tex",
        datasets=RealWorldDatasets,
        aggregation_funcs=["mean"],
        caption="Evaluating on real-world datasets. Column-wise Green.",
        categories=_real_world_categories(),
    ),
    "real_world_standardized_colored_column_wise": _value_table_config(
        results_path="results/real_world_standardized/",
        latex_path="latex/real_world_experiments_standardized.tex",
        datasets=RealWorldDatasets,
        aggregation_funcs=["mean"],
        caption="Evaluating on real-world datasets (standardized). Column-wise Green.",
        categories=_real_world_categories(),
    ),
    # Density Datasets
    "density_colored_column_wise": _value_table_config(
        results_path="results/density/",
        latex_path="latex/density_experiments.tex",
        datasets=DensityDatasets,
        aggregation_funcs=["mean", "std"],
        caption="Evaluating on density datasets. Column-wise Green.",
        categories=[],
    ),
    "density_standardized_colored_column_wise": _value_table_config(
        results_path="results/density_standardized/",
        latex_path="latex/density_experiments_standardized.tex",
        datasets=DensityDatasets,
        aggregation_funcs=["mean", "std"],
        caption="Evaluating on density datasets (standardized). Column-wise Green.",
        categories=[],
    ),
}

In [3]:
def _runtime_table_config(results_path, latex_path, datasets, selection, caption):
    """Assemble one runtime table config that is coloured along axis 1.

    Args:
        results_path: Single entry stored in the "paths" list.
        latex_path: Stored under "latex_path".
        datasets: Iterable of enum members; their ``.name`` values become "dataset_names".
        selection: Single entry stored in the "selection" list ("time" or "process_time").
        caption: Stored under "caption".

    Returns:
        dict: Config in which every metric counts as lower-is-better and the
        colouring selection is RUNTIME_METRICS.
    """
    return {
        "paths": [results_path],
        "latex_path": latex_path,
        "dataset_names": [dataset.name for dataset in datasets],
        "aggregation_funcs": ["mean", "std"],
        "metrics": METRICS,
        "metric_abbrev": METRIC_ABBREV_PLAIN,
        "lower_is_better": METRICS,
        "selection": [selection],
        "caption": caption,
        "categories": [],
        "latex_coloring_axis": 1,
        "latex_coloring_selection": RUNTIME_METRICS,
    }


# Runtimes
config_runtimes = {
    "real_world_colored_row_wise_time": _runtime_table_config(
        results_path="results/real_world_standardized/",
        latex_path="latex/real_world_experiments_standardized_time.tex",
        datasets=RealWorldDatasets,
        selection="time",
        caption="Total time runtime on real world datasets. (coloring excluded CVDD)",
    ),
    "real_world_colored_row_wise_process_time": _runtime_table_config(
        results_path="results/real_world_standardized/",
        latex_path="latex/real_world_experiments_standardized_process_time.tex",
        datasets=RealWorldDatasets,
        selection="process_time",
        caption="Total process runtime on real world datasets. (coloring excluded CVDD)",
    ),
    "density_colored_row_wise_time": _runtime_table_config(
        results_path="results/density_standardized/",
        latex_path="latex/density_experiments_standardized_time.tex",
        datasets=DensityDatasets,
        selection="time",
        caption="Total time runtime on density datasets. (coloring excluded CVDD)",
    ),
    "density_colored_row_wise_process_time": _runtime_table_config(
        results_path="results/density_standardized/",
        latex_path="latex/density_experiments_standardized_process_time.tex",
        datasets=DensityDatasets,
        selection="process_time",
        caption="Total process runtime on density datasets. (coloring excluded CVDD)",
    ),
}

In [4]:
from src.utils.cluster_algorithms import CLUSTER_ALGORITHMS, CLUSTER_ABBREV

def _clustering_table_config(results_path, latex_path, caption):
    """Assemble one uncoloured table config whose columns are the clustering algorithms.

    Args:
        results_path: Single entry stored in the "paths" list.
        latex_path: Stored under "latex_path".
        caption: Stored under "caption".

    Returns:
        dict: Config over the density datasets with "latex_coloring_axis" set to None.
    """
    return {
        "paths": [results_path],
        "latex_path": latex_path,
        "dataset_names": [dataset.name for dataset in DensityDatasets],
        "aggregation_funcs": ["mean"],
        "metrics": list(CLUSTER_ALGORITHMS.keys()),
        "metric_abbrev": CLUSTER_ABBREV,
        # "lower_is_better": METRICS,
        "selection": ["value"],
        "caption": caption,
        "categories": [],
        "latex_coloring_axis": None,
        # "latex_coloring_selection": RUNTIME_METRICS,
    }


config_clusterings = {
    # Clusterings
    "density_standardized_colored_row_wise_clusterings": _clustering_table_config(
        results_path="clustering_results2/density_standardized/",
        latex_path="latex/density_standardized_clusterings.tex",
        caption="DISCO values on different Clusterings",
    ),
    # ARI
    "density_standardized_colored_row_wise_clusterings_ari": _clustering_table_config(
        results_path="results/ari/density_standardized/",
        latex_path="latex/density_standardized_clusterings_ari.tex",
        caption="ARI values on different Clusterings",
    ),
}

In [9]:
# Row order of the combined table. The blank-line groups have the sizes
# 5 / 6 / 13 / 6 that the "categories" entries of config_final list.
_final_datasets = (
    RealWorldDatasets.htru2,
    RealWorldDatasets.Pendigits,
    RealWorldDatasets.Mice,
    RealWorldDatasets.letterrec,
    RealWorldDatasets.HAR,

    RealWorldDatasets.cmu_faces,
    RealWorldDatasets.Optdigits,
    RealWorldDatasets.USPS,
    RealWorldDatasets.MNIST,
    RealWorldDatasets.KMNIST,
    RealWorldDatasets.FMNIST,

    DensityDatasets.smile1,
    DensityDatasets.dartboard1,
    DensityDatasets.chainlink,
    DensityDatasets.three_spiral,
    DensityDatasets.complex8,
    DensityDatasets.complex9,
    DensityDatasets.compound,
    DensityDatasets.aggregation,
    DensityDatasets.cluto_t8_8k,
    DensityDatasets.cluto_t7_10k,
    DensityDatasets.cluto_t4_8k,
    DensityDatasets.diamond9,
    DensityDatasets.cluto_t5_8k,

    RealWorldDatasets.Synth_high,
    RealWorldDatasets.Synth_low,
    RealWorldDatasets.COIL20,
    RealWorldDatasets.COIL100,
    RealWorldDatasets.Weizmann,
    RealWorldDatasets.Keck,
)

# Plain names of the members above, in the same order.
dataset_names = []
for _dataset in _final_datasets:
    dataset_names.append(_dataset.name)

# Single combined table over both standardized result folders.
_all_experiments_standardized = dict(
    paths=["results/real_world_standardized/", "results/density_standardized/"],
    latex_path="latex/all_experiments_standardized.tex",
    dataset_names=dataset_names,
    aggregation_funcs=["mean", "std"],
    metrics=METRICS,
    lower_is_better=["CVNN", "DCVI", "S_DBW"],
    selection=["value"],
    caption="Evaluating on several datasets (standardized). Column-wise Green.",
    categories=[
        ("htru2", 5, "Tabular data"),
        ("cmu_faces", 6, "Image data"),
        ("smile1", 13, "Tomas Barton Benchmark"),
        ("Synth_high", 6, "High-dimensional"),
    ],
    latex_coloring_axis=0,
    latex_coloring_selection=None,
)

config_final = {"all_experiments_standardized": _all_experiments_standardized}

In [None]:
from src.utils.latex_pandas import generate_latex_file
from mpire.pool import WorkerPool

# Run generate_latex_file for every selected config in worker processes.
# The context manager terminates the pool on exit, so the workers are cleaned
# up even when one of the tasks raises (previously they were left running).
with WorkerPool(n_jobs=30, use_dill=True) as pool:
    # pool.map_unordered(generate_latex_file, configs.values())
    # pool.map_unordered(generate_latex_file, config_runtimes.values())
    pool.map_unordered(generate_latex_file, config_clusterings.values())
    pool.map_unordered(generate_latex_file, config_final.values())
    # Wait for the workers to finish before the pool is terminated on exit.
    pool.stop_and_join()

In [13]:
from src.utils.latex_pandas import generate_latex_file

# Call the generator directly (no worker pool) for the combined table only,
# passing the config entries as keyword arguments.
# NOTE(review): the recorded output of this cell is the KeyError shown below;
# it is raised inside generate_latex_file, which is not visible in this notebook.
generate_latex_file(**config_final["all_experiments_standardized"])

KeyError: "None of [Index([                         ('D', 'I', 'S', 'C', 'O'),\n                                     ('D', 'B', 'C', 'V'),\n                                     ('D', 'C', 'S', 'I'),\n                                     ('L', 'C', 'C', 'V'),\n                 ('V', 'I', 'A', 'S', 'C', 'K', 'D', 'E'),\n                                     ('C', 'V', 'D', 'D'),\n                                     ('C', 'D', 'B', 'W'),\n                                     ('C', 'V', 'N', 'N'),\n       ('S', 'I', 'L', 'H', 'O', 'U', 'E', 'T', 'T', 'E'),\n                                ('S', '_', 'D', 'B', 'W')],\n      dtype='object', name='\\textbf{Dataset} ')] are in the [index]"

In [None]:
\begin{table}
\centering
\caption{ARI values on different Clusterings}
\resizebox{1\linewidth}{!}{
\begin{tabular}{l|cccccccc}
\toprule
\textbf{Dataset} & \parbox[c]{2mm}{\rotatebox[origin=c]{70}{GT}} & \parbox[c]{2mm}{\rotatebox[origin=c]{70}{DBSCAN}} & \parbox[c]{2mm}{\rotatebox[origin=c]{70}{HDBSCAN}} & \parbox[c]{2mm}{\rotatebox[origin=c]{70}{DPC}} & \parbox[c]{2mm}{\rotatebox[origin=c]{70}{SC}} & \parbox[c]{2mm}{\rotatebox[origin=c]{70}{Aggl.}} & \parbox[c]{2mm}{\rotatebox[origin=c]{70}{MeanShift}} & \parbox[c]{2mm}{\rotatebox[origin=c]{70}{KMeans}} \\

In [74]:
import pandas as pd
import numpy as np

# Scratch cell: read a generated LaTeX table back in, recompute a colour
# saturation for every cell and rebuild the "a & b & c" table rows.

# --- Parameters ---------------------------------------------------------------
path = "latex/all_experiments_standardized.tex"
axis = 0  # 0: extremes per column, 1: per row, None: over the whole table
skiprows = 10  # leading lines of the file that are skipped before the header row
lower_is_better = []  # columns for which smaller values get the stronger colour

# --- Parse the table ----------------------------------------------------------
# "&" is the LaTeX column separator; the last three lines of the file are ignored.
df = pd.read_csv(
    path, sep="&", header=0, index_col=0, skiprows=skiprows, skipfooter=3, engine="python"
)
# NOTE(review): discards the first non-index column - confirm what it holds.
df = df.drop(df.columns[0], axis=1)
if None in df.index:
    df = df.drop(index=[None], axis=0)
if "\\midrule" in df.index:
    df = df.drop(index=["\\midrule"], axis=0)
# regex=False: the backslash is a literal character here, independent of the
# pandas-version-specific default of `regex`.
df.columns = df.columns.str.replace("\\", "", regex=False)
df.columns = df.columns.str.strip()

# df_std keeps only the "\pm ..." part of every "$value \pm std$" cell,
# df keeps only the leading value and is converted to float.
df_std = df.copy()
df_std = df_std.replace(r"\$(.*?) ?(\\pm.*?)?\$(.*\\\\)?", value=r"\2", regex=True)
df_std = df_std.replace(r" $", value="", regex=True)
df = df.replace(r"\$(.*?)( \\pm.*?)?\$(.*\\\\)?", value=r"\1", regex=True)
df = df.astype(float)

metric_selection = df.columns
df_selected = df[metric_selection]

is_positive = df_selected > 0
is_negative = df_selected < 0

# --- Extremes, broadcast to the shape of the table ----------------------------
if axis is None:
    # min().min() yields the table-wide scalar on every pandas version.
    lowest = df_selected.min(skipna=True).min(skipna=True)
    highest = df_selected.max(skipna=True).max(skipna=True)
else:
    lowest = np.expand_dims(df_selected.min(axis=axis, skipna=True).values, axis=axis)
    highest = np.expand_dims(df_selected.max(axis=axis, skipna=True).values, axis=axis)

df_min_negatives = pd.DataFrame(
    np.broadcast_to(lowest, df.shape).copy(), index=df.index, columns=df.columns
)
df_max_positives = pd.DataFrame(
    np.broadcast_to(highest, df.shape).copy(), index=df.index, columns=df.columns
)
# Same extremes, but zeroed wherever the cell itself has the opposite sign.
df_min_positives = df_min_negatives.mask(is_negative, 0)
df_max_negatives = df_max_positives.mask(is_positive, 0)

# --- Colour saturation ----------------------------------------------------------
# The values are written with DataFrame.mask. The previous chained form
# `frame.loc[:, cols][mask] = ...` assigned into a temporary copy, which left
# every saturation at its initial 0.
range_positive = df_max_positives - df_min_positives
range_negative = df_max_negatives - df_min_negatives

higher_is_better = df.columns
df_color_saturation = pd.DataFrame(0.0, index=df.index, columns=df.columns)
df_color_saturation = df_color_saturation.mask(
    is_positive, (df - df_min_positives) / range_positive
)
df_color_saturation = df_color_saturation.mask(
    is_negative, (df_max_negatives - df) / range_negative
)

# Columns listed in lower_is_better override the defaults with the inverted scale.
lower_is_better = [metric for metric in lower_is_better if metric in df.columns]
is_lower_column = df.columns.isin(lower_is_better)
df_color_saturation = df_color_saturation.mask(
    is_positive & is_lower_column, (df_max_positives - df) / range_positive
)
df_color_saturation = df_color_saturation.mask(
    is_negative & is_lower_column, (df - df_max_negatives) / range_negative
)

# Scale |saturation| from [0, 1] to 5..70; undefined ratios (0 / 0) become 0.
df_color_saturation = df_color_saturation.abs() * 65 + 5
df_color_saturation = df_color_saturation.fillna(0).astype(int)


# --- Rebuild the LaTeX rows -----------------------------------------------------
def _with_cellcolor(value, color_saturation):
    """Prefix each cell of one column with its cellcolor command (Green for >= 0, else Red)."""
    color = value.apply(lambda x: "Green" if float(x) >= 0 else "Red")
    return "\\cellcolor{" + color + "!" + color_saturation + r"} $" + value


df_latex = df.astype(str).combine(df_color_saturation.astype(str), _with_cellcolor)
# Re-attach the "\pm ..." part, drop a trailing space and close the math mode.
df_latex = df_latex + df_std
df_latex = df_latex.replace(r" $", value="", regex=True)
df_latex = df_latex + "$"
df_latex.insert(0, "dataset", df_latex.index.str.strip())
df_joined_columns = df_latex[df_latex.columns[:]].apply(lambda x: " & ".join(x), axis=1)
# Double every backslash in the rows and in the index labels.
df_joined_columns = df_joined_columns.replace("\\\\", "\\\\\\\\", regex=True)
df_joined_columns.index = df_joined_columns.index.str.replace("\\", "\\\\", regex=False)

In [75]:
# Write the joined table rows to a file named "test" (relative to the current
# working directory) using pandas' default CSV settings.
df_joined_columns.to_csv("test")