In [38]:
%load_ext autoreload
%autoreload 2

import os
import sys

%store -r DISCO_ROOT_FOLDER
if "DISCO_ROOT_FOLDER" in globals():
    os.chdir(DISCO_ROOT_FOLDER)
    sys.path.append(DISCO_ROOT_FOLDER)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [39]:
from datasets.real_world_datasets import Datasets as RealWorldDatasets
from datasets.density_datasets import Datasets as DensityDatasets
# from src.utils.metrics import METRICS, SELECTED_METRICS
from src.utils.metrics import METRIC_ABBREV_PLAIN

# Metric identifiers shown in the tables, in column order.
# Entries that are commented out are currently disabled.
METRICS = [
    "DISCO",
    # "DC_DUNN",
    # --- Competitors ---
    "DBCV",
    # "DBCV_eucl",
    "DCSI",
    "LCCV",
    "VIASCKDE",
    "CVDD",
    "CDBW",
    "CVNN",
    # "DSI",
    # --- Gauss ---
    "SILHOUETTE",
    "S_DBW",
    # "DUNN",
    # "DB",
    # "CH",
]

# Same metrics without CVDD; used as the coloring selection of the runtime tables.
RUNTIME_METRICS = [metric for metric in METRICS if metric != "CVDD"]

def _real_world_categories():
    """Return a fresh list of the category tuples used by the real-world tables.

    NOTE(review): each tuple looks like (first dataset, number of rows, group
    label) - confirm against `generate_latex_file`.
    """
    return [
        ("Synth_low", 8, "Tabular data"),
        ("Weizmann", 2, "Video"),
        ("COIL20", 3, "Image"),
        ("Optdigits", 5, "MNIST"),
    ]


def _value_table_config(
    paths,
    latex_path,
    datasets,
    aggregation_funcs,
    caption,
    categories,
    latex_coloring_axis=0,
    latex_coloring_selection=None,
):
    """Assemble the settings dict for one table built from the "value" selection.

    Only the arguments vary between tables; `metrics`, `lower_is_better` and
    `selection` are identical for all of them. `datasets` is iterated and the
    `.name` of every member becomes an entry of `dataset_names`.
    """
    return {
        "paths": paths,
        "latex_path": latex_path,
        "dataset_names": [dataset.name for dataset in datasets],
        "aggregation_funcs": aggregation_funcs,
        "metrics": METRICS,
        "lower_is_better": ["CVNN", "DCVI", "S_DBW"],
        "selection": ["value"],
        "caption": caption,
        "categories": categories,
        "latex_coloring_axis": latex_coloring_axis,
        "latex_coloring_selection": latex_coloring_selection,
    }


# One entry per generated table. Each value is a set of settings for
# `generate_latex_file`.
configs = {
    # Real World Datasets
    "real_world_colored_column_wise": _value_table_config(
        paths=["results/real_world/"],
        latex_path="latex/real_world_experiments.tex",
        datasets=RealWorldDatasets,
        aggregation_funcs=["mean"],
        caption="Evaluating on real-world datasets. Column-wise Green.",
        categories=_real_world_categories(),
    ),
    "real_world_standardized_colored_column_wise": _value_table_config(
        paths=["results/real_world_standardized/"],
        latex_path="latex/real_world_experiments_standardized.tex",
        datasets=RealWorldDatasets,
        aggregation_funcs=["mean"],
        caption="Evaluating on real-world datasets (standardized). Column-wise Green.",
        categories=_real_world_categories(),
    ),
    # Disabled: row-wise coloring restricted to a selection of metrics.
    # "real_world_colored_row_wise_selected": _value_table_config(
    #     paths=["results/real_world/"],
    #     latex_path="latex/real_world_experiments (selected).tex",
    #     datasets=RealWorldDatasets,
    #     aggregation_funcs=["mean"],
    #     caption="Evaluating on real-world datasets. Row-wise Green of selected ones.",
    #     categories=_real_world_categories(),
    #     latex_coloring_axis=1,
    #     latex_coloring_selection=["DISCO", "DBCV", "DCSI", "S_DBW", "DSI", "SILHOUETTE", "DUNN"],
    # ),
    # "real_world_standardized_colored_row_wise_selected": _value_table_config(
    #     paths=["results/real_world_standardized/"],
    #     latex_path="latex/real_world_experiments_standardized (selected).tex",
    #     datasets=RealWorldDatasets,
    #     aggregation_funcs=["mean"],
    #     caption="Evaluating on real-world datasets (standardized). Row-wise Green of selected ones.",
    #     categories=_real_world_categories(),
    #     latex_coloring_axis=1,
    #     latex_coloring_selection=["DISCO", "DBCV", "DCSI", "S_DBW", "DSI", "SILHOUETTE", "DUNN"],
    # ),
    # Density Datasets
    "density_colored_column_wise": _value_table_config(
        paths=["results/density/"],
        latex_path="latex/density_experiments.tex",
        datasets=DensityDatasets,
        aggregation_funcs=["mean", "std"],
        caption="Evaluating on density datasets. Column-wise Green.",
        categories=[],
    ),
    "density_standardized_colored_column_wise": _value_table_config(
        paths=["results/density_standardized/"],
        latex_path="latex/density_experiments_standardized.tex",
        datasets=DensityDatasets,
        aggregation_funcs=["mean", "std"],
        caption="Evaluating on density datasets (standardized). Column-wise Green.",
        categories=[],
    ),
    # Disabled: row-wise coloring restricted to a selection of metrics.
    # "density_colored_row_wise_selected": _value_table_config(
    #     paths=["results/density/"],
    #     latex_path="latex/density_experiments (selected).tex",
    #     datasets=DensityDatasets,
    #     aggregation_funcs=["mean", "std"],
    #     caption="Evaluating on density datasets. Row-wise Green of selected ones.",
    #     categories=[],
    #     latex_coloring_axis=1,
    #     latex_coloring_selection=["DISCO", "DBCV", "DCSI", "S_DBW", "DSI", "SILHOUETTE", "DUNN"],
    # ),
    # "density_standardized_colored_row_wise_selected": _value_table_config(
    #     paths=["results/density_standardized/"],
    #     latex_path="latex/density_experiments_standardized (selected).tex",
    #     datasets=DensityDatasets,
    #     aggregation_funcs=["mean", "std"],
    #     caption="Evaluating on density datasets (standardized). Row-wise Green of selected ones.",
    #     categories=[],
    #     latex_coloring_axis=1,
    #     latex_coloring_selection=["DISCO", "DBCV", "DCSI", "S_DBW", "DSI", "SILHOUETTE", "DUNN"],
    # ),
}

In [40]:
def _runtime_table_config(paths, latex_path, datasets, selection, caption):
    """Assemble the settings dict for one runtime table.

    `selection` is the single result field to tabulate ("time" or
    "process_time" in the tables below). All metrics are listed as
    lower-is-better, and coloring is row-wise (axis 1) over RUNTIME_METRICS,
    i.e. every metric except CVDD.
    """
    return {
        "paths": paths,
        "latex_path": latex_path,
        "dataset_names": [dataset.name for dataset in datasets],
        "aggregation_funcs": ["mean", "std"],
        "metrics": METRICS,
        "metric_abbrev": METRIC_ABBREV_PLAIN,
        "lower_is_better": METRICS,
        "selection": [selection],
        "caption": caption,
        "categories": [],
        "latex_coloring_axis": 1,
        "latex_coloring_selection": RUNTIME_METRICS,
    }


# Runtime tables, all read from the standardized result folders.
config_runtimes = {
    "real_world_colored_row_wise_time": _runtime_table_config(
        paths=["results/real_world_standardized/"],
        latex_path="latex/real_world_experiments_standardized_time.tex",
        datasets=RealWorldDatasets,
        selection="time",
        caption="Total time runtime on real world datasets. (coloring excluded CVDD)",
    ),
    "real_world_colored_row_wise_process_time": _runtime_table_config(
        paths=["results/real_world_standardized/"],
        latex_path="latex/real_world_experiments_standardized_process_time.tex",
        datasets=RealWorldDatasets,
        selection="process_time",
        caption="Total process runtime on real world datasets. (coloring excluded CVDD)",
    ),
    "density_colored_row_wise_time": _runtime_table_config(
        paths=["results/density_standardized/"],
        latex_path="latex/density_experiments_standardized_time.tex",
        datasets=DensityDatasets,
        selection="time",
        caption="Total time runtime on density datasets. (coloring excluded CVDD)",
    ),
    "density_colored_row_wise_process_time": _runtime_table_config(
        paths=["results/density_standardized/"],
        latex_path="latex/density_experiments_standardized_process_time.tex",
        datasets=DensityDatasets,
        selection="process_time",
        caption="Total process runtime on density datasets. (coloring excluded CVDD)",
    ),
}

In [41]:
from src.utils.cluster_algorithms import CLUSTER_ALGORITHMS, CLUSTER_ABBREV

def _clustering_table_config(paths, latex_path, caption):
    """Assemble the settings dict for one table with a column per clustering algorithm.

    The "metrics" entry holds the keys of CLUSTER_ALGORITHMS, abbreviated via
    CLUSTER_ABBREV. No coloring is applied (`latex_coloring_axis` is None), and
    neither `lower_is_better` nor `latex_coloring_selection` is passed.
    """
    return {
        "paths": paths,
        "latex_path": latex_path,
        "dataset_names": [dataset.name for dataset in DensityDatasets],
        "aggregation_funcs": ["mean"],
        "metrics": list(CLUSTER_ALGORITHMS.keys()),
        "metric_abbrev": CLUSTER_ABBREV,
        "selection": ["value"],
        "caption": caption,
        "categories": [],
        "latex_coloring_axis": None,
    }


config_clusterings = {
    # Clusterings
    "density_standardized_colored_row_wise_clusterings": _clustering_table_config(
        paths=["clustering_results2/density_standardized/"],
        latex_path="latex/density_standardized_clusterings.tex",
        caption="DISCO values on different Clusterings",
    ),
    # ARI
    "density_standardized_colored_row_wise_clusterings_ari": _clustering_table_config(
        paths=["results/ari/density_standardized/"],
        latex_path="latex/density_standardized_clusterings_ari.tex",
        caption="ARI values on different Clusterings",
    ),
}

In [42]:
# Datasets included in the combined table, in the order listed here.
# Entries that are commented out are currently excluded.
_final_table_datasets = (
    RealWorldDatasets.htru2,
    # RealWorldDatasets.Pendigits,
    # RealWorldDatasets.Mice,
    # RealWorldDatasets.letterrec,
    RealWorldDatasets.HAR,
    # RealWorldDatasets.cmu_faces,
    # RealWorldDatasets.Optdigits,
    # RealWorldDatasets.USPS,
    RealWorldDatasets.MNIST,
    # RealWorldDatasets.KMNIST,
    # RealWorldDatasets.FMNIST,
    DensityDatasets.smile1,
    DensityDatasets.dartboard1,
    DensityDatasets.chainlink,
    DensityDatasets.three_spiral,
    DensityDatasets.complex8,
    DensityDatasets.complex9,
    DensityDatasets.compound,
    DensityDatasets.aggregation,
    DensityDatasets.cluto_t8_8k,
    DensityDatasets.cluto_t7_10k,
    DensityDatasets.cluto_t4_8k,
    DensityDatasets.diamond9,
    # DensityDatasets.cluto_t5_8k,
    RealWorldDatasets.Synth_high,
    RealWorldDatasets.Synth_low,
    RealWorldDatasets.COIL20,
    # RealWorldDatasets.COIL100,
    RealWorldDatasets.Weizmann,
    RealWorldDatasets.Keck,
)

# The table settings take plain names, not enum members.
dataset_names = [member.name for member in _final_table_datasets]

# Single combined table over the standardized real-world and density results.
config_final = dict(
    all_experiments_standardized=dict(
        paths=["results/real_world_standardized/", "results/density_standardized/"],
        latex_path="latex/all_experiments_standardized.tex",
        dataset_names=dataset_names,
        aggregation_funcs=["mean"],
        metrics=METRICS,
        lower_is_better=["CVNN", "DCVI", "S_DBW"],
        selection=["value"],
        caption="Evaluating on several datasets (standardized). Column-wise Green.",
        # categories=[
        #     ("htru2", 3, "Tabular data"),
        #     # ("cmu_faces", 6, "Image data"),
        #     ("smile1", 12, "Tomas Barton Benchmark"),
        #     ("Synth_high", 6, "High-dimensional"),
        # ],
        latex_coloring_axis=0,
        latex_coloring_selection=None,
    ),
)

In [43]:
from src.utils.latex_pandas import generate_latex_file
from mpire.pool import WorkerPool

# Generate the LaTeX tables in parallel worker processes.
# The context manager terminates the pool even when a task raises, so a failed
# run does not leave worker processes behind.
with WorkerPool(n_jobs=30, use_dill=True) as pool:
    # Uncomment to also regenerate the per-suite value tables and runtime tables.
    # pool.map_unordered(generate_latex_file, configs.values())
    # pool.map_unordered(generate_latex_file, config_runtimes.values())
    pool.map_unordered(generate_latex_file, config_clusterings.values())
    pool.map_unordered(generate_latex_file, config_final.values())
    pool.stop_and_join()

Generated: `latex/density_standardized_clusterings_ari.tex`
Generated: `latex/density_standardized_clusterings.tex`
Generated: `latex/all_experiments_standardized.tex`


In [160]:
from src.utils.latex_pandas import generate_latex_file

# Regenerate only the combined table, directly in this process (no worker pool).
final_table_kwargs = config_final["all_experiments_standardized"]
generate_latex_file(**final_table_kwargs)

Generated: `latex/all_experiments_standardized.tex`


In [44]:
from src.utils.latex_pandas import gather_and_aggregate_data

# df1: mean "value" per result under the ARI folder (the table captioned
# "ARI values on different Clusterings").
df1 = gather_and_aggregate_data(
    ["results/ari/density_standardized/"],
    ["value"],
    aggregation_funcs=["mean"],
)
# df2: mean "value" per result under the clustering folder (the table captioned
# "DISCO values on different Clusterings").
df2 = gather_and_aggregate_data(
    ["clustering_results2/density_standardized/"],
    ["value"],
    aggregation_funcs=["mean"],
)

In [45]:
# Datasets left out of the comparison between df1 and df2.
_EXCLUDED_DATASETS = ["cluto-t4-8k", "cluto-t5-8k", "cluto-t7-10k", "cluto-t8-8k"]


def _flatten_for_comparison(frame, column_order, excluded=_EXCLUDED_DATASETS):
    """Return a flat copy of `frame` with rows removed and columns reordered.

    Args:
        frame: DataFrame as returned by `gather_and_aggregate_data`, or one that
            was already flattened by this function.
        column_order: Column labels in the desired order.
        excluded: Row labels to remove.

    Returns:
        A new DataFrame without the `excluded` rows, whose row and column labels
        are reduced to their first index level and whose columns follow
        `column_order`.

    Labels that are already absent are skipped (`errors="ignore"`), so the cell
    can be re-run on its own output instead of failing with a KeyError.
    """
    flat = frame.drop(list(excluded), errors="ignore")
    flat.columns = flat.columns.get_level_values(0)
    flat.index = flat.index.get_level_values(0)
    return flat.reindex(columns=list(column_order))


_algorithm_order = list(CLUSTER_ALGORITHMS.keys())
df1 = _flatten_for_comparison(df1, _algorithm_order)
df2 = _flatten_for_comparison(df2, _algorithm_order)

In [46]:
from scipy.stats import pearsonr
import numpy as np

# Convert once; both frames were given the same column order in the previous cell.
df1_values = df1.to_numpy()
df2_values = df2.to_numpy()

# axis=None: a single coefficient over all cells of the two tables.
overall_r, _ = pearsonr(df1_values, df2_values, axis=None)
print(round(overall_r, 2))

# axis=0 yields one coefficient per column, axis=1 one per row.
for axis in (0, 1):
    r_along_axis, _ = pearsonr(df1_values, df2_values, axis=axis)
    print(np.round(np.array(r_along_axis), 2))

0.69
[ nan 0.48 0.25 0.61 0.6  0.75 0.96 0.71]
[ 1.   -0.13  0.98  0.89  0.35  0.6   0.92  0.95  0.88]


  print(np.round(np.array(list(pearsonr(df1.to_numpy(), df2.to_numpy(), axis=0))[0]), 2))


In [132]:
# Display df1, the table loaded from "results/ari/density_standardized/".
df1

Unnamed: 0,GroundTruth,DBSCAN,HDBSCAN,DPC,SpectralClustering,Agglomerative,MeanShift,KMeans
3-spiral,1.0,1.0,0.942273,1.0,-0.004251,-0.003836,0.0,-0.005799
aggregation,1.0,0.974505,0.808943,0.794592,0.823621,0.701754,0.628476,0.714612
chainlink,1.0,1.0,1.0,0.201054,0.503609,-0.001,0.0,0.06409
complex8,1.0,0.939442,0.872419,0.239948,0.437614,0.511867,0.230167,0.465176
complex9,1.0,0.999518,0.261933,0.072852,0.361618,0.327539,0.182629,0.360767
compound,1.0,0.57387,0.811288,0.740208,0.561191,0.501062,0.722306,0.52715
dartboard1,1.0,1.0,1.0,0.009188,-0.002954,0.041462,0.0,-0.002991
diamond9,1.0,0.875371,0.743204,0.127495,0.964994,0.996266,0.147651,0.961846
smile1,1.0,1.0,1.0,0.332665,0.636015,0.568858,0.330534,0.545919


In [None]:
# Row-wise Pearson correlations, as printed by the `pearsonr(..., axis=1)` call above:
# [ 1.   -0.13  0.98  0.89  0.35  0.6   0.92  0.95  0.88]