In [3]:
%load_ext autoreload
%autoreload 2

import os
import sys

%store -r DISCO_ROOT_FOLDER
if "DISCO_ROOT_FOLDER" in globals():
    os.chdir(DISCO_ROOT_FOLDER)
    sys.path.append(DISCO_ROOT_FOLDER)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
from datasets.real_world_datasets import Datasets as RealWorldDatasets
from datasets.density_datasets import Datasets as DensityDatasets
# from src.utils.metrics import METRICS, SELECTED_METRICS
from src.utils.metrics import METRIC_ABBREV_PLAIN

METRICS = [
    "DISCO",
    # "DC_DUNN",
    ### Competitors
    "DBCV",
    # "DBCV_eucl",
    "DCSI",
    "LCCV",
    "VIASCKDE",
    "CVDD",
    "CDBW",
    "CVNN",
    # "DSI",
    ### Gauss
    "SILHOUETTE",
    "S_DBW",
    # "DUNN",
    # "DB",
    # "CH",
]

RUNTIME_METRICS = METRICS.copy()
RUNTIME_METRICS.remove("CVDD")

configs = {
    # Real World Datasets
    "real_world_colored_column_wise": {
        "paths": ["results/real_world/"],
        "latex_path": "latex/real_world_experiments.tex",
        "dataset_names": [dataset.name for dataset in RealWorldDatasets],
        "aggregation_funcs": ["mean"],
        "metrics": METRICS,
        "lower_is_better": ["CVNN", "DCVI", "S_DBW"],
        "selection": ["value"],
        "caption": "Evaluating on real-world datasets. Column-wise Green.",
        "categories": [
            ("Synth_low", 8, "Tabular data"),
            ("Weizmann", 2, "Video"),
            ("COIL20", 3, "Image"),
            ("Optdigits", 5, "MNIST"),
        ],
        "latex_coloring_axis": 0,
        "latex_coloring_selection": None,
    },
    "real_world_standardized_colored_column_wise": {
        "paths": ["results/real_world_standardized/"],
        "latex_path": "latex/real_world_experiments_standardized.tex",
        "dataset_names": [dataset.name for dataset in RealWorldDatasets],
        "aggregation_funcs": ["mean"],
        "metrics": METRICS,
        "lower_is_better": ["CVNN", "DCVI", "S_DBW"],
        "selection": ["value"],
        "caption": "Evaluating on real-world datasets (standardized). Column-wise Green.",
        "categories": [
            ("Synth_low", 8, "Tabular data"),
            ("Weizmann", 2, "Video"),
            ("COIL20", 3, "Image"),
            ("Optdigits", 5, "MNIST"),
        ],
        "latex_coloring_axis": 0,
        "latex_coloring_selection": None,
    },
    # "real_world_colored_row_wise_selected": {
    #     "paths": ["results/real_world/"],
    #     "latex_path": "latex/real_world_experiments (selected).tex",
    #     "dataset_names": [dataset.name for dataset in RealWorldDatasets],
    #     "aggregation_funcs": ["mean"],
    #     "metrics": METRICS,
    #     "lower_is_better": ["CVNN", "DCVI", "S_DBW"],
    #     "selection": ["value"],
    #     "caption": "Evaluating on real-world datasets. Row-wise Green of selected ones.",
    #     "categories": [
    #         ("Synth_low", 8, "Tabular data"),
    #         ("Weizmann", 2, "Video"),
    #         ("COIL20", 3, "Image"),
    #         ("Optdigits", 5, "MNIST"),
    #     ],
    #     "latex_coloring_axis": 1,
    #     "latex_coloring_selection": ["DISCO", "DBCV", "DCSI", "S_DBW", "DSI", "SILHOUETTE", "DUNN"],
    # },
    # "real_world_standardized_colored_row_wise_selected": {
    #     "paths": ["results/real_world_standardized/"],
    #     "latex_path": "latex/real_world_experiments_standardized (selected).tex",
    #     "dataset_names": [dataset.name for dataset in RealWorldDatasets],
    #     "aggregation_funcs": ["mean"],
    #     "metrics": METRICS,
    #     "lower_is_better": ["CVNN", "DCVI", "S_DBW"],
    #     "selection": ["value"],
    #     "caption": "Evaluating on real-world datasets (standardized). Row-wise Green of selected ones.",
    #     "categories": [
    #         ("Synth_low", 8, "Tabular data"),
    #         ("Weizmann", 2, "Video"),
    #         ("COIL20", 3, "Image"),
    #         ("Optdigits", 5, "MNIST"),
    #     ],
    #     "latex_coloring_axis": 1,
    #     "latex_coloring_selection": ["DISCO", "DBCV", "DCSI", "S_DBW", "DSI", "SILHOUETTE", "DUNN"],
    # },
    # Density Datasets
    "density_colored_column_wise": {
        "paths": ["results/density/"],
        "latex_path": "latex/density_experiments.tex",
        "dataset_names": [dataset.name for dataset in DensityDatasets],
        "aggregation_funcs": ["mean", "std"],
        "metrics": METRICS,
        "lower_is_better": ["CVNN", "DCVI", "S_DBW"],
        "selection": ["value"],
        "caption": "Evaluating on density datasets. Column-wise Green.",
        "categories": [],
        "latex_coloring_axis": 0,
        "latex_coloring_selection": None,
    },
    "density_standardized_colored_column_wise": {
        "paths": ["results/density_standardized/"],
        "latex_path": "latex/density_experiments_standardized.tex",
        "dataset_names": [dataset.name for dataset in DensityDatasets],
        "aggregation_funcs": ["mean", "std"],
        "metrics": METRICS,
        "lower_is_better": ["CVNN", "DCVI", "S_DBW"],
        "selection": ["value"],
        "caption": "Evaluating on density datasets (standardized). Column-wise Green.",
        "categories": [],
        "latex_coloring_axis": 0,
        "latex_coloring_selection": None,
    },
    # "density_colored_row_wise_selected": {
    #     "paths": ["results/density/"],
    #     "latex_path": "latex/density_experiments (selected).tex",
    #     "dataset_names": [dataset.name for dataset in DensityDatasets],
    #     "aggregation_funcs": ["mean", "std"],
    #     "metrics": METRICS,
    #     "lower_is_better": ["CVNN", "DCVI", "S_DBW"],
    #     "selection": ["value"],
    #     "caption": "Evaluating on density datasets. Row-wise Green of selected ones.",
    #     "categories": [],
    #     "latex_coloring_axis": 1,
    #     "latex_coloring_selection": ["DISCO", "DBCV", "DCSI", "S_DBW", "DSI", "SILHOUETTE", "DUNN"],
    # },
    # "density_standardized_colored_row_wise_selected": {
    #     "paths": ["results/density_standardized/"],
    #     "latex_path": "latex/density_experiments_standardized (selected).tex",
    #     "dataset_names": [dataset.name for dataset in DensityDatasets],
    #     "aggregation_funcs": ["mean", "std"],
    #     "metrics": METRICS,
    #     "lower_is_better": ["CVNN", "DCVI", "S_DBW"],
    #     "selection": ["value"],
    #     "caption": "Evaluating on density datasets (standardized). Row-wise Green of selected ones.",
    #     "categories": [],
    #     "latex_coloring_axis": 1,
    #     "latex_coloring_selection": ["DISCO", "DBCV", "DCSI", "S_DBW", "DSI", "SILHOUETTE", "DUNN"],
    # },
}

In [5]:
config_runtimes = {
    # Runtimes
    "real_world_colored_row_wise_time": {
        "paths": ["results/real_world_standardized/"],
        "latex_path": "latex/real_world_experiments_standardized_time.tex",
        "dataset_names": [dataset.name for dataset in RealWorldDatasets],
        "aggregation_funcs": ["mean", "std"],
        "metrics": METRICS,
        "metric_abbrev": METRIC_ABBREV_PLAIN,
        "lower_is_better": METRICS,
        "selection": ["time"],
        "caption": "Total time runtime on real world datasets. (coloring excluded CVDD)",
        "categories": [],
        "latex_coloring_axis": 1,
        "latex_coloring_selection": RUNTIME_METRICS,
    },
    "real_world_colored_row_wise_process_time": {
        "paths": ["results/real_world_standardized/"],
        "latex_path": "latex/real_world_experiments_standardized_process_time.tex",
        "dataset_names": [dataset.name for dataset in RealWorldDatasets],
        "aggregation_funcs": ["mean", "std"],
        "metrics": METRICS,
        "metric_abbrev": METRIC_ABBREV_PLAIN,
        "lower_is_better": METRICS,
        "selection": ["process_time"],
        "caption": "Total process runtime on real world datasets. (coloring excluded CVDD)",
        "categories": [],
        "latex_coloring_axis": 1,
        "latex_coloring_selection": RUNTIME_METRICS,
    },
    "density_colored_row_wise_time": {
        "paths": ["results/density_standardized/"],
        "latex_path": "latex/density_experiments_standardized_time.tex",
        "dataset_names": [dataset.name for dataset in DensityDatasets],
        "aggregation_funcs": ["mean", "std"],
        "metrics": METRICS,
        "metric_abbrev": METRIC_ABBREV_PLAIN,
        "lower_is_better": METRICS,
        "selection": ["time"],
        "caption": "Total time runtime on density datasets. (coloring excluded CVDD)",
        "categories": [],
        "latex_coloring_axis": 1,
        "latex_coloring_selection": RUNTIME_METRICS,
    },
    "density_colored_row_wise_process_time": {
        "paths": ["results/density_standardized/"],
        "latex_path": "latex/density_experiments_standardized_process_time.tex",
        "dataset_names": [dataset.name for dataset in DensityDatasets],
        "aggregation_funcs": ["mean", "std"],
        "metrics": METRICS,
        "metric_abbrev": METRIC_ABBREV_PLAIN,
        "lower_is_better": METRICS,
        "selection": ["process_time"],
        "caption": "Total process runtime on density datasets. (coloring excluded CVDD)",
        "categories": [],
        "latex_coloring_axis": 1,
        "latex_coloring_selection": RUNTIME_METRICS,
    },
}

In [6]:
from src.utils.cluster_algorithms import CLUSTER_ALGORITHMS, CLUSTER_ABBREV

config_clusterings = {
    # Clusterings
    "density_standardized_colored_row_wise_clusterings": {
        "paths": ["clustering_results2/density_standardized/"],
        "latex_path": "latex/density_standardized_clusterings.tex",
        "dataset_names": [dataset.name for dataset in DensityDatasets],
        "aggregation_funcs": ["mean"],
        "metrics": list(CLUSTER_ALGORITHMS.keys()),
        "metric_abbrev": CLUSTER_ABBREV,
        # "lower_is_better": METRICS,
        "selection": ["value"],
        "caption": "DISCO values on different Clusterings",
        "categories": [],
        "latex_coloring_axis": None,
        # "latex_coloring_selection": RUNTIME_METRICS,
    },
    # ARI
    "density_standardized_colored_row_wise_clusterings_ari": {
        "paths": ["results/ari/density_standardized/"],
        "latex_path": "latex/density_standardized_clusterings_ari.tex",
        "dataset_names": [dataset.name for dataset in DensityDatasets],
        "aggregation_funcs": ["mean"],
        "metrics": list(CLUSTER_ALGORITHMS.keys()),
        "metric_abbrev": CLUSTER_ABBREV,
        # "lower_is_better": METRICS,
        "selection": ["value"],
        "caption": "ARI values on different Clusterings",
        "categories": [],
        "latex_coloring_axis": None,
        # "latex_coloring_selection": RUNTIME_METRICS,
    },
}

In [7]:
dataset_names = [
    RealWorldDatasets.htru2,
    # RealWorldDatasets.Pendigits,
    # RealWorldDatasets.Mice,
    # RealWorldDatasets.letterrec,
    RealWorldDatasets.HAR,
    # RealWorldDatasets.cmu_faces,
    # RealWorldDatasets.Optdigits,
    # RealWorldDatasets.USPS,
    RealWorldDatasets.MNIST,
    # RealWorldDatasets.KMNIST,
    # RealWorldDatasets.FMNIST,
    DensityDatasets.smile1,
    DensityDatasets.dartboard1,
    DensityDatasets.chainlink,
    DensityDatasets.three_spiral,
    DensityDatasets.complex8,
    DensityDatasets.complex9,
    DensityDatasets.compound,
    DensityDatasets.aggregation,
    DensityDatasets.cluto_t8_8k,
    DensityDatasets.cluto_t7_10k,
    DensityDatasets.cluto_t4_8k,
    DensityDatasets.diamond9,
    # DensityDatasets.cluto_t5_8k,
    RealWorldDatasets.Synth_high,
    RealWorldDatasets.Synth_low,
    RealWorldDatasets.COIL20,
    # RealWorldDatasets.COIL100,
    RealWorldDatasets.Weizmann,
    RealWorldDatasets.Keck,
]

dataset_names = [dataset.name for dataset in dataset_names]

config_final = {
    "all_experiments_standardized": {
        "paths": ["results/real_world_standardized/", "results/density_standardized/"],
        "latex_path": "latex/all_experiments_standardized.tex",
        "dataset_names": dataset_names,
        "aggregation_funcs": ["mean"],
        "metrics": METRICS,
        "lower_is_better": ["CVNN", "DCVI", "S_DBW"],
        "selection": ["value"],
        "caption": "Evaluating on several datasets (standardized). Column-wise Green.",
        # "categories": [
        #     ("htru2", 3, "Tabular data"),
        #     # ("cmu_faces", 6, "Image data"),
        #     ("smile1", 12, "Tomas Barton Benchmark"),
        #     ("Synth_high", 6, "High-dimensional"),
        # ],
        "latex_coloring_axis": 0,
        "latex_coloring_selection": None,
    },
}

In [43]:
from src.utils.latex_pandas import generate_latex_file
from mpire.pool import WorkerPool

pool = WorkerPool(n_jobs=30, use_dill=True)
# pool.map_unordered(generate_latex_file, configs.values())
# pool.map_unordered(generate_latex_file, config_runtimes.values())
pool.map_unordered(generate_latex_file, config_clusterings.values())
pool.map_unordered(generate_latex_file, config_final.values())
pool.stop_and_join()
pool.terminate()

Generated: `latex/density_standardized_clusterings_ari.tex`
Generated: `latex/density_standardized_clusterings.tex`
Generated: `latex/all_experiments_standardized.tex`


In [160]:
from src.utils.latex_pandas import generate_latex_file

generate_latex_file(**config_final["all_experiments_standardized"])

Generated: `latex/all_experiments_standardized.tex`


In [102]:
from src.utils.latex_pandas import gather_and_aggregate_data

df_ari = gather_and_aggregate_data(["results/ari/density_standardized/"], ["value"], aggregation_funcs=["mean"])
df_ari = df_ari.drop(["cluto-t4-8k", "cluto-t5-8k", "cluto-t7-10k", "cluto-t8-8k"])
df_ari.columns = df_ari.columns.get_level_values(0)
df_ari.index = df_ari.index.get_level_values(0)
df_ari = df_ari.reindex(columns=df_ari.columns.reindex(list(CLUSTER_ALGORITHMS.keys()))[0])
df_ari

Unnamed: 0,GroundTruth,DBSCAN,HDBSCAN,DPC,SpectralClustering,Agglomerative,KMeans,Random_k,Random_100
3-spiral,1.0,1.0,0.942517,1.0,-0.004251,-0.003836,-0.005799,,
aggregation,1.0,0.974067,0.808943,0.787216,0.823621,0.701754,0.714612,,
chainlink,1.0,1.0,1.0,0.201572,0.503609,-0.001,0.06409,,
complex8,1.0,0.939437,0.870157,0.311191,0.437614,0.511867,0.465176,,
complex9,1.0,0.999518,0.248785,0.198642,0.361618,0.327539,0.360767,,
compound,1.0,0.610912,0.811086,0.740208,0.561191,0.501062,0.52715,,
dartboard1,1.0,1.0,1.0,0.008476,-0.002954,0.041462,-0.002991,,
diamond9,1.0,0.875442,0.769478,0.151082,0.964994,0.996266,0.961846,,
smile1,1.0,1.0,1.0,0.332665,0.636015,0.568858,0.545919,,


In [103]:
df_disco = gather_and_aggregate_data(["clustering_results2/density_standardized/"], ["value"], aggregation_funcs=["mean"])
df_disco = df_disco.drop(["cluto-t4-8k", "cluto-t5-8k", "cluto-t7-10k", "cluto-t8-8k"])
df_disco.columns = df_disco.columns.get_level_values(0)
df_disco.index = df_disco.index.get_level_values(0)
df_disco = df_disco.reindex(columns=df_disco.columns.reindex(list(CLUSTER_ALGORITHMS.keys()))[0])
df_disco

Unnamed: 0,GroundTruth,DBSCAN,HDBSCAN,DPC,SpectralClustering,Agglomerative,KMeans,Random_k,Random_100
3-spiral,0.587948,0.587948,0.489028,0.587948,-0.002035,0.002716,-0.00171,,
aggregation,0.305651,0.29802,0.669705,0.498282,0.277962,0.214675,0.187767,,
chainlink,0.835047,0.835047,0.835047,-0.012833,0.479781,0.005968,0.077678,,
complex8,0.38996,0.391872,0.365744,-0.098652,0.040382,0.069898,0.037661,,
complex9,0.357815,0.357788,0.640266,-0.200613,-0.005967,0.001961,0.046401,,
compound,0.347585,0.321406,0.446946,0.591612,0.164638,0.10248,0.138581,,
dartboard1,0.874343,0.874343,0.874343,0.479126,-0.006421,-0.069412,-0.004718,,
diamond9,0.217659,0.208595,0.166034,-0.081264,0.20882,0.217166,0.20352,,
smile1,0.900074,0.900074,0.900074,0.525588,0.431486,0.368517,0.335319,,


In [None]:
from scipy.stats import pearsonr
import numpy as np

print(round(pearsonr(df_ari.to_numpy(), df_disco.to_numpy(), axis=None)[0],2))
# 0.69

print(np.round(np.array(list(pearsonr(df1.to_numpy(), df2.to_numpy(), axis=0))[0]), 2))
# [ nan 0.48 0.25 0.61 0.6  0.75 0.96 0.71]

print(np.round(np.array(list(pearsonr(df_ari.to_numpy(), df_disco.to_numpy(), axis=1))[0]), 2))
# [ 1.   -0.13  0.98  0.89  0.35  0.6   0.92  0.95  0.88]

0.69
[ nan 0.48 0.25 0.61 0.6  0.75 0.96 0.71]
[ 1.   -0.13  0.98  0.89  0.35  0.6   0.92  0.95  0.88]


  print(np.round(np.array(list(pearsonr(df1.to_numpy(), df2.to_numpy(), axis=0))[0]), 2))


In [10]:
import numpy as np
import pandas as pd
import glob
import re
from ast import literal_eval

from os.path import exists
from collections import defaultdict

from sklearn.metrics import (
    normalized_mutual_info_score as nmi,
    adjusted_rand_score as ari,
)

CLUSTERINGS_PATH = "/export/share/pascalw777dm/DISCO/clusterings/*/"

from datasets.density_datasets import Datasets as DensityDatasets
from datasets.real_world_datasets import Datasets as RealWorldDatasets
from src.utils.cluster_algorithms import CLUSTER_ALGORITHMS

Datasets = [dataset for datasets in [DensityDatasets, RealWorldDatasets] for dataset in datasets]
Clusterers = CLUSTER_ALGORITHMS.keys()

def load_clustering(dataset, path, run):
    X, l = dataset.standardized_data_cached

    np.random.seed(0)
    seeds = np.random.choice(10_000, size=run + 1, replace=False)
    np.random.seed(seeds[-1])
    shuffle_data_index = np.random.choice(len(X), size=len(X), replace=False)
    l = l[shuffle_data_index]

    df = pd.read_csv(path)
    l_ = df["value"][0]
    l_ = np.array(literal_eval(",".join(l_.split()).replace("[,", "[")))

    return l, l_


data_ari = []

for dataset in Datasets:
    print(dataset.name + ": ", end="")

    for clusterer in Clusterers:
        print(clusterer, end=", ")
        file_paths = glob.glob(CLUSTERINGS_PATH + f"{dataset.id}/{clusterer}_*.csv")

        for path in file_paths:
            run = re.search(r".*_(\d+).csv", path)
            if not run:
                continue
            run = int(run.group(1))

            l, l_ = load_clustering(dataset, path, run)

            n_clust = len(set(l_[l_ >= 0]))
            ind = np.where(l_ == -1)[0]
            for i in ind:
                l_[i] = n_clust
                n_clust += 1
            try:
                nmi_val = nmi(l, l_)
                ari_val = ari(l, l_)
            except:
                print(f"Error: {dataset.name=}, {clusterer=}, {path=}, {run=}")

            data_ari.append((dataset.id, clusterer, run, nmi_val, ari_val))
    print()

3-spiral: GroundTruth, 

DBSCAN, HDBSCAN, DPC, SpectralClustering, Agglomerative, MeanShift, KMeans, Random_k, Random_100, 
aggregation: GroundTruth, DBSCAN, HDBSCAN, DPC, SpectralClustering, Agglomerative, MeanShift, KMeans, Random_k, Random_100, 
chainlink: GroundTruth, DBSCAN, HDBSCAN, DPC, SpectralClustering, Agglomerative, MeanShift, KMeans, Random_k, Random_100, 
cluto-t4-8k: GroundTruth, DBSCAN, HDBSCAN, DPC, SpectralClustering, Agglomerative, MeanShift, KMeans, Random_k, Random_100, 
cluto-t5-8k: GroundTruth, DBSCAN, HDBSCAN, DPC, SpectralClustering, Agglomerative, MeanShift, KMeans, Random_k, Random_100, 
cluto-t7-10k: GroundTruth, DBSCAN, HDBSCAN, DPC, SpectralClustering, Agglomerative, MeanShift, KMeans, Random_k, Random_100, 
cluto-t8-8k: GroundTruth, DBSCAN, HDBSCAN, DPC, SpectralClustering, Agglomerative, MeanShift, KMeans, Random_k, Random_100, 
complex8: GroundTruth, DBSCAN, HDBSCAN, DPC, SpectralClustering, Agglomerative, MeanShift, KMeans, Random_k, Random_100, 
complex9: GroundTruth, DBSCAN,

In [11]:
# data_ari

In [149]:
from src.utils.cluster_algorithms import CLUSTER_ALGORITHMS

del CLUSTER_ALGORITHMS["DPC"]
del CLUSTER_ALGORITHMS["MeanShift"]
del CLUSTER_ALGORITHMS["SpectralClustering"]

KeyError: 'DPC'

In [150]:
clustering_algorithms = list(CLUSTER_ALGORITHMS.keys())

df_ari = pd.DataFrame(data_ari, columns=["dataset", "clusterer", "run", "nmi", "ari"])
# df.dataset = df.dataset.map(lambda x: id_to_name[x])
df_ari = pd.pivot_table(df_ari, values="ari", index=["dataset", "run"], columns=["clusterer"], dropna=False)
df_ari = df_ari.reindex(columns=df_ari.columns.reindex(clustering_algorithms)[0])
# df_ari = df_ari.reindex(index=pd.MultiIndex.from_product([df_ari.index.levels[0], df_ari.index]))
df_ari

Unnamed: 0_level_0,clusterer,GroundTruth,DBSCAN,HDBSCAN,Agglomerative,KMeans,Random_k,Random_100
dataset,run,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
COIL20,0,1.0,0.653905,0.773709,0.685521,0.551159,0.001243,0.001258
COIL20,1,,,,,0.565794,0.000170,
COIL20,2,,,,,0.578947,-0.001213,
COIL20,3,,,,,0.586838,-0.002198,
COIL20,4,,,,,0.572283,0.001710,
...,...,...,...,...,...,...,...,...
three_spiral,5,1.0,1.000000,0.888605,-0.003836,-0.005681,-0.001306,0.002578
three_spiral,6,1.0,1.000000,0.985556,-0.003836,-0.005877,-0.002031,0.000192
three_spiral,7,1.0,1.000000,0.939044,-0.003836,-0.006030,-0.002940,-0.001445
three_spiral,8,1.0,1.000000,0.985556,-0.003836,-0.005681,-0.000721,0.001145


In [None]:
# from src.utils.latex_pandas import gather_and_aggregate_data

# df_ari = gather_and_aggregate_data(["results/ari/density_standardized/"], ["value"], aggregation_funcs=["mean"])
# df_ari = df_ari.drop(["cluto-t4-8k", "cluto-t5-8k", "cluto-t7-10k", "cluto-t8-8k"])
# df_ari.columns = df_ari.columns.get_level_values(0)
# df_ari.index = df_ari.index.get_level_values(0)
# df_ari = df_ari.reindex(columns=df_ari.columns.reindex(list(CLUSTER_ALGORITHMS.keys()))[0])
# df_ari

In [151]:
import pandas as pd
import glob
import numpy as np

path = "clusterings_metrics/*/"

def extract_dataset_clusterer_cvi(file_path):
    file_path = file_path.split("/")
    dataset = file_path[-2]
    [clusterer_run, cvi] = file_path[-1].split("##")
    clusterer_run = clusterer_run.split("_")
    clusterer = "_".join(clusterer_run[:-1])
    run = clusterer_run[-1]
    cvi = cvi.split(".")[0]
    return (dataset, clusterer, cvi, int(run))

data = [
    extract_dataset_clusterer_cvi(file_path) + (float(np.loadtxt(file_path)),)
    for file_path in glob.glob(f"{path}*/*")
]

In [152]:
from datasets.density_datasets import Datasets as DensityDatasets

clustering_algorithms = list(CLUSTER_ALGORITHMS.keys())
# clustering_algorithms.remove("MeanShift")  # contains nan values

df = pd.DataFrame(data, columns=["dataset", "clusterer", "cvi", "run", "value"])
df_pivot = pd.pivot_table(df, values="value", index=["cvi", "dataset", "run"], columns=["clusterer"], dropna=False)
df_pivot = df_pivot.reindex(columns=df_pivot.columns.reindex(clustering_algorithms)[0])
# df_pivot = df_pivot.reindex(index=pd.MultiIndex.from_product([df_pivot.index.levels[0], df_ari.index]))
# df_pivot

In [153]:
df_pivot.index.get_level_values(1).unique()

Index(['COIL20', 'HAR', 'Mice', 'Optdigits', 'Pendigits', 'Synth_high',
       'Synth_low', 'USPS', 'aggregation', 'chainlink', 'cluto_t4_8k',
       'cluto_t5_8k', 'cluto_t7_10k', 'cluto_t8_8k', 'cmu_faces', 'complex8',
       'complex9', 'compound', 'dartboard1', 'diamond9', 'htru2', 'letterrec',
       'smile1', 'three_spiral'],
      dtype='object', name='dataset')

In [154]:
df_pivot.index.get_level_values(0).unique()

Index(['CDBW', 'CVDD', 'CVNN', 'DBCV', 'DCSI', 'DISCO', 'LCCV', 'SILHOUETTE',
       'S_DBW', 'VIASCKDE'],
      dtype='object', name='cvi')

In [155]:
df_pivot.loc[(df_pivot.index.get_level_values(0) == "SILHOUETTE") & (df_pivot.index.get_level_values(1) == "htru2")]

Unnamed: 0_level_0,Unnamed: 1_level_0,clusterer,GroundTruth,DBSCAN,HDBSCAN,Agglomerative,KMeans,Random_k,Random_100
cvi,dataset,run,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
SILHOUETTE,htru2,0,0.549011,0.451148,0.427159,0.579897,0.632623,-2.4e-05,-0.104848
SILHOUETTE,htru2,1,,,,,,,
SILHOUETTE,htru2,2,,,,,,,
SILHOUETTE,htru2,3,,,,,,,
SILHOUETTE,htru2,4,,,,,,,
SILHOUETTE,htru2,5,,,,,,,
SILHOUETTE,htru2,6,,,,,,,
SILHOUETTE,htru2,7,,,,,,,
SILHOUETTE,htru2,8,,,,,,,
SILHOUETTE,htru2,9,,,,,,,


In [156]:
from scipy.stats import pearsonr

def get_pearsonr_without_nans_row(ari_row, cvi_row):
    ari_nans = np.isnan(ari_row)
    ari_row = ari_row[~ari_nans]
    cvi_row = cvi_row[~ari_nans]
    n_ari = len(ari_row)
    cvi_nans = np.isnan(cvi_row)
    n_cvi = len(cvi_row[~cvi_nans])
    # print(n_ari, n_cvi, ari_row, cvi_row)

    pearson_value = np.nan
    pearson_scaled = np.nan
    if n_cvi >= 2:
        pearson_value = pearsonr(ari_row[~cvi_nans], cvi_row[~cvi_nans])[0]
        pearson_scaled = (n_cvi / n_ari) * pearson_value

    if np.isnan(pearson_value):
        return np.nan, np.nan, np.nan
    else:
        return (pearson_scaled, pearson_value, n_cvi / n_ari)


def get_pearsonr_without_nans_rowwise(ari_matrix, cvi_matrix):
    return [get_pearsonr_without_nans_row(ari_row, cvi_row) for (ari_row, cvi_row) in zip(ari_matrix, cvi_matrix)]

In [157]:
CVIs = set(df_pivot.index.get_level_values(0))
df_ari = df_ari.reindex(columns=df_ari.columns.reindex(clustering_algorithms)[0])
np_ari_matrix = df_ari.to_numpy()

pearson_values = []
for cvi in CVIs:
    df_cvi = df_pivot.xs(cvi)
    np_cvi_matrix = df_cvi.to_numpy()
    pearson_per_dataset = get_pearsonr_without_nans_rowwise(np_ari_matrix, np_cvi_matrix)
    pearson_values += [
        (cvi,) + x
        for x in map(lambda x: (x[0][0], x[0][1], x[1][0], x[1][1], x[1][2]), zip(df_cvi.index, pearson_per_dataset))
    ]

  pearson_value = pearsonr(ari_row[~cvi_nans], cvi_row[~cvi_nans])[0]


In [158]:
# df_pearson[(df_pearson.cvi == "DISCO") & (df_pearson.dataset == "compound")]

In [159]:
from src.utils.metrics import SELECTED_METRICS

dataset_names = {dataset.id: dataset.name for dataset in Datasets}
for dataset in ["cluto_t5_8k", "HAR", "letterrec", "USPS", "Mice"]:
    del dataset_names[dataset]


def reorder_rows(df_reorder, first_column, second_column, dataset_dict):
    df_reorder[second_column] = second_column
    df_reorder = df_reorder.reset_index().set_index([first_column, second_column])
    df_reorder = df_reorder.reindex(index=dataset_dict, level=0)
    df_reorder = df_reorder.rename(index=dataset_dict, level=0)
    return df_reorder


def create_pearson_pivot(pearson_values, values="pearson_true", name="pearson", second_column="mean"):
    df_pearson = pd.DataFrame(
        pearson_values, columns=["cvi", "dataset", "run", "pearson_scaled", "pearson_true", "percent_nans"]
    )
    df_pearson_pivot = pd.pivot_table(df_pearson, values=values, index=["dataset"], columns=["cvi"])
    df_pearson_pivot = df_pearson_pivot.reindex(columns=df_pearson_pivot.columns.reindex(SELECTED_METRICS)[0])
    df_pearson_pivot.columns = pd.MultiIndex.from_tuples(
        [(x, name) for x in df_pearson_pivot.columns.reindex(SELECTED_METRICS)[0]]
    )
    df_pearson_pivot = reorder_rows(df_pearson_pivot, "dataset", second_column, dataset_names)
    return df_pearson_pivot

df_pearson_pivot = create_pearson_pivot(pearson_values)
df_nans = create_pearson_pivot(pearson_values, values="percent_nans", second_column="std")

df_pearson_merged = pd.concat((df_pearson_pivot, df_nans))
df_pearson_merged = df_pearson_merged.reindex(index=dataset_names.values(), level=0)
# df_pearson_merged

In [160]:
not_one_mask = df_pearson_merged.loc[df_pearson_merged.index.get_level_values(1) == 'std'] < 1.0
not_one_mask.index = pd.MultiIndex.from_tuples(
    [(dataset, 'mean') for dataset, _std in not_one_mask.index],
    names=not_one_mask.index.names
)
df_pearson_merged[not_one_mask] = np.nan
df_pearson_merged

Unnamed: 0_level_0,Unnamed: 1_level_0,DISCO,DBCV,DCSI,LCCV,VIASCKDE,CVDD,CDBW,CVNN,SILHOUETTE,S_DBW
Unnamed: 0_level_1,Unnamed: 1_level_1,pearson,pearson,pearson,pearson,pearson,pearson,pearson,pearson,pearson,pearson
dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
3-spiral,mean,0.891599,,,0.858211,,,0.316021,,-0.024322,0.4238
3-spiral,std,1.0,0.857143,0.857143,1.0,0.857143,0.857143,1.0,0.857143,1.0,1.0
aggregation,mean,0.809514,,,0.924109,,,0.864769,,0.899516,-0.811281
aggregation,std,1.0,0.971429,0.985714,1.0,0.971429,0.971429,1.0,0.971429,1.0,1.0
chainlink,mean,0.927754,0.997091,0.722714,0.900733,0.518371,0.996722,0.769252,-0.365282,0.269905,0.265033
chainlink,std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
cluto-t4-8k,mean,0.44426,0.775642,0.938152,0.761248,0.570231,0.183738,,-0.426928,0.424031,-0.538439
cluto-t4-8k,std,1.0,1.0,1.0,1.0,1.0,1.0,0.857143,1.0,1.0,1.0
cluto-t7-10k,mean,0.486584,0.838712,0.88864,0.324223,0.503785,-0.014723,,-0.394239,0.134944,-0.531855
cluto-t7-10k,std,1.0,1.0,1.0,1.0,1.0,1.0,0.857143,1.0,1.0,1.0


In [179]:
from clustpy.utils.evaluation import evaluation_df_to_latex_table
from src.utils.latex_pandas import latex_coloring, regex_file, run_regex
from src.utils.metrics import METRIC_ABBREV_PLAIN, METRIC_ABBREV_TABLES

latex_pearson = "latex/pearson.tex"

evaluation_df_to_latex_table(
    df_pearson_merged,
    latex_pearson,
    best_in_bold=False,
    second_best_underlined=False,
    in_percent=True,
    decimal_places=2,
    use_std=False,
)

latex_coloring(latex_pearson, skiprows=6, axis=None, min_value=-100, max_value=100, lower_is_better=["CVNN", "S_DBW"], inverse_color=["CVNN", "S_DBW"])
regex_file(
    latex_pearson,
    "",
    metric_abbrev=METRIC_ABBREV_TABLES,
    # categories=[("3-spiral", 14, "Density-based Benchmark Data"), ("COIL20", 5, "Real World")],
    # categories=[("three_spiral", 13, "Density-based 2D-Data"), ("Synth_low", 7, "Tabular Data"), ("COIL20", 4, "Images")],
)
run_regex(
    [
        r"s/\$\s*nan\s*\$/-/g",
        # r"s/- /- & \$(0)\$ /g",
        # r"s/\\pm\s*(\d+).00?(\}?)\$/$2\$ & \$($1)\$/g",
        # r"s/(\\cellcolor\{.*?\})(.*?)&/$1$2& $1/g",
    ],
    latex_pearson,
)

# \\pm\s*(\d+).00?(\}?)\$   /   $2$ & $($1)$
# (\\cellcolor\{.*?\})(.*?)&   /   $1$2& $1

In [491]:
df_disco = df_pivot.xs("DISCO")
df_help = df_ari.mean(axis=1)
df_help[df_help.index.get_level_values(0)[2]]

df_disco2 = pd.DataFrame()
for i in df_help.index.get_level_values(0):
    df_disco2[i] = df_disco.loc[(i, df_help[i].idxmax())]
df_disco2 = df_disco2.T
df_disco2.columns = pd.MultiIndex.from_tuples([(x, "clusterer") for x in df_disco2.columns])
df_disco2.index.names = ["dataset"]
df_disco2 = reorder_rows(df_disco2, "dataset", "mean", dataset_names)

df_ari2 = pd.DataFrame()
for i in df_help.index.get_level_values(0):
    df_ari2[i] = df_ari.loc[(i, df_help[i].idxmax())]
df_ari2 = df_ari2.T
df_ari2.columns = pd.MultiIndex.from_tuples([(x, "clusterer") for x in df_ari2.columns])
df_ari2.index.names = ["dataset"]
df_ari2 = reorder_rows(df_ari2, "dataset", "mean", dataset_names)

In [492]:
from clustpy.utils.evaluation import evaluation_df_to_latex_table
from src.utils.latex_pandas import latex_coloring, regex_file, run_regex
from src.utils.metrics import METRIC_ABBREV_PLAIN

latex_pearson = "latex/df_disco2.tex"

evaluation_df_to_latex_table(
    df_disco2,
    latex_pearson,
    best_in_bold=False,
    second_best_underlined=False,
    in_percent=True,
    decimal_places=2,
)

latex_coloring(latex_pearson, skiprows=6, axis=None, min_value=-1, max_value=1)
regex_file(
    latex_pearson,
    "",
    metric_abbrev=METRIC_ABBREV_PLAIN,
    categories=[("3-spiral", 14, "Density-based Benchmark Data")],
    # categories=[("three_spiral", 13, "Density-based 2D-Data"), ("Synth_low", 7, "Tabular Data"), ("COIL20", 4, "Images")],
)


latex_pearson = "latex/df_ari2.tex"

evaluation_df_to_latex_table(
    df_ari2,
    latex_pearson,
    best_in_bold=False,
    second_best_underlined=False,
    in_percent=True,
    decimal_places=2,
)

latex_coloring(latex_pearson, skiprows=6, axis=None, min_value=-1, max_value=1)
regex_file(
    latex_pearson,
    "",
    metric_abbrev=METRIC_ABBREV_PLAIN,
    categories=[("3-spiral", 14, "Density-based Benchmark Data")],
    # categories=[("three_spiral", 13, "Density-based 2D-Data"), ("Synth_low", 7, "Tabular Data"), ("COIL20", 4, "Images")],
)

In [498]:
p_values = np.round(100 * np.array(list(pearsonr(df_ari2.to_numpy(), df_disco2.to_numpy(), axis=1))[0]), 2)
np.round(p_values / 100 * 65 + 5, 0).astype(int)

array([64, 55, 65, 33, 33, 49, 64, 54, 59, 62, 68, 64, 64, 43, 29, 34, 66,
       12, 56])

In [3]:
import sys
import os

import numpy as np
import pandas as pd

DISCO_ROOT_PATH = "/export/share/pascalw777dm/DISCO"
sys.path.append(DISCO_ROOT_PATH)
os.environ["TZ"] = "Europe/Vienna"

from ast import literal_eval

from src.utils.metrics import METRICS
from datasets.density_datasets import Datasets as DensityDatasets
from datasets.real_world_datasets import Datasets as RealWorldDatasets

from src.utils.cluster_algorithms import CLUSTER_ALGORITHMS


n_jobs = 50
task_timeout = 12 * 60 * 60  # 12 hours

# DATASETs = DensityDatasets
# DATASET_PATH = "density_standardized"
# RUNS = 10

DATASETs = RealWorldDatasets.get_experiments_list()
DATASET_PATH = "real_world_standardized"
RUNS = 1

for dataset in DATASETs:
    for clusterer in CLUSTER_ALGORITHMS.keys():
        for run in range(RUNS):
            for metric_name, metric_func in METRICS.items():
                metric_save_path = f"{DISCO_ROOT_PATH}/clusterings_metrics/{DATASET_PATH}/{dataset.id}/{clusterer}_{run}##{metric_name}.csv"
                if os.path.exists(metric_save_path):
                    value = np.loadtxt(metric_save_path)
                    if np.isnan(value):
                        pass
                        # print(f"{dataset.id}/{clusterer}_{run}##{metric_name}")
                        # os.remove(metric_save_path)
                elif not os.path.exists(f"{DISCO_ROOT_PATH}/clusterings/{DATASET_PATH}/{dataset.id}/{clusterer}_{run}.csv"):
                    pass
                    # print(f"Clustering not found -- {dataset.name=}, {clusterer=}, {run=}, {metric_name=}")
                else:
                    pass
                    print(f"{dataset.id}/{clusterer}_{run}##{metric_name}")

In [24]:
clustering_path = f"{DISCO_ROOT_PATH}/clusterings/density_standardized/three_spiral/MeanShift_0.csv"
os.path.exists(clustering_path)

df = pd.read_csv(clustering_path)
clustering_labels = df["value"][0]
clustering_labels = np.array(literal_eval(",".join(clustering_labels.split()).replace("[,", "[")))

In [29]:
from clustpy.utils.plots import plot_with_transformation
from sklearn.metrics.cluster import adjusted_rand_score as ari

X, l = DensityDatasets.three_spiral.standardized_data_cached

ari(l, clustering_labels)

0.0

In [2]:
sample_cvi_path = f"{DISCO_ROOT_PATH}/clusterings_metrics/{DATASET_PATH}/letterrec/MeanShift_0##SILHOUETTE.csv"
os.path.exists(sample_cvi_path)

True