In [None]:
%load_ext autoreload
%autoreload 2

import os
import sys

%store -r SHiP_ROOT_FOLDER
if "SHiP_ROOT_FOLDER" in globals():
    os.chdir(SHiP_ROOT_FOLDER)
    sys.path.append(SHiP_ROOT_FOLDER)

In [2]:
from validation_indices import NamedIndices

def cc(l, l_):
    return NamedIndices["CC"].score(l.astype(int).tolist(), l_.astype(int).tolist())

In [3]:
from datasets.example_datasets import Datasets as ExampleDatasets
from datasets.density_datasets import Datasets as DensityDatasets
from datasets.real_world_datasets import Datasets as RealWorldDatasets

Datasets = [dataset for datasets in [ExampleDatasets, DensityDatasets, RealWorldDatasets] for dataset in datasets]

In [None]:
import numpy as np
import pandas as pd
import glob
import re

from os.path import exists
from collections import defaultdict

from sklearn.metrics import (
    normalized_mutual_info_score as nmi,
    adjusted_rand_score as ari,
    adjusted_mutual_info_score as ami,
)


use_cache = False
paths = [
    "/export/share/##42h8##/HCF/TreeUsageImpl/oldpythonsetup/labels_competitors/*/",
    "/export/share/##42h8##/HCF_HST/TreeUsageImpl/oldpythonsetup/labels/*/"
]


data_ext = {}

for dataset in Datasets:
    print(dataset.name + ": ", end="")
    X, l = dataset.data_cached
    dataset = dataset.name

    data_ext[dataset, "mean"] = {}
    data_ext[dataset, "std"] = {}

    for path in paths:
    # for path in PATHS_NEW:
        file_paths = (glob.glob(path + f"{dataset}##*.npy"))

        algorithms = map(lambda path: (re.search(r".*##(.*)##(\d+).npy", path), path), file_paths)
        algorithms = [(algorithm.group(1), path) for (algorithm, path) in algorithms if algorithm]

        alg_dict = defaultdict(list)
        for alg_name, file_path in algorithms:
            alg_dict[alg_name].append(file_path)
        alg_dict = sorted(alg_dict.items())

        for alg_name, paths_ in alg_dict:
            print(alg_name + ", ", end="")
            NMIs = []
            ARIs = []
            AMIs = []
            CCs = []
            NOISEs = []
            n_clusters = []
            RUNTIMEs = []

            for file_path in paths_:
                runtime = int(np.load(file_path[:-4] + "##time.npy"))
                RUNTIMEs.append(runtime)
                l_ = np.load(file_path)

                n_clusters.append(len(set(l_[l_ >= 0])))
                NOISEs.append(len(np.where(l_ == -1)[0]) / len(l_))
                # NMIs.append(nmi(l[l_ != -1], l_[l_ != -1]))
                # ARIs.append(ari(l[l_ != -1], l_[l_ != -1]))

                nmi_ari_path = file_path.replace("/labels/", "/nmi_ari/")
                nmi_ari_path = nmi_ari_path.replace("/labels_competitors/", "/nmi_ari_competitors/")
                os.makedirs(os.path.dirname(nmi_ari_path), exist_ok=True)
                if use_cache and exists(nmi_ari_path):
                    nmi_val, ari_val, ami_val, cc_val = np.loadtxt(nmi_ari_path)
                else:
                    # Assign each noise point to its own cluster
                    n_clust = len(set(l_[l_ >= 0]))
                    ind = np.where(l_ == -1)[0]
                    for i in ind:
                        l_[i] = n_clust
                        n_clust += 1
                    try:
                        nmi_val = nmi(l, l_)
                        ari_val = ari(l, l_)
                        ami_val = ami(l, l_)
                        cc_val = cc(l, l_)
                    except:
                        f = open("JupyterNotebooks/error.txt", "a")
                        f.write(file_path + "\n")
                        f.close()
                    np.savetxt(nmi_ari_path, (nmi_val, ari_val, ami_val, cc_val))

                NMIs.append(nmi_val)
                ARIs.append(ari_val)
                AMIs.append(ami_val)
                CCs.append(cc_val)

            # print("  ", alg_name, len(NMIs))


            if np.mean(NOISEs) > 0.99999999:
                data_ext[(dataset, "mean")][(alg_name, "NMI")] = 0.0
                data_ext[(dataset, "std")][(alg_name, "NMI")] = 0.0
                data_ext[(dataset, "mean")][(alg_name, "ARI")] = 0.0
                data_ext[(dataset, "std")][(alg_name, "ARI")] = 0.0
                data_ext[(dataset, "mean")][(alg_name, "AMI")] = 0.0
                data_ext[(dataset, "std")][(alg_name, "AMI")] = 0.0
                data_ext[(dataset, "mean")][(alg_name, "CC")] = 0.0
                data_ext[(dataset, "std")][(alg_name, "CC")] = 0.0
            else:
                data_ext[(dataset, "mean")][(alg_name, "NMI")] = np.mean(NMIs)
                data_ext[(dataset, "std")][(alg_name, "NMI")] = np.std(NMIs)
                data_ext[(dataset, "mean")][(alg_name, "ARI")] = np.mean(ARIs)
                data_ext[(dataset, "std")][(alg_name, "ARI")] = np.std(ARIs)
                data_ext[(dataset, "mean")][(alg_name, "AMI")] = np.mean(AMIs)
                data_ext[(dataset, "std")][(alg_name, "AMI")] = np.std(AMIs)
                data_ext[(dataset, "mean")][(alg_name, "CC")] = np.mean(CCs)
                data_ext[(dataset, "std")][(alg_name, "CC")] = np.std(CCs)

            data_ext[(dataset, "mean")][(alg_name, "runtime")] = np.mean(RUNTIMEs)
            data_ext[(dataset, "std")][(alg_name, "runtime")] = np.std(RUNTIMEs)

            data_ext[(dataset, "mean")][(alg_name, "noise")] = np.mean(NOISEs)
            data_ext[(dataset, "std")][(alg_name, "noise")] = np.std(NOISEs)

            data_ext[(dataset, "mean")][(alg_name, "n_clusters")] = np.mean(n_clusters)
            data_ext[(dataset, "std")][(alg_name, "n_clusters")] = np.std(n_clusters)

            data_ext[(dataset, "mean")][(alg_name, "runs")] = len(paths_)
    print()

df = pd.DataFrame.from_dict(data_ext, orient="index")

d31: AMD_DBSCAN_python, DPC_python, GaussianMixture_python, HDBSCAN_python, LDClus_python, OPTICS_python, SCAR_python, Spectacl_python, agglomerative_average, agglomerative_average_500, agglomerative_complete, agglomerative_complete_500, agglomerative_median, agglomerative_median_500, agglomerative_single, agglomerative_single_500, agglomerative_ward_python, euclidean_k_center, euclidean_k_center_500, euclidean_k_means, euclidean_k_means_500, k-means_python, k-means_python_500, CoverTree_kcenter, CoverTree_kcenter_elbow, CoverTree_kmeans, CoverTree_kmeans_elbow, CoverTree_kmeans_ensemble_mean, CoverTree_kmeans_ensemble_med, CoverTree_kmeans_norm_stability, CoverTree_kmeans_stability, CoverTree_kmedian, CoverTree_kmedian_elbow, CoverTree_kmedian_ensemble_mean, CoverTree_kmedian_ensemble_med, CoverTree_kmedian_norm_stability, CoverTree_kmedian_stability, CoverTree_normalized_stability, CoverTree_stability, DCTree_kcenter, DCTree_kcenter_elbow, DCTree_kmeans, DCTree_kmeans_elbow, DCTree_k

In [6]:
import numpy as np
import pandas as pd
import glob
import re

from os.path import exists
from collections import defaultdict


paths = [
    "/export/share/##42h8##/HCF/TreeUsageImpl/oldpythonsetup/labels_competitors/*/",
    "/export/share/##42h8##/HCF_HST/TreeUsageImpl/oldpythonsetup/labels/*/"
]


data_ext_ = {}

for dataset in Datasets:
    print(dataset.name + ": ", end="")
    dataset = dataset.name

    data_ext_[dataset, "mean"] = {}
    data_ext_[dataset, "std"] = {}

    for path in paths:
        alg_dict = defaultdict(list)
        for build_type in [
                "DCTree_build", "HST_build", "CoverTree_build", "KDTree_build",
                "DCTree_opt_build_centroid", "HST_opt_build_centroid", "CoverTree_opt_build_centroid", "KDTree_opt_build_centroid",
                "DCTree_fast_build_centroid", "HST_fast_build_centroid", "CoverTree_fast_build_centroid", "KDTree_fast_build_centroid",
            ]:
            file_paths = (glob.glob(path + f"{dataset}##{build_type}*##time.npy"))
            alg_dict[build_type] = file_paths
        alg_dict = sorted(alg_dict.items())

        for alg_name, paths_ in alg_dict:
            print(alg_name + ", ", end="")
            RUNTIMEs = []

            for file_path in paths_:
                runtime = int(np.load(file_path))
                RUNTIMEs.append(runtime)

            data_ext_[(dataset, "mean")][(alg_name, "runtime")] = np.mean(RUNTIMEs)
            data_ext_[(dataset, "std")][(alg_name, "runtime")] = np.std(RUNTIMEs)
    print()

df_build_times = pd.DataFrame.from_dict(data_ext_, orient="index")

d31: CoverTree_build, 

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


CoverTree_fast_build_centroid, CoverTree_opt_build_centroid, DCTree_build, DCTree_fast_build_centroid, DCTree_opt_build_centroid, HST_build, HST_fast_build_centroid, HST_opt_build_centroid, KDTree_build, KDTree_fast_build_centroid, KDTree_opt_build_centroid, CoverTree_build, CoverTree_fast_build_centroid, CoverTree_opt_build_centroid, DCTree_build, DCTree_fast_build_centroid, DCTree_opt_build_centroid, HST_build, HST_fast_build_centroid, HST_opt_build_centroid, KDTree_build, KDTree_fast_build_centroid, KDTree_opt_build_centroid, 
compound_left: CoverTree_build, CoverTree_fast_build_centroid, CoverTree_opt_build_centroid, DCTree_build, DCTree_fast_build_centroid, DCTree_opt_build_centroid, HST_build, HST_fast_build_centroid, HST_opt_build_centroid, KDTree_build, KDTree_fast_build_centroid, KDTree_opt_build_centroid, CoverTree_build, CoverTree_fast_build_centroid, CoverTree_opt_build_centroid, DCTree_build, DCTree_fast_build_centroid, DCTree_opt_build_centroid, HST_build, HST_fast_build_

In [None]:
import glob
import numpy as np

# for path in glob.glob("/export/share/##42h8##/HCF/TreeUsageImpl/oldpythonsetup/labels_competitors/real_world/MNIST##*##*##time.npy"):
for path in glob.glob("/export/share/##42h8##/HCF_HST/TreeUsageImpl/oldpythonsetup/labels/real_world/MNIST##DCTree_build##*##time.npy"):
    print(path, np.load(path))

In [8]:
df_merged = pd.concat((df, df_build_times), axis=1)
# df_merged[df_merged.index == ('COIL100', 'mean')][("euclidean_k_means_500", "runtime")]
# df_merged

In [9]:
df_merged.loc[:, df_merged.columns == ("DCTree_kmedian_ensemble_med", "runtime")] = df_merged.loc[:, df_merged.columns == ("DCTree_kmedian_ensemble_med", "runtime")].add(4 * df_merged.loc[:, df_merged.columns == ("DCTree_fast_build_centroid", "runtime")].rename(columns={"DCTree_fast_build_centroid": "DCTree_kmedian_ensemble_med"}))

In [10]:
from JupyterNotebooks.dataset_abbrev import DATASET_ABBREV, DATASET_selection, DATASET_selection_small
# from JupyterNotebooks.method_abbrev import METHOD_ABBREV, METHODS_competitors, METHODS_dcTree, METHODS_kdTree, METHODS_dcTree_times, METHODS_kdTree_times
from JupyterNotebooks.method_abbrev import METHOD_ABBREV, METHODS_final, METHODS_final_times, METHODS_final_small, METHODS_final_times_small

def rename_filter_df(df, row_selection=[], row_abbrev={}, column_abbrev={}, column_selection=[], metric_selection=[]):
    df2 = df
    ## Datasets
    df2 = df2.reindex(row_selection, level=0)
    df2 = df2.rename(index=row_abbrev, level=0)

    ## Methods
    df2 = df2.rename(columns=column_abbrev, level=0)
    df2 = df2.reindex(columns=column_selection, level=0)

    ## Metrics
    df2 = df2.reindex(columns=metric_selection, level=1)
    return df2

# df2 = df2.reindex(columns=["ARI", "NMI", "noise", "n_clusters", "runs"], level=1)
# df_ari_comp = rename_filter_df(df_merged, DATASET_selection, DATASET_ABBREV, METHOD_ABBREV, METHODS_competitors, ["ARI"])
# df_ari_dc = rename_filter_df(df_merged, DATASET_selection, DATASET_ABBREV, METHOD_ABBREV, METHODS_dcTree, ["ARI"])
# df_ari_kd = rename_filter_df(df_merged, DATASET_selection, DATASET_ABBREV, METHOD_ABBREV, METHODS_kdTree, ["ARI"])
df_ari_small = rename_filter_df(df_merged, DATASET_selection_small, DATASET_ABBREV, METHOD_ABBREV, METHODS_final_small, ["ARI"])

df_ari = rename_filter_df(df_merged, DATASET_selection, DATASET_ABBREV, METHOD_ABBREV, METHODS_final, ["ARI"])
df_nmi = rename_filter_df(df_merged, DATASET_selection, DATASET_ABBREV, METHOD_ABBREV, METHODS_final, ["NMI"])
df_ami = rename_filter_df(df_merged, DATASET_selection, DATASET_ABBREV, METHOD_ABBREV, METHODS_final, ["AMI"])
df_cc = rename_filter_df(df_merged, DATASET_selection, DATASET_ABBREV, METHOD_ABBREV, METHODS_final, ["CC"])


# df_runtime_dc = rename_filter_df(df_merged, DATASET_selection, DATASET_ABBREV, METHOD_ABBREV, METHODS_dcTree_times, ["runtime"])
# df_runtime_kd = rename_filter_df(df_merged, DATASET_selection, DATASET_ABBREV, METHOD_ABBREV, METHODS_kdTree_times, ["runtime"])
df_runtime = rename_filter_df(df_merged, DATASET_selection, DATASET_ABBREV, METHOD_ABBREV, METHODS_final_times, ["runtime"])
df_runtime_small = rename_filter_df(df_merged, DATASET_selection_small, DATASET_ABBREV, METHOD_ABBREV, METHODS_final_times_small, ["runtime"])
# df2 = df2.reindex(columns=["runtime"], level=1)


In [11]:
def convert_to_time_format(value):
    if np.isnan(value):
        return "-"
    value = round(value)
    milliseconds = value % 1000
    value //= 1000
    seconds = value % 60
    value //= 60
    minutes = value
    return f"{minutes:02}:{seconds:02}.{milliseconds:03}"

df_runtime = df_runtime.map(convert_to_time_format)
df_runtime_small = df_runtime_small.map(convert_to_time_format)

In [29]:
# df_runtime[df_runtime.index.get_level_values(0) == "COIL100"]
# df_runtime
# df_merged.loc[df_merged.index.get_level_values(1) == "mean", df_merged.columns.get_level_values(1) == "ARI"].columns.get_level_values(0)
# df_merged.loc[df_merged.index.get_level_values(1) == "mean",(df_merged.columns == ("OPTICS_python", "runtime")) | (df_merged.columns == ("AMD_DBSCAN_python", "runtime"))]

In [30]:
# df_to_latex(df_ari_comp, f"{path}/ari_comp.tex")
# df_to_latex(df_ari_dc, f"{path}/ari_dc.tex")
# df_to_latex(df_ari_kd, f"{path}/ari_kd.tex")
# !cat <(echo '\newpage') "{path}/ari_comp.tex" <(echo '\newpage') "{path}/ari_dc.tex" <(echo '\newpage') "{path}/ari_kd.tex" > "{path}/all_ari_values.tex"

# df_to_latex(df_runtime_dc, f"{path}/runtime_dc.tex")
# df_to_latex(df_runtime_kd, f"{path}/runtime_kd.tex")  
# !cat <(echo '\newpage') "{path}/runtime_dc.tex" <(echo '\newpage') "{path}/runtime_kd.tex" > "{path}/all_times.tex"

In [None]:
# df_ari_small.loc[df_ari_small.index.get_level_values(1) == "std", (df_ari_small.columns.get_level_values(0) == "DC/$k$-center/Stability") | (df_ari_small.columns.get_level_values(0) == "DC/$k$-median/MoE") | (df_ari_small.columns.get_level_values(0) == "DC/$k$-means/Elbow")] = 0
# df_ari.loc[df_ari.index.get_level_values(1) == "std", (df_ari.columns.get_level_values(0) == "DC/$k$-center/Stability") | (df_ari.columns.get_level_values(0) == "DC/$k$-median/MoE") | (df_ari.columns.get_level_values(0) == "DC/$k$-means/Elbow")] = 0
# df_nmi.loc[df_nmi.index.get_level_values(1) == "std", (df_nmi.columns.get_level_values(0) == "DC/$k$-center/Stability") | (df_nmi.columns.get_level_values(0) == "DC/$k$-median/MoE") | (df_nmi.columns.get_level_values(0) == "DC/$k$-means/Elbow")] = 0
# df_ami.loc[df_ami.index.get_level_values(1) == "std", (df_ami.columns.get_level_values(0) == "DC/$k$-center/Stability") | (df_ami.columns.get_level_values(0) == "DC/$k$-median/MoE") | (df_ami.columns.get_level_values(0) == "DC/$k$-means/Elbow")] = 0
# df_cc.loc[df_cc.index.get_level_values(1) == "std", (df_cc.columns.get_level_values(0) == "DC/$k$-center/Stability") | (df_cc.columns.get_level_values(0) == "DC/$k$-median/MoE") | (df_cc.columns.get_level_values(0) == "DC/$k$-means/Elbow")] = 0


# df_ari_small.loc[df_ari_small.index.get_level_values(1) == "std", (df_ari_small.columns.get_level_values(0) == "DC/$k$-center/GT") | (df_ari_small.columns.get_level_values(0) == "DC/$k$-median/GT") | (df_ari_small.columns.get_level_values(0) == "DC/$k$-means/GT")] = 0
# df_ari.loc[df_ari.index.get_level_values(1) == "std", (df_ari.columns.get_level_values(0) == "DC/$k$-center/GT") | (df_ari.columns.get_level_values(0) == "DC/$k$-median/GT") | (df_ari.columns.get_level_values(0) == "DC/$k$-means/GT")] = 0
# df_nmi.loc[df_nmi.index.get_level_values(1) == "std", (df_nmi.columns.get_level_values(0) == "DC/$k$-center/GT") | (df_nmi.columns.get_level_values(0) == "DC/$k$-median/GT") | (df_nmi.columns.get_level_values(0) == "DC/$k$-means/GT")] = 0
# df_ami.loc[df_ami.index.get_level_values(1) == "std", (df_ami.columns.get_level_values(0) == "DC/$k$-center/GT") | (df_ami.columns.get_level_values(0) == "DC/$k$-median/GT") | (df_ami.columns.get_level_values(0) == "DC/$k$-means/GT")] = 0
# df_cc.loc[df_cc.index.get_level_values(1) == "std", (df_cc.columns.get_level_values(0) == "DC/$k$-center/GT") | (df_cc.columns.get_level_values(0) == "DC/$k$-median/GT") | (df_cc.columns.get_level_values(0) == "DC/$k$-means/GT")] = 0

# df_ari_small.loc[df_ari_small.index.get_level_values(1) == "std", (df_ari_small.columns.get_level_values(0) == "KD/$k$-center/Stability") | (df_ari_small.columns.get_level_values(0) == "KD/$k$-median/MoE") | (df_ari_small.columns.get_level_values(0) == "KD/$k$-means/Elbow")] = 0
# df_ari.loc[df_ari.index.get_level_values(1) == "std", (df_ari.columns.get_level_values(0) == "KD/$k$-center/Stability") | (df_ari.columns.get_level_values(0) == "KD/$k$-median/MoE") | (df_ari.columns.get_level_values(0) == "KD/$k$-means/Elbow")] = 0
# df_nmi.loc[df_nmi.index.get_level_values(1) == "std", (df_nmi.columns.get_level_values(0) == "KD/$k$-center/Stability") | (df_nmi.columns.get_level_values(0) == "KD/$k$-median/MoE") | (df_nmi.columns.get_level_values(0) == "KD/$k$-means/Elbow")] = 0
# df_ami.loc[df_ami.index.get_level_values(1) == "std", (df_ami.columns.get_level_values(0) == "KD/$k$-center/Stability") | (df_ami.columns.get_level_values(0) == "KD/$k$-median/MoE") | (df_ami.columns.get_level_values(0) == "KD/$k$-means/Elbow")] = 0
# df_cc.loc[df_cc.index.get_level_values(1) == "std", (df_cc.columns.get_level_values(0) == "KD/$k$-center/Stability") | (df_cc.columns.get_level_values(0) == "KD/$k$-median/MoE") | (df_cc.columns.get_level_values(0) == "KD/$k$-means/Elbow")] = 0


In [13]:
from clustpy.utils import evaluation_df_to_latex_table
from utils.latex_pandas import regex_file, run_regex, latex_coloring, latex_bold_underline


def df_to_latex_ari(df, latex_path, kwargs={}, categories=[]):
    evaluation_df_to_latex_table(df, latex_path, best_in_bold=False, second_best_underlined=False, in_percent=True, decimal_places=1)
    # latex_bold_underline(latex_path, skiprows=6)
    latex_coloring(latex_path, skiprows=6, axis=None)
    regex_file(latex_path, "", remove_second_column=False, categories=categories)

path = "/export/share/##42h8##/HCF_HST/experiments/JupyterNotebooks/latex_tables"

df_to_latex_ari(df_nmi, f"{path}/nmi.tex", categories=[("Boxes", 17, "Density-based 2D-Data"), ("Synth_low", 8, "Tabular Data"), ("COIL20", 6, "Image Data")])
df_to_latex_ari(df_ari, f"{path}/ari.tex", categories=[("Boxes", 17, "Density-based 2D-Data"), ("Synth_low", 8, "Tabular Data"), ("COIL20", 6, "Image Data")])
df_to_latex_ari(df_ami, f"{path}/ami.tex", categories=[("Boxes", 17, "Density-based 2D-Data"), ("Synth_low", 8, "Tabular Data"), ("COIL20", 6, "Image Data")])
df_to_latex_ari(df_cc, f"{path}/cc.tex", categories=[("Boxes", 17, "Density-based 2D-Data"), ("Synth_low", 8, "Tabular Data"), ("COIL20", 6, "Image Data")])
df_to_latex_ari(df_ari_small, f"{path}/ari_small.tex", categories=[("Boxes", 7, "Tabular Data"), ("COIL20", 6, "Image Data")])

In [14]:
from clustpy.utils import evaluation_df_to_latex_table
from utils.latex_pandas import regex_file, run_regex, latex_coloring


def df_to_latex_runtime(df, latex_path, kwargs={}, categories=[]):
    evaluation_df_to_latex_table(df, latex_path, use_std=False, best_in_bold=False, second_best_underlined=False, in_percent=False, no_numbers=True, **kwargs)
    regex_file(latex_path, "", remove_second_column=True, categories=categories)
    run_regex([
    #     r'if ($. == 10) { s/(?<=& )(.*?) (&|\\\\)/\\rotatebox[origin=c]\{70\}\{$1\} $2/g }',
    #     r'if ($. % 2 == 1) { s/((?<=& )|(?<=^))(.*?) (&|\\\\)/\\cellcolor\{Gray!20\}\{$2\} $3/g }',
        r's/\$//g',
    ], latex_path)

path = "/export/share/##42h8##/HCF_HST/experiments/JupyterNotebooks/latex_tables"

df_to_latex_runtime(df_runtime, f"{path}/runtime.tex") #, categories=[("d31", 7, "Tabular Data"), ("COIL20", 6, "Image Data")])
df_to_latex_runtime(df_runtime_small, f"{path}/runtime_small.tex") #, categories=[("Boxes", 7, "Tabular Data"), ("COIL20", 6, "Image Data")])

In [69]:
selected_datasets = ["Boxes", "D31", "COIL100", "MNIST"]
selected_methods = [
    "DC/$k$-center/GT",
    "DC/$k$-median/GT",
    "DC/$k$-means/GT",
    "DC/$k$-center/Stability",
    "DC/$k$-median/MoE",
    "DC/$k$-means/Elbow",
    "CT/$k$-center/Stability",
    "CT/$k$-median/MoE",
    "CT/$k$-means/Elbow",
    "Eucl. $k$-means",
    "SCAR",
    "Aggl. (ward)",
    "AMD-DBSCAN",
    "DPC",
]

In [70]:
df_ami_mean[selected_methods]


Unnamed: 0,DC/$k$-center/GT,DC/$k$-median/GT,DC/$k$-means/GT,DC/$k$-center/Stability,DC/$k$-median/MoE,DC/$k$-means/Elbow,CT/$k$-center/Stability,CT/$k$-median/MoE,CT/$k$-means/Elbow,Eucl. $k$-means,SCAR,Aggl. (ward),AMD-DBSCAN,DPC
Boxes,0.86,0.99,0.99,0.95,0.99,0.98,0.44,0.75,0.69,0.95,0.09,0.97,0.57,0.36
D31,0.79,0.96,0.96,0.85,0.81,0.94,0.72,0.84,0.85,0.95,0.81,0.95,0.89,0.34
COIL100,0.71,0.88,0.88,0.87,0.87,0.88,0.64,0.74,0.75,0.8,0.68,0.84,0.49,0.01
MNIST,0.0,0.62,0.63,0.1,0.58,0.63,0.36,0.41,0.41,0.49,0.19,0.68,0.0,


In [None]:
df_ami_mean = df_ami.loc[df_ami.index.get_level_values(1) == "mean"]
df_ami_mean = df_ami_mean.loc[
    np.array([(df_ami_mean.index.get_level_values(0) == dataset) for dataset in selected_datasets]).any(axis=0), :
]
df_ami_mean = df_ami_mean[selected_methods]
df_ami_mean.columns = df_ami_mean.columns.get_level_values(0)
df_ami_mean.index = df_ami_mean.index.get_level_values(0)
df_ami_mean = df_ami_mean.round(2)
re.compile(r"-+").sub("-", df_ami_mean.to_markdown().replace(" ", ""))

'||DC/$k$-center/GT|DC/$k$-median/GT|DC/$k$-means/GT|DC/$k$-center/Stability|DC/$k$-median/MoE|DC/$k$-means/Elbow|CT/$k$-center/Stability|CT/$k$-median/MoE|CT/$k$-means/Elbow|Eucl.$k$-means|SCAR|Aggl.(ward)|AMD-DBSCAN|DPC|\n|:-|-:|-:|-:|-:|-:|-:|-:|-:|-:|-:|-:|-:|-:|-:|\n|Boxes|0.86|0.99|0.99|0.95|0.99|0.98|0.44|0.75|0.69|0.95|0.09|0.97|0.57|0.36|\n|D31|0.79|0.96|0.96|0.85|0.81|0.94|0.72|0.84|0.85|0.95|0.81|0.95|0.89|0.34|\n|COIL100|0.71|0.88|0.88|0.87|0.87|0.88|0.64|0.74|0.75|0.8|0.68|0.84|0.49|0.01|\n|MNIST|0|0.62|0.63|0.1|0.58|0.63|0.36|0.41|0.41|0.49|0.19|0.68|0|nan|'

In [73]:
df_cc_mean = df_cc.loc[df_cc.index.get_level_values(1) == "mean"]
df_cc_mean = df_cc_mean.loc[
    np.array([(df_cc_mean.index.get_level_values(0) == dataset) for dataset in selected_datasets]).any(axis=0), :
]
df_cc_mean = df_cc_mean[selected_methods]
df_cc_mean.columns = df_cc_mean.columns.get_level_values(0)
df_cc_mean.index = df_cc_mean.index.get_level_values(0)
df_cc_mean = df_cc_mean.round(2)
re.compile(r"-+").sub("-", df_cc_mean.to_markdown().replace(" ", ""))


'||DC/$k$-center/GT|DC/$k$-median/GT|DC/$k$-means/GT|DC/$k$-center/Stability|DC/$k$-median/MoE|DC/$k$-means/Elbow|CT/$k$-center/Stability|CT/$k$-median/MoE|CT/$k$-means/Elbow|Eucl.$k$-means|SCAR|Aggl.(ward)|AMD-DBSCAN|DPC|\n|:-|-:|-:|-:|-:|-:|-:|-:|-:|-:|-:|-:|-:|-:|-:|\n|Boxes|0.7|0.99|0.99|0.9|0.99|0.98|0.11|0.5|0.36|0.93|0.01|0.96|0.69|0.28|\n|D31|0.66|0.94|0.94|0.8|0.52|0.84|0.53|0.63|0.68|0.92|0.46|0.92|0.87|0.19|\n|COIL100|0.28|0.71|0.7|0.81|0.68|0.71|0.45|0.48|0.51|0.56|0.22|0.62|0.2|0.03|\n|MNIST|0.01|0.51|0.52|0.33|0.45|0.5|0.15|0.13|0.13|0.37|0.03|0.53|0|nan|'

In [295]:
import pandas as pd
import numpy as np
from clustpy.utils import evaluation_df_to_latex_table

latex_path = "latex/kd-tree.tex"
metric_path = "../TreeUsageImpl/oldpythonsetup/metric_save/metrics_20:46:47.csv"  # kd-tree
# metric_path = "../TreeUsageImpl/oldpythonsetup/metric_save/metrics_11:47:52.csv"  # dc-tree

df = pd.read_csv(metric_path, delimiter=';', index_col=0, skiprows=[1], header=[0,1])
df = df.rename(columns = dict(zip([column[0] for column in df.columns[1::2]], [column[0] for column in df.columns[::2]])))
df.drop(df.tail(1).index, inplace=True) # drop last n rows

df.drop(["d31", "compound_left", "aggregate"], inplace=True) # drop last n rows

df

Unnamed: 0_level_0,kd-Tree_kcenter,kd-Tree_kcenter,kd-Tree_kmeans,kd-Tree_kmeans,kd-Tree_kmeans_elbow,kd-Tree_kmeans_elbow,kd-Tree_euclidean_k_center,kd-Tree_euclidean_k_center,kd-Tree_kmeans_norm_stability,kd-Tree_kmeans_norm_stability,kmeans_euc_gt,kmeans_euc_gt
Unnamed: 0_level_1,ari,nmi,ari,nmi,ari,nmi,ari,nmi,ari,nmi,ari,nmi
airway,0.499,0.65,0.371,0.597,0.087,0.473,0.488,0.633,0.816,0.755,0.395,0.634
optdigits,0.111,0.22,0.132,0.244,0.02,0.421,0.281,0.446,0.121,0.333,0.671,0.757
htru2,-0.001,0.0,0.013,0.0,-0.0,0.001,-0.001,0.0,-0.0,0.001,-0.078,0.027
pendigits,0.095,0.243,0.161,0.31,0.044,0.5,0.337,0.501,0.165,0.47,0.595,0.691
mice,0.001,0.068,0.002,0.087,0.009,0.116,0.002,0.062,-0.0,0.045,0.118,0.22
letterrecognition,0.057,0.204,0.061,0.208,0.043,0.437,0.064,0.26,0.046,0.174,0.133,0.352
har,0.095,0.257,0.271,0.405,0.015,0.293,0.327,0.533,0.201,0.326,0.461,0.589
coil20,0.22,0.435,0.213,0.437,0.169,0.566,0.285,0.585,0.227,0.513,0.497,0.737
mnist,0.0,0.007,0.095,0.186,0.002,0.266,0.024,0.178,0.028,0.234,0.354,0.484


In [296]:
from collections import defaultdict

data_dict = defaultdict(dict)
for (clusterer, eval_method), subdict in df.to_dict().items():
    for dataset, value in subdict.items():
        data_dict[(clusterer, dataset)][(eval_method, "mean")] = value

data_dict

df_ = pd.DataFrame(data_dict)
df_.drop(("nmi", "mean"), inplace=True)
df_

Unnamed: 0_level_0,Unnamed: 1_level_0,kd-Tree_kcenter,kd-Tree_kcenter,kd-Tree_kcenter,kd-Tree_kcenter,kd-Tree_kcenter,kd-Tree_kcenter,kd-Tree_kcenter,kd-Tree_kcenter,kd-Tree_kcenter,kd-Tree_kmeans,...,kd-Tree_kmeans_norm_stability,kmeans_euc_gt,kmeans_euc_gt,kmeans_euc_gt,kmeans_euc_gt,kmeans_euc_gt,kmeans_euc_gt,kmeans_euc_gt,kmeans_euc_gt,kmeans_euc_gt
Unnamed: 0_level_1,Unnamed: 1_level_1,airway,optdigits,htru2,pendigits,mice,letterrecognition,har,coil20,mnist,airway,...,mnist,airway,optdigits,htru2,pendigits,mice,letterrecognition,har,coil20,mnist
ari,mean,0.499,0.111,-0.001,0.095,0.001,0.057,0.095,0.22,0.0,0.371,...,0.028,0.395,0.671,-0.078,0.595,0.118,0.133,0.461,0.497,0.354


In [297]:
from clustpy.utils import evaluation_df_to_latex_table
from utils.latex_pandas import regex_file, run_regex

evaluation_df_to_latex_table(
    df_,
    latex_path,
    color_by_value="Green",
    best_in_bold=False,
    second_best_underlined=False,
    in_percent=False,
    decimal_places=2,
)

regex_file(latex_path, "")
run_regex([
    r's/^(ari )?& //g',
    r's/-0\.00/0.00/g',
], latex_path)

In [282]:
import pandas as pd

latex_path = "latex/times.tex"
metric_path_kdtree = "../TreeUsageImpl/oldpythonsetup/metric_save/times_20:46:48.csv"  # kd-tree
metric_path_dctree = "../TreeUsageImpl/oldpythonsetup/metric_save/times_11:47:53.csv"  # dctree
metric_path_dbscan = "../TreeUsageImpl/oldpythonsetup/metric_save/times_11:48:37.csv"  # dbscan
metric_path_aggl = "../TreeUsageImpl/oldpythonsetup/metric_save/times_11:48:53.csv"  # aggl
metric_path_dpc = "../TreeUsageImpl/oldpythonsetup/metric_save/times_12:20:36.csv"  # dpc

df_kdtree = pd.read_csv(metric_path_kdtree, delimiter=';', index_col=0)
# df_kdtree.columns = "kd-tree " + df_kdtree.columns
df_dctree = pd.read_csv(metric_path_dctree, delimiter=';', index_col=0)
# df_dctree.columns = "dc-tree " + df_dctree.columns
df_dbscan = pd.read_csv(metric_path_dbscan, delimiter=';', index_col=0, usecols=["dbscan"])
df_dbscan["dataset"] = df_dctree.index
df_dbscan = df_dbscan.reset_index()
df_dbscan = df_dbscan.set_index("dataset")
df_aggl = pd.read_csv(metric_path_aggl, delimiter=';', index_col=0, usecols=["agglomerative"])
df_aggl["dataset"] = df_dctree.index
df_aggl = df_aggl.reset_index()
df_aggl = df_aggl.set_index("dataset")
df_dpc = pd.read_csv(metric_path_dpc, delimiter=';', index_col=0, usecols=["dpc"])
df_dpc["dataset"] = df_dctree.index
df_dpc = df_dpc.reset_index()
df_dpc = df_dpc.set_index("dataset")

# df = pd.concat([df_kdtree, df_dctree, df_dbscan, df_aggl, df_dpc], axis=1)
df = pd.concat([df_kdtree], axis=1)
# df = pd.concat([df_dctree, df_dbscan, df_aggl, df_dpc], axis=1)

In [283]:
df

Unnamed: 0,kd-tree_build,kmeans-tree_build,kcenter,kmeans,kmeans_elbow,euclidean_k_center,kmeans_norm_stability,kmeans_euc_gt
airway,23.02,8.134,2.779,3.467,3.399,5.253,1.132,143.234
optdigits,32.257,2.547,1.148,1.3,1.233,8.592,0.512,111.372
htru2,24.728,8.999,3.272,3.893,3.777,1.254,1.354,110.129
pendigits,20.165,4.793,2.216,2.679,2.623,5.68,0.997,114.465
mice,6.974,0.329,0.179,0.167,0.145,1.35,0.058,103.666
letterrecognition,27.926,8.947,3.018,3.841,3.656,18.835,1.487,144.929
har,449.386,3.979,1.901,2.25,2.191,83.854,1.762,234.413
d31,2.402,0.858,0.46,0.49,0.393,2.198,0.171,9.195
compound_left,0.211,0.062,0.036,0.032,0.023,0.032,0.01,2.205
aggregate,0.574,0.187,0.1,0.09,0.076,0.136,0.024,2.617


In [284]:
from collections import defaultdict

data_dict = defaultdict(dict)
for clusterer, subdict in df.to_dict().items():
    for dataset, value in subdict.items():
        data_dict[(clusterer, dataset)][("time", "mean")] = value

data_dict

df_ = pd.DataFrame(data_dict)
df_ = df_.replace({np.nan: -1})
df_

Unnamed: 0_level_0,Unnamed: 1_level_0,kd-tree_build,kd-tree_build,kd-tree_build,kd-tree_build,kd-tree_build,kd-tree_build,kd-tree_build,kd-tree_build,kd-tree_build,kd-tree_build,...,kmeans_euc_gt,kmeans_euc_gt,kmeans_euc_gt,kmeans_euc_gt,kmeans_euc_gt,kmeans_euc_gt,kmeans_euc_gt,kmeans_euc_gt,kmeans_euc_gt,kmeans_euc_gt
Unnamed: 0_level_1,Unnamed: 1_level_1,airway,optdigits,htru2,pendigits,mice,letterrecognition,har,d31,compound_left,aggregate,...,htru2,pendigits,mice,letterrecognition,har,d31,compound_left,aggregate,coil20,mnist
time,mean,23.02,32.257,24.728,20.165,6.974,27.926,449.386,2.402,0.211,0.574,...,110.129,114.465,103.666,144.929,234.413,9.195,2.205,2.617,568.614,1820.442


In [285]:
from clustpy.utils import evaluation_df_to_latex_table
from utils.latex_pandas import regex_file, run_regex

evaluation_df_to_latex_table(
    df_,
    latex_path,
    color_by_value="Green",
    best_in_bold=False,
    second_best_underlined=False,
    in_percent=False,
    decimal_places=2,
)

regex_file(latex_path, "")
run_regex([
    r's/^(time )?& //g',
    r's/-1\.00/-/g',
], latex_path)

In [None]:
runtypes_dctree_and_kdtree = [
    "stability", 
    "normalized_stability", 
    "kmedian", 
    "kmeans", 
    "kmedian_elbow", 
    "kmeans_elbow", 
    "kcenter", 
    "kcenter_elbow", 
    "kcenter_q_coverage", 
    "kmedian_q_coverage", 
    "kmeans_q_coverage", 
    "kmedian_lca_noise", 
    "kmeans_lca_noise", 
    "kmeans_lca_noise_real", 
    "KFive", 
    "KFive_elbow", 
    "kcenter_q_stem", 
    "kmeans_q_stem", 
    "kmedian_q_stem", 
    "kmeans_norm_stability", 
    "kmedian_norm_stability", 
    "kmeans_stability",
    "kmedian_stability",
]


In [None]:
runtypes_dctree_and_kdtree = [
    "kcenter", 
    "kmedian", 
    "kmeans", 
    "kcenter_elbow", 
    "kmedian_elbow", 
    "kmeans_elbow", 
    "kcenter_q_coverage", 
    "kmedian_q_coverage", 
    "kmeans_q_coverage", 
    "kcenter_q_stem", 
    "kmedian_q_stem", 
    "kmeans_q_stem", 
    "stability",
    "kmedian_stability",
    "kmeans_stability",
    "normalized_stability", 
    "kmedian_norm_stability", 
    "kmeans_norm_stability", 
    "kmedian_lca_noise", 
    "kmeans_lca_noise", 
    "kmeans_lca_noise_real", 
    "KFive", 
    "KFive_elbow", 
]


In [251]:
## RENAME ## 
import os
for path in glob.iglob(rf"./../TreeUsageImpl/oldpythonsetup/labels/examples/*aggregation*.npy"):
    new_path = re.sub(r'aggregation', "aggregation_2", path)
    print(path, new_path)
    # os.rename(path, new_path)

./../TreeUsageImpl/oldpythonsetup/labels/examples/aggregation_2##kdTree_kmeans##5.npy ./../TreeUsageImpl/oldpythonsetup/labels/examples/aggregation_2_2##kdTree_kmeans##5.npy
./../TreeUsageImpl/oldpythonsetup/labels/examples/aggregation_2##dcTree_KFive##4##time.npy ./../TreeUsageImpl/oldpythonsetup/labels/examples/aggregation_2_2##dcTree_KFive##4##time.npy
./../TreeUsageImpl/oldpythonsetup/labels/examples/aggregation_2##GaussianMixture_python##6##time.npy ./../TreeUsageImpl/oldpythonsetup/labels/examples/aggregation_2_2##GaussianMixture_python##6##time.npy
./../TreeUsageImpl/oldpythonsetup/labels/examples/aggregation_2##dcTree_prune_tree##3##time.npy ./../TreeUsageImpl/oldpythonsetup/labels/examples/aggregation_2_2##dcTree_prune_tree##3##time.npy
./../TreeUsageImpl/oldpythonsetup/labels/examples/aggregation_2##kdTree_KFive##9##time.npy ./../TreeUsageImpl/oldpythonsetup/labels/examples/aggregation_2_2##kdTree_KFive##9##time.npy
./../TreeUsageImpl/oldpythonsetup/labels/examples/aggregatio