In [None]:
import pandas as pd
import sys
sys.path.append("..")
from discotec import *
import uci_data
from sklearn import cluster, preprocessing
from scipy import stats
import os
from glob import glob
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
import pickle
import itertools

In [None]:
# Folders used throughout the notebook (relative to the working directory).
dataset_folder = "data"      # cached datasets: <name>_X.csv / <name>_y.csv
result_folder = "results"    # pickled cluster predictions and score tables

# Create both folders up front. The clustering step writes CSV files into
# `dataset_folder` and pickles into `result_folder`; neither write creates a
# missing parent directory. `exist_ok=True` makes the cell safe to re-run.
os.makedirs(dataset_folder, exist_ok=True)
os.makedirs(result_folder, exist_ok=True)

# Step 1 - Perform clustering

In [None]:
# Run the base clusterings for every dataset exposed by `uci_data` and cache
# the stacked label matrices (one row per clustering) as pickles.
# Existing cache files are reused as-is; delete them to force a recompute.
for dataset_loader in [fct for fct in dir(uci_data) if fct.startswith("load_")]:
    # Loaders are named `load_<dataset>`; strip the prefix to get the dataset name.
    dataset_name = dataset_loader[len("load_"):]
    print(dataset_name)

    # Features and targets are cached side by side as CSV files.
    dataset_file = os.path.join(dataset_folder, dataset_name+"_X.csv")
    target_file = os.path.join(dataset_folder, dataset_name+"_y.csv")
    if not os.path.exists(dataset_file):
        print(f"Retrieving {dataset_name}")
        X, y = getattr(uci_data, dataset_loader)()
        pd.DataFrame(X).to_csv(dataset_file, index=False)
        pd.DataFrame(y).to_csv(target_file, index=False)
    else:
        print(f"Loading {dataset_name}")
        X = pd.read_csv(dataset_file).to_numpy()
        y = pd.read_csv(target_file).to_numpy().reshape(-1)

    # The number of clusters is set to the number of distinct target values.
    K = len(jnp.unique(y))

    # --- k-means ensemble -------------------------------------------------
    filename = os.path.join(result_folder, f"{dataset_name}_kmeans.pkl")
    if not os.path.exists(filename):
        # 50 k-means runs. NOTE: no `random_state` is passed, so the results
        # depend on the global NumPy RNG state.
        predictions = []
        for _ in range(50):
            model = cluster.KMeans(n_clusters=K)
            y_pred = model.fit_predict(X).reshape((1,-1))
            predictions.append(y_pred)
        predictions = jnp.concatenate(predictions, axis=0)
        with open(filename, "wb") as file:
            pickle.dump(predictions, file)

    # --- agglomerative ensemble -------------------------------------------
    filename = os.path.join(result_folder, f"{dataset_name}_agglomerative.pkl")
    if not os.path.exists(filename):
        predictions = []
        for linkage in ["single", "complete", "ward", "average"]:
            for metric in ["euclidean", "manhattan"]:
                # Ward linkage only accepts the euclidean metric.
                if metric=="manhattan" and linkage=="ward":
                    continue
                model = cluster.AgglomerativeClustering(n_clusters=K, linkage=linkage, metric=metric)
                # Fit inside the inner loop so that every (linkage, metric)
                # pair contributes its own clustering (7 rows in total).
                y_pred = model.fit_predict(X).reshape((1,-1))
                predictions.append(y_pred)
        predictions = jnp.concatenate(predictions, axis=0)
        with open(filename, "wb") as file:
            pickle.dump(predictions, file)

In [None]:
# Consensus matrices of the k-means ensembles, one panel per dataset.
# Sorting the file list makes the panel order deterministic.
dataset_files = sorted(glob(os.path.join(dataset_folder, "*_X.csv")))

# Keep the 3x3 layout for up to 9 datasets and add rows beyond that, so the
# subplot index never exceeds the grid size.
n_cols = 3
n_rows = max(3, -(-len(dataset_files) // n_cols))   # ceil division

plt.figure(figsize=(15, 10 * n_rows / 3))
for i, dataset_file in enumerate(dataset_files):
    plt.subplot(n_rows, n_cols, i+1)
    dataset_name = os.path.basename(dataset_file)[:-len("_X.csv")]

    # Order the samples by target value so that samples sharing a target are
    # contiguous along both axes of the displayed matrix.
    targets = pd.read_csv(dataset_file.replace("_X.csv", "_y.csv")).to_numpy().reshape((-1))
    order = jnp.argsort(targets)

    # The pickle is the cache written by the clustering step of this notebook.
    with open(os.path.join(result_folder, f"{dataset_name}_kmeans.pkl"), "rb") as file:
        predictions = pickle.load(file)

    C = compute_consensus_matrix(predictions)

    plt.imshow(C[order][:,order])
    plt.title(f"{dataset_name} models")

plt.tight_layout()
plt.show()

In [None]:
# Consensus matrices of the agglomerative ensembles, one panel per dataset.
# Sorting the file list makes the panel order deterministic.
dataset_files = sorted(glob(os.path.join(dataset_folder, "*_X.csv")))

# Keep the 3x3 layout for up to 9 datasets and add rows beyond that, so the
# subplot index never exceeds the grid size.
n_cols = 3
n_rows = max(3, -(-len(dataset_files) // n_cols))   # ceil division

plt.figure(figsize=(15, 10 * n_rows / 3))
for i, dataset_file in enumerate(dataset_files):
    plt.subplot(n_rows, n_cols, i+1)
    dataset_name = os.path.basename(dataset_file)[:-len("_X.csv")]

    # Order the samples by target value so that samples sharing a target are
    # contiguous along both axes of the displayed matrix.
    targets = pd.read_csv(dataset_file.replace("_X.csv", "_y.csv")).to_numpy().reshape((-1))
    order = jnp.argsort(targets)

    # The pickle is the cache written by the clustering step of this notebook.
    with open(os.path.join(result_folder, f"{dataset_name}_agglomerative.pkl"), "rb") as file:
        predictions = pickle.load(file)

    C = compute_consensus_matrix(predictions)

    plt.imshow(C[order][:,order])
    plt.title(f"{dataset_name} models")

plt.tight_layout()
plt.show()

# Step 2 - Evaluate the scores

In [None]:
# Score every cached ensemble and write one long-format CSV per dataset
# (columns: Model, Run, Score, Dataset, Value).
# NOTE(review): `metrics`, `jnp` and the `compute_*` / `pairwise_score` helpers
# are not imported explicitly; presumably they come from `from discotec import *`.
score_labels = ["DISCO_TV", "DISCO_KL", "DISCO_H", "DISCO_Q", "AARI", "ANMI", "ARI"]

for dataset_file in glob(os.path.join(dataset_folder, "*_X.csv")):
    dataset_name = os.path.basename(dataset_file)[:-len("_X.csv")]
    result_filename = os.path.join(result_folder, f"{dataset_name}_scores.csv")
    print(dataset_name)

    # Scores already on disk: skip this dataset.
    if os.path.exists(result_filename):
        continue

    # Targets, flattened to one dimension.
    target_file = dataset_file.replace("_X.csv", "_y.csv")
    y_true = pd.read_csv(target_file).to_numpy().reshape((-1))

    all_scores = []
    for model in ["kmeans","agglomerative"]:
        print("Model", model)

        prediction_file = os.path.join(result_folder, f"{dataset_name}_{model}.pkl")
        with open(prediction_file, "rb") as file:
            predictions = pickle.load(file)

        # Supervised reference: ARI of each stored clustering against the targets.
        print("\tComputing ARI scores")
        ari_scores = [metrics.adjusted_rand_score(y_true, labels) for labels in predictions]

        # Rankings against the consensus matrix, and against a binarised
        # version of it thresholded at its mean. All are negated before storing.
        print("\tComputing discotec scores")
        consensus = compute_consensus_matrix(predictions)
        quantised_consensus = (consensus>consensus.mean()).astype(float)

        discotec_tv = -compute_tv_ranking(predictions, consensus)
        discotec_kl = -compute_kl_ranking(predictions, consensus)
        discotec_hellinger = -compute_hellinger_ranking(predictions, consensus)

        discotec_quantised = -compute_tv_ranking(predictions, quantised_consensus)

        print("\tComputing pairwise scores")
        pairwise_ari_scores = pairwise_score(predictions)
        pairwise_nmi_scores = pairwise_score(predictions, method="nmi")

        print("\tStoring results")
        score_values = [
            discotec_tv, discotec_kl, discotec_hellinger, discotec_quantised,
            pairwise_ari_scores, pairwise_nmi_scores, ari_scores,
        ]
        # One record per (score, run), in the same order as `score_labels`.
        for name, scores in zip(score_labels, score_values):
            all_scores.extend(
                {
                    "Model": model,
                    "Run": run,
                    "Score": name,
                    "Dataset": dataset_name,
                    "Value": scores[run],
                }
                for run in range(len(scores))
            )

    pd.DataFrame(all_scores).to_csv(result_filename, index=False)