In [None]:
import pandas as pd
import sys
sys.path.append("..")
from discotec import *
# NOTE: the wildcard import is deliberately kept *before* the imports below so
# that explicitly imported names always take precedence over whatever discotec
# happens to re-export.
# `jnp` and `metrics` are used throughout this notebook; import them explicitly
# instead of relying on them leaking out of `from discotec import *`.
import jax.numpy as jnp
import numpy as onp
from sklearn import cluster, preprocessing
from sklearn import metrics
from scipy import stats
import os
from glob import glob
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
import pickle
import itertools

In [None]:
# Notebook-wide configuration.
# dataset_folder: directory searched for the "<name>_X.csv" / "<name>_y.csv" files.
# result_folder:  directory where the pickled clusterings and score tables are cached.
dataset_folder = os.path.join("..", "fcps_data")
result_folder = "results"

# exist_ok=True makes the cell idempotent without a separate (racy) existence check.
os.makedirs(result_folder, exist_ok=True)

# Step 1 - Perform clustering

In [None]:
def _save_predictions(predictions, filename):
    """Stack per-model label rows into a single array and pickle it.

    Parameters
    ----------
    predictions : list
        Label rows, each reshaped to ``(1, n_samples)`` by the caller.
    filename : str
        Destination of the pickle file.

    Raises
    ------
    ValueError
        If ``predictions`` is empty (e.g. every DBSCAN run was rejected as
        degenerate). Raised *before* the file is opened so that no empty cache
        file is left behind.
    """
    if not predictions:
        raise ValueError(f"No valid clustering to store in {filename}")
    stacked = jnp.concatenate(predictions, axis=0)  # one row per fitted model
    with open(filename, "wb") as file:
        pickle.dump(stacked, file)


n_repeats = 5                 # models kept per hyper-parameter setting
cluster_counts = range(2, 21)  # candidate numbers of clusters (2..20)

# Sorted so that the processing order does not depend on the filesystem.
for dataset in sorted(glob(os.path.join(dataset_folder, "*_X.csv"))):
    # Strip the directory and the "_X.csv" suffix to get the dataset name.
    dataset_name = os.path.basename(dataset)[:-len("_X.csv")]
    print(dataset_name)

    X = pd.read_csv(dataset).to_numpy()

    # Standardise the features before clustering.
    X_scaled = preprocessing.StandardScaler().fit_transform(X)

    # --- K-means: n_repeats seeded restarts for every number of clusters ---------
    filename = os.path.join(result_folder, f"{dataset_name}_kmeans.pkl")
    if not os.path.exists(filename):
        # An explicit, distinct seed per restart keeps the ensemble diverse while
        # making the cached result reproducible.
        predictions = [
            cluster.KMeans(n_clusters=k, random_state=seed)
            .fit_predict(X_scaled)
            .reshape((1, -1))
            for k in cluster_counts
            for seed in range(n_repeats)
        ]
        _save_predictions(predictions, filename)

    # --- DBSCAN: eps swept between the 1% and 25% quantiles of pairwise distances -
    filename = os.path.join(result_folder, f"{dataset_name}_dbscan.pkl")
    if not os.path.exists(filename):
        distances = metrics.pairwise_distances(X_scaled)
        # Keep each unordered pair once (strict upper triangle).
        distances = distances[jnp.triu_indices(len(distances), k=1)]
        min_d, max_d = jnp.quantile(distances, q=0.01), jnp.quantile(distances, q=0.25)

        predictions = []
        for eps in jnp.linspace(min_d, max_d, num=20):
            # DBSCAN is deterministic for a fixed input order, so fit once and
            # repeat the row: the stored array is the same as refitting n_repeats
            # times, at a fraction of the cost.
            y_pred = cluster.DBSCAN(eps=eps.item()).fit_predict(X_scaled).reshape((1, -1))
            if len(jnp.unique(y_pred)) > 1:  # Refuse degenerate solutions
                predictions += [y_pred] * n_repeats
        _save_predictions(predictions, filename)

    # --- Agglomerative clustering: every linkage for every number of clusters ----
    filename = os.path.join(result_folder, f"{dataset_name}_agglomerative.pkl")
    if not os.path.exists(filename):
        predictions = [
            cluster.AgglomerativeClustering(n_clusters=k, linkage=linkage)
            .fit_predict(X_scaled)
            .reshape((1, -1))
            for k in cluster_counts
            for linkage in ["single", "complete", "ward", "average"]
        ]
        _save_predictions(predictions, filename)

In [None]:
# Families of clusterings cached by the clustering step, in a fixed panel order.
# Using explicit names (instead of globbing "<dataset>_*.pkl") keeps the panel
# order stable, never overruns the 1x4 grid and cannot pick up files that belong
# to another dataset sharing the same name prefix.
model_names = ["kmeans", "dbscan", "agglomerative"]

for dataset_file in sorted(glob(os.path.join(dataset_folder, "*_X.csv"))):
    dataset_name = os.path.basename(dataset_file)[:-len("_X.csv")]

    targets = pd.read_csv(dataset_file.replace("_X.csv", "_y.csv")).to_numpy().reshape((-1))
    # Sort samples by target label so that same-label samples are contiguous.
    order = jnp.argsort(targets)

    # One panel per model family plus a final panel for their mix.
    fig, axes = plt.subplots(1, len(model_names) + 1, figsize=(15, 5))

    cumulated_consensus = jnp.zeros((len(targets), len(targets)))
    total_models = 0

    for ax, model_name in zip(axes, model_names):
        filename = os.path.join(result_folder, f"{dataset_name}_{model_name}.pkl")
        if not os.path.exists(filename):
            # Keep the best-effort behaviour: a missing family yields an empty panel.
            ax.set_title(f"{model_name} (no results)")
            ax.axis("off")
            continue

        # NOTE: pickle executes arbitrary code on load; only read files written
        # by this notebook.
        with open(filename, "rb") as file:
            predictions = pickle.load(file)

        C = compute_consensus_matrix(predictions)
        # Weight by the number of models so the mix below is a per-model average.
        cumulated_consensus += C * len(predictions)
        total_models += len(predictions)

        ax.imshow(C[order][:, order])
        ax.set_title(f"{model_name} ({len(predictions)} models)")

    mix_ax = axes[-1]
    if total_models:
        mix_ax.imshow((cumulated_consensus / total_models)[order][:, order])
    else:
        mix_ax.axis("off")  # nothing to average
    mix_ax.set_title(f"Mix ({total_models} models)")

    fig.suptitle(dataset_name)
    fig.tight_layout()
    plt.show()
    plt.close(fig)  # release the figure; one is created per dataset

# Step 2 - Evaluate the scores

In [None]:
# Families of clusterings cached by the clustering step. "mix" is their
# concatenation in this exact order, so the "Run" index of a mixed model is
# reproducible (a glob over "<dataset>_*.pkl" returns files in arbitrary order
# and may also match unrelated pickles).
base_models = ["kmeans", "dbscan", "agglomerative"]

for dataset_file in sorted(glob(os.path.join(dataset_folder, "*_X.csv"))):
    dataset_name = os.path.basename(dataset_file)[:-len("_X.csv")]
    result_filename = os.path.join(result_folder, f"{dataset_name}_scores.csv")
    print(dataset_name)

    # Scores are cached per dataset: skip anything already evaluated.
    if os.path.exists(result_filename):
        continue

    y_true = pd.read_csv(dataset_file.replace("_X.csv", "_y.csv")).to_numpy().reshape((-1))    # Get the targets

    # Load every family once, then derive the mixed ensemble from them.
    # NOTE: pickle executes arbitrary code on load; only read files written by
    # this notebook.
    ensembles = {}
    for model in base_models:
        with open(os.path.join(result_folder, f"{dataset_name}_{model}.pkl"), "rb") as file:
            ensembles[model] = pickle.load(file)
    ensembles["mix"] = jnp.concatenate([ensembles[model] for model in base_models], axis=0)

    all_scores = []
    for model, predictions in ensembles.items():
        print("Model", model)

        # Single NumPy copy shared by the sklearn metric and the pairwise scores.
        predictions_np = onp.array(predictions)

        # Compute the ARI of the clusterings (external score against the targets)
        print("\tComputing ARI scores")
        ari_scores = [metrics.adjusted_rand_score(y_true, y) for y in predictions_np]

        # Compute all discotec scores
        print("\tComputing discotec scores")
        consensus = compute_consensus_matrix(predictions)
        # Binarised consensus: 1 where an entry exceeds the mean consensus, else 0.
        quantised_consensus = (consensus > consensus.mean()).astype(float)

        # The rankings are negated -- presumably so that a higher value means a
        # better clustering, like the other scores; TODO confirm against discotec.
        named_scores = {
            "DISCO_TV": -compute_tv_ranking(predictions, consensus),
            "DISCO_KL": -compute_kl_ranking(predictions, consensus),
            "DISCO_H": -compute_hellinger_ranking(predictions, consensus),
            "DISCO_Q": -compute_tv_ranking(predictions, quantised_consensus),
        }

        print("\tComputing pairwise scores")
        named_scores["AARI"] = pairwise_score(predictions_np)
        named_scores["ANMI"] = pairwise_score(predictions_np, method="nmi")
        named_scores["ARI"] = ari_scores

        print("\tStoring results")
        # Long format: one record per (model family, run, score name).
        all_scores += [
            {
                "Model": model,
                "Run": run,
                "Score": name,
                "Dataset": dataset_name,
                "Value": value,
            }
            for name, scores in named_scores.items()
            for run, value in enumerate(scores)
        ]

    pd.DataFrame(all_scores).to_csv(result_filename, index=False)