In [None]:
import pandas as pd
import sys
sys.path.append("..")
import jax.numpy as jnp
import jax
from sklearn import cluster, preprocessing
from scipy import stats
import os
from glob import glob
import warnings
warnings.filterwarnings("ignore")
import pickle
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
import scienceplots

# Apply the "science" matplotlib style to every figure produced below
# (presumably registered by the scienceplots import above - TODO confirm)
plt.style.use('science')

In [None]:
# Folder layout shared by every cell below
main_folder = "UCI datasets"
# Ground-truth label files ("*_y.csv") are read from here
dataset_folder = os.path.join(main_folder, "data")
# Clustering results ("*.pkl") and score tables ("*.csv") are read from here
result_folder = os.path.join(main_folder, "results")
# Per-dataset regularisation tables are written here by Step 1
constraint_folder = os.path.join(main_folder, "constraints")

# Create the folders up front. exist_ok=True makes the call a no-op when the
# folder is already there and avoids the check-then-create race; it also fails
# immediately if the path exists but is not a directory.
os.makedirs(result_folder, exist_ok=True)
os.makedirs(constraint_folder, exist_ok=True)

# Step 1 - Retrieve the clusterings and sample constraints

In [None]:
# Sampling grid: constraints are drawn among 5, 10, ..., 50 observations, and
# each size is redrawn 50 times.
max_constrained = 50
constrained_sizes = range(5, max_constrained + 1, 5)
n_constraint_draws = 50

# glob returns files in arbitrary order; sorting keeps the processing order
# (and therefore the log output) reproducible.
for dataset_file in sorted(glob(os.path.join(dataset_folder, "*_y.csv"))):
    # Strip the trailing "_y.csv" from the file name
    dataset_name = os.path.basename(dataset_file)[:-len("_y.csv")]

    print(dataset_name)

    # Cache: skip datasets whose regularisations were already written
    regularisation_file = os.path.join(constraint_folder, f"{dataset_name}_regularisations.csv")
    if os.path.exists(regularisation_file):
        continue

    # Load targets
    targets = pd.read_csv(dataset_file).to_numpy().reshape(-1)

    # Skip datasets for which the largest constrained set would cover at least
    # half of the observations
    if len(targets) / 2 <= max_constrained:
        continue

    # The key is re-created for every dataset, so the draws of one dataset do
    # not depend on which other datasets were processed or skipped.
    master_key = jax.random.key(0)

    all_regularisations = []

    # For each clustering, sample random constraints and count violations.
    # Files are sorted so that the sequence of random draws and the numbering
    # of the "mix" runs are the same on every machine.
    # NOTE(review): "mix" runs are numbered by stacking models in alphabetical
    # file order - confirm this matches the numbering used in the score tables.
    mix_run_offset = 0
    for clustering_file in sorted(glob(os.path.join(result_folder, f"{dataset_name}_*.pkl"))):
        # NOTE(review): pickle.load can execute arbitrary code; only load
        # result files produced by trusted runs.
        with open(clustering_file, "rb") as file:
            # presumably an array of shape (n_runs, n_observations) - TODO confirm
            predictions = pickle.load(file)
        # The model name is whatever follows the last "_" in the file name
        model = os.path.basename(clustering_file)[:-len(".pkl")].rsplit("_", 1)[-1]
        print("\tModel", model, end="\n\t")

        for n_constraint in constrained_sizes:
            for i in range(n_constraint_draws):
                master_key, random_key = jax.random.split(master_key)
                selected_nodes = jax.random.choice(random_key, a=len(targets), replace=False, shape=(n_constraint,))

                # All ordered pairs among the selected observations
                x_grid, y_grid = jnp.meshgrid(selected_nodes, selected_nodes)
                x_grid, y_grid = x_grid.reshape(-1), y_grid.reshape(-1)

                # We must remove constraints of type (i,i)
                different_indices = x_grid != y_grid
                x_grid = x_grid[different_indices]
                y_grid = y_grid[different_indices]

                # A pair is violated when the clustering and the targets
                # disagree on whether its two observations belong together.
                # The mean over pairs gives one violation rate per run.
                same_cluster = predictions[:, x_grid] == predictions[:, y_grid]
                same_target = targets[x_grid] == targets[y_grid]
                violations = (same_cluster != same_target).mean(1)

                for j in range(len(predictions)):
                    # Entry for the model itself
                    all_regularisations.append({
                        "Model": model,
                        "Dataset": dataset_name,
                        "Run": j,
                        "Run_constraints": i,
                        "Regularisation": violations[j],
                        "n": n_constraint
                    })
                    # Same entry under the pooled "mix" model, with run
                    # indices shifted so that models do not collide
                    all_regularisations.append({
                        "Model": "mix",
                        "Dataset": dataset_name,
                        "Run": j + mix_run_offset,
                        "Run_constraints": i,
                        "Regularisation": violations[j],
                        "n": n_constraint
                    })
        mix_run_offset += len(predictions)

    if not all_regularisations:
        # No clustering result was found for this dataset. Writing an empty
        # CSV would both break the later pd.read_csv of the constraint folder
        # and make the existence check above skip this dataset forever.
        print("\tNo clustering results found, nothing written")
        print("---")
        continue

    pd.DataFrame(all_regularisations).to_csv(regularisation_file, index=False)
    print("---")

# Step 2 - Report correlations

In [None]:
# Load the raw scores: every CSV of the result folder, stacked into one frame
score_files = glob(os.path.join(result_folder, "*.csv"))
score_frames = [pd.read_csv(path) for path in score_files]
scores_df = pd.concat(score_frames, ignore_index=True)
# Discard the scores that are not part of this analysis
excluded_scores = ["DISCO_H", "DISCO_TV"]
is_excluded = scores_df.Score.isin(excluded_scores)
scores_df = scores_df[~is_excluded]

In [None]:
# Load the regularisations: every CSV of the constraint folder, stacked into one frame
regularisation_files = glob(os.path.join(constraint_folder, "*.csv"))
regularisation_frames = [pd.read_csv(path) for path in regularisation_files]
regularisation_df = pd.concat(regularisation_frames, ignore_index=True)

In [None]:
# Join scores and regularisations on (Model, Dataset, Run), then penalise each
# score by subtracting its regularisation (value = score - regularisation).
# The external metric, the ARI, is left untouched.
merge_keys = ["Model", "Dataset", "Run"]
df = scores_df.merge(regularisation_df, on=merge_keys, how="inner")
non_ari_scores = df.Score != "ARI"
df.loc[non_ari_scores, "Value"] = df.loc[non_ari_scores, "Value"].sub(df.loc[non_ari_scores, "Regularisation"])
# The regularisation is now folded into Value and no longer needed
df = df.drop(columns=["Regularisation"])

In [None]:
# The unconstrained scores are added as the n = 0 entries: tag them with
# n = 0 and a dummy constraint-draw index, then stack them under the
# constrained rows.
scores_df = scores_df.assign(n=0, Run_constraints=0)
df = pd.concat([df, scores_df], axis=0, ignore_index=True)

In [None]:
# Reshape to one row per (Dataset, Model, Run_constraints, Run, n) and one
# column per score, so the scores can be correlated against each other.
# This yields one correlation value per Dataset/Model/Run_constraints/n.
pivot_index = ["Dataset", "Model", "Run_constraints", "Run", "n"]
df = df.pivot(index=pivot_index, columns="Score", values="Value")
df = df.reset_index()

In [None]:
# Kendall correlation between every pair of score columns, computed across
# runs within each (Dataset, Model, Run_constraints, n) group
group_keys = ["Dataset", "Model", "Run_constraints", "n"]
pairwise_corr = df.drop(columns="Run").groupby(group_keys).corr(method="kendall")
# Keep only the correlations against the ARI, as a flat table
correlations_df = pairwise_corr["ARI"].rename("Correlation").reset_index()

In [None]:
# One figure per model: correlation with the ARI against the number of
# constrained observations, one line per score. The ARI row itself is dropped.
internal_correlations = correlations_df[correlations_df.Score != "ARI"]
for model_name, subdf in internal_correlations.groupby("Model"):
    print(model_name)
    # Draw on an explicit figure/axes instead of the implicit pyplot state, so
    # curves of different models can never end up on the same axes.
    fig, ax = plt.subplots()
    # NOTE(review): the "lung" dataset is left out of the plots; the reason is
    # not recorded in this notebook.
    sns.lineplot(data=subdf[subdf.Dataset != "lung"], x="n", y="Correlation", hue="Score", ax=ax)
    ax.set_xlabel("Constrained observations")
    # Save before showing: showing may dispose of the figure
    fig.savefig(f"regularisation_{model_name}.pdf")
    plt.show()
    # Release the figure so they do not accumulate in memory across models
    plt.close(fig)