Datasets courtesy of https://github.com/deric/clustering-benchmark

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import urllib.request
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.preprocessing import StandardScaler

from pycvi.cluster import generate_all_clusterings
from pycvi.cvi import Inertia, GapStatistic, ScoreFunction, Hartigan, Diameter, CalinskiHarabasz, Silhouette, CVIs
from pycvi.compute_scores import compute_all_scores
import warnings
warnings.filterwarnings("ignore")

from utils import (
    arff_from_github, load_data_from_github, get_data_labels ,URL_ROOT,
    write_list_datasets, UNIMODAL, UNLABELED, INVALID, UNKNOWN_K,
    print_heads, TOO_MANY_LABELS, TOO_MANY_SAMPLES, INVALID
)

In [3]:
# Benchmark collection to work on; the value is interpolated into the data
# location and the results directory defined just below.
DATA_SOURCE = "artificial"
#DATA_SOURCE = "real-world"
# Location of the collection's files, built on URL_ROOT (imported from utils).
PATH = f"{URL_ROOT}{DATA_SOURCE}/"
# Directory prefix used when naming the files this notebook can write.
RES_DIR = f'../res/{DATA_SOURCE}/'

In [4]:
# Index file listing one dataset file name per line for the chosen collection.
fname = f'{URL_ROOT}{DATA_SOURCE}.txt'

# Read the index inside a context manager so the HTTP response is closed
# even when decoding fails; a bare urlopen() call leaves it open.
with urllib.request.urlopen(fname) as response:
    all_datasets = [line.decode('utf-8').strip() for line in response]

print(len(all_datasets))
all_datasets


122


['mopsi-finland.arff',
 'long3.arff',
 'zelnik1.arff',
 'insect.arff',
 'disk-4500n.arff',
 'disk-4000n.arff',
 'cure-t0-2000n-2D.arff',
 'circle.arff',
 'zelnik6.arff',
 'elly-2d10c13s.arff',
 'zelnik3.arff',
 'disk-4600n.arff',
 'aggregation.arff',
 'donutcurves.arff',
 'dartboard1.arff',
 's-set3.arff',
 'disk-1000n.arff',
 'complex8.arff',
 'ds4c2sc8.arff',
 'square4.arff',
 'square5.arff',
 'dpb.arff',
 'dense-disk-3000.arff',
 'dpc.arff',
 'DS-577.arff',
 'cluto-t5-8k.arff',
 'jain.arff',
 '2dnormals.arff',
 'target.arff',
 'cluto-t4-8k.arff',
 'pmf.arff',
 'blobs.arff',
 'banana.arff',
 'sizes2.arff',
 '2d-3c-no123.arff',
 'sizes5.arff',
 'pathbased.arff',
 'cure-t2-4k.arff',
 'triangle2.arff',
 'square2.arff',
 'hypercube.arff',
 '2d-4c-no9.arff',
 'twenty.arff',
 'hepta.arff',
 'smile3.arff',
 'spherical_4_3.arff',
 'golfball.arff',
 'dense-disk-5000.arff',
 'twodiamonds.arff',
 'birch-rg3.arff',
 'donut2.arff',
 'cuboids.arff',
 'elliptical_10_2.arff',
 'sizes3.arff',
 'D31.a

## Save list and headers of all original datasets

In [5]:
# File that would receive the complete list of dataset names; the write
# itself is currently disabled.
list_fname = f'{RES_DIR}all_datasets-{DATA_SOURCE}.txt'
#write_list_datasets(list_fname, all_datasets)

In [None]:
#print_heads(fnames=all_datasets, path=PATH)

### Datasets that are multimodal and labeled

In [6]:
# Keep every dataset that is listed neither in UNIMODAL nor in UNLABELED.
filenames = list(
    filter(lambda name: name not in UNIMODAL+UNLABELED, all_datasets)
)
print(len(filenames))
filenames

114


['long3.arff',
 'zelnik1.arff',
 'insect.arff',
 'disk-4500n.arff',
 'disk-4000n.arff',
 'cure-t0-2000n-2D.arff',
 'circle.arff',
 'zelnik6.arff',
 'elly-2d10c13s.arff',
 'zelnik3.arff',
 'disk-4600n.arff',
 'aggregation.arff',
 'donutcurves.arff',
 'dartboard1.arff',
 'disk-1000n.arff',
 'complex8.arff',
 'ds4c2sc8.arff',
 'square4.arff',
 'square5.arff',
 'dpb.arff',
 'dense-disk-3000.arff',
 'dpc.arff',
 'DS-577.arff',
 'cluto-t5-8k.arff',
 'jain.arff',
 '2dnormals.arff',
 'target.arff',
 'cluto-t4-8k.arff',
 'pmf.arff',
 'blobs.arff',
 'banana.arff',
 'sizes2.arff',
 '2d-3c-no123.arff',
 'sizes5.arff',
 'pathbased.arff',
 'cure-t2-4k.arff',
 'triangle2.arff',
 'square2.arff',
 'hypercube.arff',
 '2d-4c-no9.arff',
 'twenty.arff',
 'hepta.arff',
 'smile3.arff',
 'spherical_4_3.arff',
 'dense-disk-5000.arff',
 'twodiamonds.arff',
 'donut2.arff',
 'cuboids.arff',
 'elliptical_10_2.arff',
 'sizes3.arff',
 'D31.arff',
 'compound.arff',
 'long1.arff',
 'long2.arff',
 'curves2.arff',
 'dis

In [None]:
# print_heads(fnames=filenames, path=PATH)

## Save list of datasets that are suitable for experiments

In [None]:
# Union (without duplicates) of the four exclusion lists imported from utils.
not_for_exp = list(
    set(UNKNOWN_K + TOO_MANY_LABELS + TOO_MANY_SAMPLES + INVALID)
)
# Datasets that survive the exclusion, in the order of `all_datasets`.
fname_exp_theory = list(
    filter(lambda name: name not in not_for_exp, all_datasets)
)
print(len(fname_exp_theory))
fname_exp_theory

In [None]:
# File that would receive the list of datasets kept for the experiments; the
# write itself is currently disabled.
exp_theory_list_fname = f'{RES_DIR}datasets_experiments_theory-{DATA_SOURCE}.txt'
#write_list_datasets(exp_theory_list_fname, fname_exp_theory)

In [None]:
# Complement of the experiment list: datasets present in `not_for_exp`,
# kept in the order of `all_datasets`.
excluded = list(filter(lambda name: name in not_for_exp, all_datasets))
excluded_list_fname = f'{RES_DIR}datasets_excluded-{DATA_SOURCE}.txt'
#write_list_datasets(excluded_list_fname, excluded)

## Saving data and labels 

In [7]:
# Datasets whose points and labels are exported as headerless CSV files in
# the current directory.
dataset_names = ["zelnik1", "target", "long1", "xclara", "banana", "diamond9"]

for d in dataset_names:
    data, labels, n_labels, meta = get_data_labels(
        f"{URL_ROOT}artificial/{d}.arff", path=""
    )
    # labels = labels.astype(float)
    # Labels are written first, then the data, one file each.
    labels_frame = pd.DataFrame(labels)
    labels_frame.to_csv(f"./{d}_labels.csv", header=False, index=False)
    data_frame = pd.DataFrame(data)
    data_frame.to_csv(f"./{d}_data.csv", header=False, index=False)

## Using one example

In [None]:
# Load a single dataset, wrap it in a DataFrame and look at its "x"/"y"
# columns as a scatter plot before displaying the table itself.
data, meta = arff_from_github(f'{URL_ROOT}artificial/diamond9.arff')
df = pd.DataFrame(data)
df.plot.scatter(x="x", y="y")
df

In [None]:
# Only the last expression of the cell is displayed: one boolean per column
# telling whether its dtype compares equal to "int".
df.dtypes
list(map(lambda dtype: dtype == "int", df.dtypes))

In [None]:
def plot_clusters_old(data, clusterings, titles):
    """Scatter the first two columns of ``data`` on a fixed 2x4 grid.

    One panel is drawn per clustering, limited to the 8 panels of the grid.
    ``clusterings[i]`` is iterated as a collection of clusters, each used to
    index the rows of ``data``; ``str(titles[i])`` is the panel title.

    Returns the figure together with the LAST axis that was drawn on (not
    the whole grid), so calling this with no clustering fails at the
    ``return`` statement.
    """
    fig, axes = plt.subplots(
        nrows=2, ncols=4, sharex=True, sharey=True, figsize=(15,10),
        tight_layout=True
    )
    used_axes = axes.flat[:len(clusterings)]
    for position, ax in enumerate(used_axes):
        # Each cluster gets its own scatter call, hence its own colour
        for members in clusterings[position]:
            xs = data[members, 0]
            ys = data[members, 1]
            ax.scatter(xs, ys, s=0.5)
        ax.set_title(str(titles[position]))
    return fig, ax

def plot_clusters(data, clusterings, titles):
    """Draw one scatter panel per clustering, for 2D or 3D data.

    Parameters
    ----------
    data : array with a 2-element ``shape``
        Indexed as ``data[cluster, j]``; the second axis holds the 2 or 3
        coordinates that are plotted.
    clusterings : sequence
        ``clusterings[i]`` is an iterable of clusters, each cluster being
        usable as an index on the first axis of ``data``.
    titles : sequence
        ``str(titles[i])`` is used as the title of panel ``i``.

    Returns
    -------
    The matplotlib figure holding all the panels.

    Raises
    ------
    ValueError
        If ``data`` has neither 2 nor 3 columns. This check runs before any
        figure is created.
    """
    # Some datasets are in 3D
    _, d = data.shape
    if d not in (2, 3):
        raise ValueError(
            "plot_clusters only supports 2D or 3D data, got {} dimensions"
            .format(d)
        )

    # Keep the historical 2x4 layout (and 15x10 size) for up to 8 panels and
    # add rows beyond that, so that no clustering overflows the grid.
    n_cols = 4
    n_rows = max(2, -(-len(clusterings) // n_cols))
    figsize = (15, 5 * n_rows)

    if d == 2:
        fig, axes = plt.subplots(
            nrows=n_rows, ncols=n_cols, sharex=True, sharey=True,
            figsize=figsize, tight_layout=True
        )
    else:
        # 3D axes are created one by one in the loop below
        fig = plt.figure(figsize=figsize, tight_layout=True)

    # Plot the clustering selected by a given score
    for i, clustering in enumerate(clusterings):
        if d == 2:
            ax = axes.flat[i]
        else:
            ax = fig.add_subplot(n_rows, n_cols, i+1, projection='3d')
        # Plot clusters one by one
        for cluster in clustering:
            coordinates = [data[cluster, j] for j in range(d)]
            ax.scatter(*coordinates, s=0.5)
        ax.set_title(str(titles[i]))
    return fig

In [None]:
# Every column except the last one (presumably the class label -- confirm
# against the dataset header).
df_bis = df.iloc[:, :-1]
df_bis

In [None]:
# Convert the DataFrame to a NumPy array and display it.
X = df_bis.to_numpy()
X

In [None]:
# Candidate numbers of clusters: 0 to 14 inclusive.
n_clusters_range = list(range(15))

In [None]:
# Data and number of distinct labels of every selected dataset, in the same
# order as `l_fname`.
l_data = []
l_n_labels = []
l_fname = [
    # "diamond9.arff",
    "tetra.arff",
    # "xclara.arff",
    # "birch-rg1.arff",
    # "golfball.arff",
]
for fname in l_fname:
    with_labels = fname not in UNLABELED
    data, labels, meta = load_data_from_github(
        PATH + fname, with_labels=with_labels
    )
    # Datasets without labels are counted as a single group
    n_labels = len(np.unique(labels)) if with_labels else 1
    l_data.append(data)
    l_n_labels.append(n_labels)
    print(len(data), n_labels)


In [None]:
def experiment(X):
    """Cluster ``X`` for every k, let each CVI pick a k, and plot the picks.

    Clusterings are generated with ``AgglomerativeClustering`` for every
    value of the module-level ``n_clusters_range``, using a
    ``StandardScaler`` and ``DTW=False``. For each class in ``CVIs`` the
    scores of all clusterings are printed, followed by the selected k.
    ``GapStatistic`` and ``Silhouette`` are skipped when ``X`` has more than
    10000 samples. The selected clusterings are finally drawn with
    ``plot_clusters`` and the figure is saved to ``./tmp``.

    Parameters
    ----------
    X : array
        Dataset to cluster; ``len(X)`` is taken as the number of samples and
        ``X`` is forwarded unchanged to the pycvi helpers and to
        ``plot_clusters``.

    Returns
    -------
    None
    """
    N = len(X)

    clusterings = generate_all_clusterings(
            X,
            AgglomerativeClustering,
            n_clusters_range,
            DTW=False,
            scaler=StandardScaler(),
        )
    selected_clusterings = []
    # CVIs that actually produced a selection. Kept in step with
    # `selected_clusterings` so that each panel is titled with the CVI that
    # chose it, even when some CVIs are skipped.
    selected_cvis = []

    for s in CVIs:
        score = s()
        print(" ================ {} ================ ".format(str(score)))
        if N > 10000 and s in [GapStatistic, Silhouette]:
            print("Dataset too big for {}".format(score))
            continue

        scores = compute_all_scores(
            score,
            X,
            clusterings,
            DTW=False,
            scaler=StandardScaler(),
        )

        # Only the first entry of the results is used here (presumably the
        # single entry produced when DTW is disabled -- confirm with pycvi).
        for k in n_clusters_range:
            print(k, scores[0][k])

        selected_k = score.select(scores)[0]
        selected_clusterings.append(clusterings[0][selected_k])
        selected_cvis.append(s)
        print("Selected k {}".format(selected_k))

    fig = plot_clusters(X, selected_clusterings, selected_cvis)
    fig.savefig("./tmp")


In [None]:
# Run the experiment on every loaded dataset, announcing its file name and
# its number of distinct labels first.
for i in range(len(l_data)):
    X = l_data[i]
    dataset_banner = " ---------------- DATASET {} ---------------- "
    true_k_banner = " --------------------- True k: {} --------------------- "
    print(dataset_banner.format(l_fname[i]))
    print(true_k_banner.format(l_n_labels[i]))
    experiment(X)