In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import adjusted_rand_score, fowlkes_mallows_score, normalized_mutual_info_score
from sklearn.metrics import jaccard_score, f1_score
from sklearn.datasets import make_blobs
import numpy as np
import pandas as pd
from tabulate import tabulate
from IPython.display import display, HTML

# Set up parameters for the experiment
num_samples = 1000
num_clusters_list = [2, 3, 4, 5, 6]
num_metrics = 6

# Initialize empty array to hold evaluation scores
scores = np.zeros((len(num_clusters_list), num_metrics))

# Generate data and fit k-means for each number of clusters
for i, num_clusters in enumerate(num_clusters_list):
    X, y = make_blobs(n_samples=num_samples, centers=num_clusters, random_state=42)
    kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(X)
    
    # Compute clustering evaluation scores
    ari = adjusted_rand_score(y, kmeans.labels_)
    fm = fowlkes_mallows_score(y, kmeans.labels_)
    nmi = normalized_mutual_info_score(y, kmeans.labels_)
    jacc = jaccard_score(y, kmeans.labels_, average='macro')
    f1 = f1_score(y, kmeans.labels_, average='macro')
    purity = sum([max(np.bincount(y[kmeans.labels_ == j])) for j in range(num_clusters)]) / len(X)
    
    # Store scores in array
    scores[i, :] = [ari, f1, fm, jacc, nmi, purity]

# Create table of evaluation scores
scores_df = pd.DataFrame(scores, index=num_clusters_list, columns=["Adjusted Rand Index", "F-Measure", "Fowlkes-Mallows Index", "Jaccard Index", "Normalized Mutual Information", "Purity"])

# Display table of evaluation scores
display(scores_df)


Unnamed: 0,Adjusted Rand Index,F-Measure,Fowlkes-Mallows Index,Jaccard Index,Normalized Mutual Information,Purity
2,1.0,1.0,1.0,1.0,1.0,1.0
3,1.0,0.333333,1.0,0.333333,1.0,1.0
4,0.997331,0.500998,0.997996,0.5005,0.995296,0.999
5,0.951966,0.590072,0.961535,0.580995,0.946858,0.98
6,0.88337,0.166667,0.902745,0.166667,0.901056,0.946


In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import calinski_harabasz_score, normalized_mutual_info_score, silhouette_score, davies_bouldin_score
from sklearn.datasets import make_blobs

import pandas as pd
from IPython.display import display

# Generate random data with 4 clusters
X, y = make_blobs(n_samples=1000, centers=4, random_state=42)

# Set up evaluation metrics
metrics = {
    'BetaCV': calinski_harabasz_score,
    'Normalized Cut': normalized_mutual_info_score,
    'Dunn Index': None,
    'Davies-Bouldin Index': davies_bouldin_score,
    'Silhouette Coefficient': silhouette_score,
}

# Calculate evaluation metrics for different number of clusters
results = []
for k in range(2, 7):
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(X)

    # Calculate evaluation metrics
    metrics_values = []
    for metric_name, metric_function in metrics.items():
        if metric_function is None:
            # Calculate Dunn Index
            intra_cluster_distances = [((X[labels == i] - kmeans.cluster_centers_[i])**2).sum(axis=1).mean() for i in range(k)]
            min_inter_cluster_distance = min([((kmeans.cluster_centers_[i] - kmeans.cluster_centers_[j])**2).sum() for i in range(k) for j in range(i+1, k)])
            metrics_values.append(min_inter_cluster_distance / max(intra_cluster_distances))
        elif metric_name == 'Normalized Cut':
            # Calculate Normalized Cut
            metrics_values.append(metric_function(y, labels))
        else:
            # Calculate other metrics
            metrics_values.append(metric_function(X, labels))

    results.append((k,) + tuple(metrics_values))

# Display results in a table
df = pd.DataFrame(results, columns=['k'] + list(metrics.keys()))
display(df)


Unnamed: 0,k,BetaCV,Normalized Cut,Dunn Index,Davies-Bouldin Index,Silhouette Coefficient
0,2,1117.214107,0.57716,4.550972,0.528739,0.589579
1,3,4055.190249,0.857143,11.465379,0.360277,0.752196
2,4,11066.971284,0.995296,21.186364,0.29233,0.791598
3,5,9161.638549,0.936751,1.457711,0.676767,0.65294
4,6,8151.879392,0.884784,1.438138,0.900089,0.531362
