In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [2]:
# Load the feature table (presumably a bag-of-words matrix, judging by the
# file name). The path is resolved relative to the notebook's working directory.
data = pd.read_csv('bagOfWordsdunn.csv')

# Reduce the features to 2 principal components. The seed is pinned because
# PCA's default solver selection may pick the randomized SVD on large inputs,
# which would otherwise make the projection differ from run to run.
pca = PCA(n_components=2, random_state=42)

# NOTE: `df` holds the transformed coordinates returned by fit_transform,
# which the later cells index positionally like an array.
df = pca.fit_transform(data)

In [3]:
# Partition the projected points into k groups. The fixed seed and explicit
# n_init keep the assignment repeatable across runs.
k = 5
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42).fit(df)
labels = kmeans.labels_               # cluster id assigned to each row of df
centroids = kmeans.cluster_centers_   # one centre per cluster

In [4]:
def euclidean_distance(point1, point2):
    """Return the Euclidean (L2) distance between two points.

    Parameters
    ----------
    point1, point2 : array-like
        Coordinates of the two points. Lists, tuples and NumPy arrays are
        accepted; both must have the same shape (or be broadcastable).

    Returns
    -------
    numpy floating scalar
        The norm of ``point1 - point2`` as computed by ``np.linalg.norm``.
    """
    first = np.asarray(point1)
    second = np.asarray(point2)

    # Integer arithmetic is done in the input dtype, so unsigned or narrow
    # integer coordinates would wrap around on subtraction (e.g. uint8 0 - 3
    # becomes 253) and bool arrays cannot be subtracted at all. Promote those
    # to float first; float and complex inputs are left untouched.
    if first.dtype.kind in "bui":
        first = first.astype(np.float64)
    if second.dtype.kind in "bui":
        second = second.astype(np.float64)

    return np.linalg.norm(first - second)

def calc_dunn_index(data, labels, centroids):
    """Compute a centroid-based Dunn index for a clustering.

    The score is the smallest Euclidean distance between any two cluster
    centroids divided by the largest Euclidean distance between two points
    that share a cluster (the largest cluster diameter).

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        The clustered points.
    labels : array-like of shape (n_samples,)
        Cluster label of every row of ``data``.
    centroids : array-like of shape (n_centroids, n_features)
        Cluster centres. If there is exactly one row per distinct label, rows
        are matched to the sorted distinct labels by position. Otherwise
        (for instance when some cluster received no points) each label is used
        as a row index into ``centroids``, so centres of empty clusters are
        ignored instead of being paired with the wrong label.

    Returns
    -------
    numpy.float64
        ``min centroid distance / max cluster diameter``. If every cluster
        with two or more members consists of identical points the division is
        by zero and NumPy's usual inf/nan result is returned.

    Raises
    ------
    ValueError
        If fewer than two distinct labels are present, or if no cluster has
        at least two points (the diameter is then undefined).
    IndexError
        If the number of centroid rows differs from the number of distinct
        labels and the labels cannot be used as row indices into ``centroids``.
    """
    points = _as_float_safe_array(data)
    labels = np.asarray(labels)
    centroids = _as_float_safe_array(centroids)
    unique_clusters = np.unique(labels)

    if len(unique_clusters) < 2:
        raise ValueError(
            "Dunn index requires at least two clusters, "
            f"got {len(unique_clusters)}"
        )

    active_centroids = _centroids_for_labels(centroids, unique_clusters)

    # Smallest centroid-to-centroid distance: compare each centroid with all
    # the ones after it, one vectorised row at a time.
    min_inter_cluster_distance = min(
        np.min(np.linalg.norm(active_centroids[i + 1:] - active_centroids[i], axis=1))
        for i in range(len(active_centroids) - 1)
    )

    # Largest within-cluster distance; singleton clusters have no pairs and
    # therefore contribute nothing.
    diameters = []
    for cluster in unique_clusters:
        diameter = _max_pairwise_distance(points[labels == cluster])
        if diameter is not None:
            diameters.append(diameter)

    if not diameters:
        raise ValueError(
            "Dunn index is undefined: no cluster contains two or more points"
        )
    max_intra_cluster_distance = max(diameters)

    return min_inter_cluster_distance / max_intra_cluster_distance


def _as_float_safe_array(values):
    """Convert to an ndarray, promoting integer/bool data to float64 so that
    coordinate differences cannot wrap around."""
    arr = np.asarray(values)
    if arr.dtype.kind in "bui":
        arr = arr.astype(np.float64)
    return arr


def _centroids_for_labels(centroids, unique_clusters):
    """Return one centroid row per distinct label, in sorted-label order."""
    if len(centroids) == len(unique_clusters):
        return centroids
    usable_as_index = (
        unique_clusters.dtype.kind in "iu"
        and unique_clusters.min() >= 0
        and unique_clusters.max() < len(centroids)
    )
    if not usable_as_index:
        raise IndexError(
            f"cannot match {len(unique_clusters)} distinct labels to "
            f"{len(centroids)} centroid rows"
        )
    return centroids[unique_clusters]


def _max_pairwise_distance(members):
    """Return the largest distance between any two rows, or None if there are
    fewer than two rows."""
    if len(members) < 2:
        return None
    # One vectorised pass per row keeps memory at O(n) rather than storing
    # every pairwise distance.
    return max(
        np.max(np.linalg.norm(members[i + 1:] - members[i], axis=1))
        for i in range(len(members) - 1)
    )

# Score the k-means partition of the projected points and report the result.
dunn_index = calc_dunn_index(data=df, labels=labels, centroids=centroids)
print(f"Dunn Index: {dunn_index}")



Dunn Index: 0.3270904161703382
