In [1]:
from sklearn.cluster import KMeans
import numpy as np
import sklearn
from munkres import Munkres
from sklearn.metrics import confusion_matrix

# The following functions relating to print_clustering_accuracy were modified from https://github.com/KlugerLab/SpectralNet
def calculate_cost_matrix(C, n_clusters):
   '''
   Builds the square assignment-cost matrix from a confusion matrix.

   C:            2-D array; column j is read as the counts for cluster j
   n_clusters:   number of rows and columns of the returned matrix

   returns:    float array of shape (n_clusters, n_clusters) in which
               entry [j, i] equals (sum of column j of C) - C[i, j]
   '''
   row_ids = np.arange(n_clusters)
   costs = np.zeros((n_clusters, n_clusters))

   for col in range(n_clusters):
      column = C[:, col]
      # total count in column `col`, minus the count shared with each row
      costs[col, :] = np.sum(column) - column[row_ids]
   return costs


def get_cluster_labels_from_indices(indices):
   '''
   Extracts the second element of every pair in `indices`.

   indices:    sequence of indexable pairs (e.g. the (row, column)
               pairs produced by Munkres().compute)

   returns:    1-D float array whose k-th entry is indices[k][1]
   '''
   assigned = [pair[1] for pair in indices]
   return np.array(assigned, dtype=float)


def get_y_preds(cluster_assignments, y_true, n_clusters):
   '''
   Computes the predicted labels, where label assignments now
   correspond to the actual labels in y_true (as estimated by Munkres)

   cluster_assignments:    integer array of cluster ids, outputted by kmeans;
                           it is used directly as an index, so ids must lie
                           in 0..n_clusters-1
   y_true:                 true labels
   n_clusters:             number of clusters in the dataset

   returns:    a tuple containing the predicted labels (a float array with
               one entry per element of cluster_assignments) and the
               confusion matrix, in that order
   '''
   # Pin the label set so the confusion matrix is always n_clusters x n_clusters.
   # With labels=None the matrix only covers labels that actually occur, so a
   # label id that is absent from both inputs makes it too small for the
   # cost-matrix computation below.
   label_ids = np.arange(n_clusters)
   conf_mat = sklearn.metrics.confusion_matrix(y_true, cluster_assignments, labels=label_ids)
   # compute the optimal 1:1 assignment of clusters to labels
   cost_matrix = calculate_cost_matrix(conf_mat, n_clusters)
   indices = Munkres().compute(cost_matrix)
   kmeans_to_true_cluster_labels = get_cluster_labels_from_indices(indices)
   # translate every sample's cluster id into its matched label
   y_pred = kmeans_to_true_cluster_labels[cluster_assignments]
   return y_pred, conf_mat


def get_clustering_accuracy(cluster_assignments, y_true, n_clusters):
   '''
   Scores kmeans cluster assignments against the true labels after the
   clusters have been matched to labels by get_y_preds (Munkres algorithm).

   cluster_assignments:    array of labels, outputted by kmeans
   y_true:                 true labels
   n_clusters:             number of clusters in the dataset

   returns:    (accuracy, confusion matrix), where accuracy is the mean
               of the element-wise comparison of the matched predictions
               with y_true
   '''
   matched_preds, conf_mat = get_y_preds(cluster_assignments, y_true, n_clusters)
   is_correct = matched_preds == y_true
   accuracy = np.mean(is_correct)
   return accuracy, conf_mat


def print_clustering_accuracy(cluster_assignments, y_true, n_clusters, extra_identifier='', aug_name="original"):
   """
   Convenience function: returns the clustering accuracy.

   Despite its name, this function does not print anything; it only
   returns the accuracy computed by get_clustering_accuracy.

   cluster_assignments:    array of labels, outputted by kmeans
   y_true:                 true labels; squeezed with np.squeeze before use
   n_clusters:             number of clusters in the dataset
   extra_identifier:       unused; kept so existing callers keep working
   aug_name:               unused; kept so existing callers keep working

   returns:    the clustering accuracy
   """
   y_true = np.squeeze(y_true)
   # get_clustering_accuracy already runs the cluster-to-label matching, so
   # the matching is done once here rather than in a separate, discarded call.
   clustering_accuracy, _ = get_clustering_accuracy(cluster_assignments, y_true, n_clusters)
   return clustering_accuracy


In [2]:
from matplotlib import pyplot as plt

num_clusters = 3
num_runs = 2
num_points = 100

# Seeded generator so the random points and targets are the same on every
# Restart & Run All.
rng = np.random.default_rng(0)

# Loop-invariant helpers: the candidate target labels, and a lookup that maps
# an integer cluster id to the value handed to matplotlib's `c=` argument.
choices = np.arange(num_clusters)
colormap = np.arange(10)

for i in range(num_runs):
    random_embeddings = rng.uniform(0, 1, (num_points, 2))
    targets = rng.choice(choices, (num_points,))

    print(random_embeddings.shape)
    kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(random_embeddings)
    clustering_accuracy = print_clustering_accuracy(kmeans.labels_, targets, num_clusters)

    if i == 0:
        # Keep the first run's points together with its labels and centers;
        # the labels are only meaningful for the points they were fitted on.
        prev_embeddings = random_embeddings
        prev_labels = kmeans.labels_
        prev_centers = kmeans.cluster_centers_

# Last run: points coloured by their own cluster labels.
plt.scatter(random_embeddings[:,0], random_embeddings[:,1], c=colormap[kmeans.labels_])
plt.show()

# First run: plot the first run's points (not the last run's) with the
# first run's labels.
plt.scatter(prev_embeddings[:,0], prev_embeddings[:,1], c=colormap[prev_labels])
plt.show()


# print(clustering_accuracy)
# prev_labels = kmeans.labels_

# new_embeddings = np.random.uniform(0,1,(1000,10))
# kmeans = KMeans(n_clusters=10, random_state=0).fit(new_embeddings)
# clustering_accuracy = print_clustering_accuracy(kmeans.labels_, targets, 10)
# clustering_accuracy

(100, 2)
(100, 2)


<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

In [3]:
# Cluster labels from the most recent KMeans fit (the last loop iteration above).
kmeans.labels_

array([1, 2, 1, 0, 0, 1, 0, 1, 0, 1, 1, 2, 2, 1, 2, 0, 1, 2, 1, 1, 0, 2,
       1, 1, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 1, 2, 0, 0, 1, 2, 1, 2, 2, 2,
       2, 0, 1, 0, 0, 1, 0, 0, 2, 0, 2, 1, 0, 0, 2, 2, 1, 1, 0, 0, 0, 2,
       1, 1, 2, 0, 1, 1, 1, 2, 2, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 2, 0,
       1, 2, 2, 2, 2, 1, 1, 0, 1, 2, 1, 0], dtype=int32)

In [4]:
# Cluster labels saved from the first loop iteration (i == 0) above.
prev_labels

array([0, 0, 0, 0, 1, 2, 2, 2, 0, 0, 2, 1, 1, 2, 2, 1, 0, 0, 2, 0, 1, 0,
       1, 1, 2, 0, 2, 1, 2, 0, 0, 0, 2, 2, 1, 0, 1, 0, 2, 1, 0, 1, 2, 1,
       0, 0, 1, 0, 1, 2, 0, 2, 2, 0, 0, 1, 1, 0, 2, 2, 1, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 2, 0, 0, 0, 2, 1, 2, 0, 0, 0, 0, 0, 0, 0, 2, 1,
       2, 2, 2, 2, 2, 0, 2, 0, 1, 1, 2, 1], dtype=int32)

In [5]:
# Cluster centers from the most recent KMeans fit (the last loop iteration above).
kmeans.cluster_centers_

array([[0.74275004, 0.62795819],
       [0.44599219, 0.20078523],
       [0.24491335, 0.76531003]])

In [6]:
# Cluster centers saved from the first loop iteration (i == 0) above.
prev_centers

array([[0.2559732 , 0.38546094],
       [0.77704926, 0.22768785],
       [0.66375196, 0.80656566]])

In [7]:
n = 3
# Cross-tabulate the labels saved from the first run (rows) against the labels
# of the last run (columns).
# NOTE(review): the two runs clustered different random point sets, so the
# labels are paired only by sample position -- confirm this comparison is intended.
c_matrix = confusion_matrix(prev_labels, kmeans.labels_)
print(c_matrix)

# TODO: a `for col in range(n):` loop followed here, but its body was left
# unfinished (`n = ` with no right-hand side). That is a SyntaxError which kept
# the whole cell from running, and it also rebound the loop-bound name `n`.
# Re-add the loop once the intended per-column computation is decided.


[[20 14  9]
 [11  8  8]
 [ 7 12 11]]
