In [1]:
import numpy as np
from numpy import genfromtxt
import matplotlib.pyplot as plt
import random
import sys

In [2]:
# Parse the comma-separated raw digits file into a 2-D float array.
# Later cells read column 1 as the class label and columns 2+ as pixel values.
raw_digits = np.genfromtxt("digits-raw.csv", delimiter=",")

In [9]:
# Write one randomly picked sample per digit class (0-9) to "<class>.png".
for i in range(10):
    # Column 1 is compared against the class id to keep only rows of class i.
    is_class_i = raw_digits[:, 1] == i
    class_i_digits = raw_digits[is_class_i]

    # Random row position inside the class subset (unseeded, so it varies per run).
    digit = np.random.choice(len(class_i_digits))

    # Everything from column 2 onward is reshaped into a 28x28 image.
    image = class_i_digits[digit][2:].reshape((28, 28))
    plt.imsave(str(i) + '.png', image, format='png', cmap='Greys', dpi=300)

In [10]:
# Parse the comma-separated embedding file into a 2-D float array.
# Later cells read column 1 as the class label and columns 2-3 as the 2-D coordinates.
digits_embedding = np.genfromtxt("digits-embedding.csv", delimiter=",")

In [None]:
# Draw 1000 row indices into digits_embedding for plotting.
# randint samples independently, so the same row can be picked more than once.
visualize_egs = np.random.randint(low=0, high=len(digits_embedding), size=1000)

In [None]:
# Scatter-plot the sampled embedding rows, one series per digit class.
class_labels = list(range(10))
for i in range(10):
    # Rows of the sample whose label (column 1) equals the current class.
    rows_of_class = [
        digits_embedding[eg] for eg in visualize_egs if digits_embedding[eg][1] == i
    ]
    # Columns 2 and 3 are the plotted coordinates.
    x_axis = [row[2] for row in rows_of_class]
    y_axis = [row[3] for row in rows_of_class]
    plt.scatter(x_axis, y_axis, label=i)

plt.legend(class_labels, loc='best', fontsize=8, bbox_to_anchor=(1, 1))
plt.show()

In [None]:
def get_squared_distance(p1, p2):
    """Return the squared Euclidean distance between two points.

    The components are walked by the indices of ``p1``; the same index is read
    from ``p2``, so ``p2`` needs at least ``len(p1)`` components (extra
    components of ``p2`` are ignored). An empty ``p1`` yields ``0``.
    """
    # sum() starts at 0 and adds the terms in index order.
    return sum((p1[axis] - p2[axis]) ** 2 for axis in range(len(p1)))

In [None]:
def get_cluster_id(centroids, feature):
    """Return the index of the centroid closest to ``feature``.

    Closeness is measured with ``get_squared_distance``. On ties the earliest
    centroid wins because only a strictly smaller distance replaces the
    current best. Returns ``-1`` when no centroid has a distance below
    infinity (for example when ``centroids`` is empty).
    """
    best_index, best_dist = -1, float("inf")
    for index, centroid in enumerate(centroids):
        candidate = get_squared_distance(feature, centroid)
        if not candidate < best_dist:
            continue
        best_index, best_dist = index, candidate
    return best_index

In [None]:
def run_kmeans(features, k, seed_value, num_iterations=50):
    """Cluster ``features`` into ``k`` groups with Lloyd's k-means.

    Parameters
    ----------
    features : array-like of shape (n_points, n_dims)
        Points to cluster. Any number of dimensions is supported.
    k : int
        Number of clusters.
    seed_value : int
        Seed passed to ``np.random.seed``. Note that this reseeds NumPy's
        global random state, exactly as the original implementation did.
    num_iterations : int, optional
        Number of assign/update rounds to run (default 50).

    Returns
    -------
    cluster_indices : numpy.ndarray of shape (n_points,)
        Index in ``[0, k)`` of the cluster each point belongs to.
    centroids : list of list
        ``k`` centroids, each a list with one coordinate per dimension.

    Raises
    ------
    ValueError
        If ``features`` is not two-dimensional.

    Notes
    -----
    Initial centroids are ``k`` rows drawn with ``np.random.randint``, i.e.
    with replacement, so two centroids may start on the same point. A cluster
    that ends up with no members keeps its previous centroid instead of
    becoming NaN through a 0/0 division.
    """
    points = np.asarray(features, dtype=float)
    if points.ndim != 2:
        raise ValueError("features must be a 2-D array of shape (n_points, n_dims)")
    num_points, num_dims = points.shape

    np.random.seed(seed_value)
    centroid_indices = np.random.randint(0, num_points, size=k)
    centers = points[centroid_indices]  # fancy indexing returns a fresh copy

    # A platform-sized integer: int8 would wrap around for k > 127.
    cluster_indices = np.zeros(num_points, dtype=np.intp)

    for _ in range(num_iterations):
        # Assignment step: squared distance from every point to every centroid.
        # Looping over centroids keeps the temporary at (n_points, k).
        sq_dists = np.empty((num_points, k))
        for j in range(k):
            diff = points - centers[j]
            sq_dists[:, j] = np.sum(diff * diff, axis=1)
        # argmin returns the first minimum, so ties go to the lowest index.
        cluster_indices = np.argmin(sq_dists, axis=1)

        # Update step: per-cluster coordinate sums and member counts.
        counts = np.bincount(cluster_indices, minlength=k)
        sums = np.stack(
            [
                np.bincount(cluster_indices, weights=points[:, d], minlength=k)
                for d in range(num_dims)
            ],
            axis=1,
        )
        # Only move centroids that own at least one point.
        occupied = counts > 0
        centers[occupied] = sums[occupied] / counts[occupied][:, None]

    return cluster_indices, [list(center) for center in centers]

In [None]:
def get_wc_ssd(centroids, features, cluster_indices):
    """Return the within-cluster sum of squared distances.

    For every row of ``features`` the squared distance (via
    ``get_squared_distance``) to the centroid selected by the matching entry
    of ``cluster_indices`` is added up. Returns ``0`` for empty ``features``.
    """
    return sum(
        get_squared_distance(point, centroids[cluster_indices[row]])
        for row, point in enumerate(features)
    )

In [None]:
def get_SC(features, cluster_indices):
    """Calculate the silhouette coefficient of a clustering.

    For each point ``i``:

    * ``A`` is the mean Euclidean distance from ``i`` to the other points of
      its own cluster (the zero self-distance is in the sum, hence the
      ``count - 1`` denominator).
    * ``B`` is the mean Euclidean distance from ``i`` to every point outside
      its cluster, pooled over all other clusters.
    * ``s_i = (B - A) / max(A, B)``.

    The result is the mean of ``s_i`` over all points.

    NOTE(review): the textbook silhouette takes ``B`` from the *nearest* other
    cluster only; the pooled definition used here is kept from the original
    code. Confirm which definition is intended.

    Points for which ``s_i`` is undefined contribute ``0`` instead of turning
    the whole score into NaN: the only member of a cluster (``A`` undefined),
    points with no other cluster to compare to (``B`` undefined), and points
    where ``max(A, B)`` is zero.

    Parameters
    ----------
    features : array-like of shape (n_points, n_dims)
    cluster_indices : array-like of shape (n_points,)
        Cluster label of each point.

    Returns
    -------
    float
        Mean silhouette value.

    Notes
    -----
    Builds the full ``n_points x n_points`` distance matrix in memory.
    """
    features = np.asarray(features, dtype=float)
    cluster_indices = np.asarray(cluster_indices)
    num_points = len(features)

    # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b ; squared norms are summed directly
    # rather than squaring a square root, which would add rounding error.
    sq_norms = np.sum(features * features, axis=1)
    distance_squared = (
        sq_norms.reshape(-1, 1) + sq_norms.reshape(1, -1) - 2 * np.dot(features, features.T)
    )
    distance_squared[distance_squared < 0] = 0  # clip tiny negatives from rounding
    np.fill_diagonal(distance_squared, 0.0)     # self-distance is exactly zero
    distance_matrix = np.sqrt(distance_squared)

    # One membership mask per cluster, built once instead of once per point.
    membership = {}
    for label in np.unique(cluster_indices):
        mask = cluster_indices == label
        membership[label] = (mask, int(np.count_nonzero(mask)))

    s_i_list = np.zeros(num_points)
    for i, distance_i in enumerate(distance_matrix):
        same_mask, num_same = membership[cluster_indices[i]]
        num_diff = num_points - num_same
        if num_same < 2 or num_diff == 0:
            continue  # A or B undefined: leave s_i at 0

        A = np.sum(distance_i[same_mask]) / (num_same - 1)
        B = np.sum(distance_i[~same_mask]) / num_diff

        largest = max(A, B)
        if largest > 0:
            s_i_list[i] = (B - A) / largest

    SC = np.sum(s_i_list) / len(s_i_list)
    return SC

In [None]:
def get_entropy(labels):
    """Return the distinct labels and the entropy of their distribution.

    Returns a tuple ``(unique_labels, entropy)`` where ``entropy`` is
    ``-sum(p * ln(p))`` over the relative frequency ``p`` of each distinct
    label (natural logarithm).
    """
    unique_labels, occurrences = np.unique(labels, return_counts=True)
    probabilities = occurrences / len(labels)
    entropy = -np.sum(probabilities * np.log(probabilities))
    return unique_labels, entropy

In [None]:
def get_NMI(features, cluster_indices, class_labels):
    """Return the normalised mutual information between clusters and classes.

    Computed as ``I(class; cluster) / (H(class) + H(cluster))`` where
    ``I = H(class) - H(class | cluster)`` and all entropies come from
    ``get_entropy``.

    ``features`` is accepted but not used. ``cluster_indices`` and
    ``class_labels`` must support NumPy boolean-mask indexing.
    """
    _, class_entropy = get_entropy(class_labels)
    cluster_ids, cluster_entropy = get_entropy(cluster_indices)

    # H(class | cluster): class entropy inside each cluster, weighted by cluster size.
    weighted_entropy_total = 0
    for cluster_id in cluster_ids:
        members = class_labels[cluster_indices == cluster_id]
        _, members_entropy = get_entropy(members)
        weighted_entropy_total += len(members) * members_entropy
    conditional_entropy = weighted_entropy_total / len(class_labels)

    mutual_information = class_entropy - conditional_entropy
    return mutual_information / (class_entropy + cluster_entropy)

In [None]:
# Cluster the two embedding coordinates (columns 2 and 3) and print
# WC-SSD, the silhouette coefficient and NMI against the labels in column 1.
features = digits_embedding[:, 2:4]
k, seed = 10, 0
cluster_indices, centroids = run_kmeans(features, k, seed)

print(get_wc_ssd(centroids, features, cluster_indices))
print(get_SC(features, cluster_indices))
print(get_NMI(features, cluster_indices, digits_embedding[:, 1]))

In [None]:
# Keep only the embedding rows whose label (column 1) belongs to each digit subset.
# np.isin replaces np.in1d, which is deprecated as of NumPy 2.0; for a 1-D first
# argument both return the same boolean mask.
digits_2 = np.array([2,4,6,7])
digits_3 = [6,7]
digits_embedding_2 = digits_embedding[np.isin(digits_embedding[:, 1], digits_2)]
digits_embedding_3 = digits_embedding[np.isin(digits_embedding[:, 1], digits_3)]

In [None]:
# Inspect how many rows the {6, 7} subset contains (shape of its label column).
digits_embedding_3[:, 1].shape

In [None]:
# Boolean mask over all rows: True where the label in column 1 equals 2.
digits_embedding[:, 1] == 2