In [81]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import spatial
import math
from collections import Counter
from sklearn.metrics import multilabel_confusion_matrix
import matplotlib.pyplot as plt

In [47]:
data = pd.read_csv("./kmeans_data/data.csv",header = None)
labels = pd.read_csv("./kmeans_data/label.csv", header = None, names=['label'])

In [3]:

data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,774,775,776,777,778,779,780,781,782,783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
data.shape

(10000, 784)

In [48]:
labels.head()

Unnamed: 0,label
0,7
1,2
2,1
3,0
4,4


In [90]:
labels.shape

(10000, 1)

In [91]:
train_x = data.iloc[0:8000,:]
train_y = labels.iloc[0:8000,:]
test_x = data.iloc[8000:10000,:]
test_y = labels.iloc[8000:10000,:]

In [92]:
def computeSSE(C, S, data):
    '''
    Input : C -> Centroids
            S -> Set of Indicies corresponding to Centroid C
            data -> Data used to form clusters
    Output : Returns the Error sum of squares
    '''
    SSE = 0
    for cluster in C:
        c = C[cluster]
        for point in S[cluster]:
            x_i = data.iloc[point]
            for a,b in zip(x_i, c):
                SSE += math.pow(a-b,2)
    return SSE

In [93]:
def getClusterLabels(C, S, labels):
    '''
    Input : C -> Centroids
            S -> Set of Indicies corresponding to Centroid C
            data -> Data used to form clusters
    Output : Returns an array of size K having labels based on majority voting in the cluster
    '''
    cluster_labels = np.zeros(10,dtype=int)
    for c in C:
        labels_of_points = []
        for point in S[c]:
            labels_of_points.extend(labels.iloc[point])
        counter = Counter(labels_of_points)
        print(labels_of_points)
        cluster_labels[c] = max(counter, key=counter.get)
    return cluster_labels
            

In [94]:
def jaccard_similarity(x1, x2):
    '''
        Returns Jaccard Similarity
    '''
    s1 = set(x1)
    s2 = set(x2)
    return float(len(s1.intersection(s2)) / len(s1.union(s2)))
    

In [95]:
def Jaccard_dist(x1,x2):
    intersection_x1_x2 = np.logical_and(x1,x2)
    union_x1_x2 = np.logical_or(x1,x2)
    dist = intersection_x1_x2.sum() / float(union_x1_x2.sum())
    return dist
    

In [96]:
def kmeans(data,metric, tol=0.001, K = 10):
    '''
        Returns 
        C: Centroids
        S: Sets corresponding to each cluster C
    '''
    centroids = {}
    # Initialization
    for i in range(K):
        centroids[i] = data.iloc[i]
    iterations = 0
    Converged = False
    classifications = {}
    while(not Converged):
        for i in range(K):
            classifications[i] = []
        for index_i in range(data.shape[0]):
            featureset = data.iloc[index_i]
            if metric == 'euclidean':
                distances = [np.linalg.norm(featureset - centroids[centroid]) for centroid in centroids]
#                 print("Dist", distances)
                classification = distances.index(min(distances))
                classifications[classification].append(index_i)
            elif metric == 'cosine':
                similarity = [1 - spatial.distance.cosine(featureset, centroids[centroid]) for centroid in centroids]
                classification = similarity.index(max(similarity))
                classifications[classification].append(index_i)
            elif metric == 'jaccard':
                similarity = [jaccard_similarity(featureset, centroids[centroid]) for centroid in centroids]
                classification = similarity.index(max(similarity))
                classifications[classification].append(index_i)
        
        prev_centroids = dict(centroids)
        
        
        # Getting updating centroids by computing mean
        for classification in classifications:
            points_around_centroid = classifications[classification]
            centroids[classification] = np.average( data.iloc[points_around_centroid], axis = 0)
        
        curr_tol = -1;
        
        # Checking for convergence
        for c in centroids:
            original_centroid = prev_centroids[c]
            current_centroid  = centroids[c]
            curr_diff = np.sum(np.absolute(current_centroid - original_centroid))
            curr_tol = max(curr_tol, curr_diff)
        iterations += 1
        print("Iteration : {}, Curr_tol : {}".format(iterations, curr_tol))
        if(curr_tol < 10):
            converged = True
            break
            
    print('\n')
    return centroids, classifications

In [97]:
def getAccuracy(centroids, centroid_Labels, test_data, true_labels, metric = 'euclidean'):
    y_true = list(true_labels['label']);
    y_pred = []
    for index in range(test_data.shape[0]):
        featureset = test_data.iloc[index]
        if metric == 'euclidean':
            distances = [np.linalg.norm(featureset - centroids[centroid]) for centroid in centroids]
            classification = distances.index(min(distances))
            y_pred.append(centroid_Labels[classification])
        elif metric == 'cosine':
            similarity = [1 - spatial.distance.cosine(featureset, centroids[centroid]) for centroid in centroids]
            classification = similarity.index(max(similarity))
            y_pred.append(centroid_Labels[classification])
        elif metric == 'jaccard':
            similarity = [jaccard_similarity(featureset, centroids[centroid]) for centroid in centroids]
            classification = similarity.index(max(similarity))
            y_pred.append(centroid_Labels[classification])
    denominator = test_data.shape[0]
    correctly_classified = 0
    for i in range(0,len(y_pred)):
        if y_true[i] == y_pred[i]:
            correctly_classified += 1
    acc = correctly_classified/denominator
    return acc
    

In [98]:
centroids, classifications = kmeans(train_x, 'euclidean')

Iteration : 1, Curr_tol : 27544.86666666667
Iteration : 2, Curr_tol : 5151.442173112338
Iteration : 3, Curr_tol : 3867.3438566062878
Iteration : 4, Curr_tol : 4355.750446735659
Iteration : 5, Curr_tol : 4454.348604403103
Iteration : 6, Curr_tol : 3616.5576836917567
Iteration : 7, Curr_tol : 1760.9519917455018
Iteration : 8, Curr_tol : 1139.760954557415
Iteration : 9, Curr_tol : 2022.466044495555
Iteration : 10, Curr_tol : 2957.2050951939336
Iteration : 11, Curr_tol : 2269.961484665858
Iteration : 12, Curr_tol : 1550.728894345346
Iteration : 13, Curr_tol : 1082.3005458289736
Iteration : 14, Curr_tol : 613.6778094721162
Iteration : 15, Curr_tol : 544.8479052152958
Iteration : 16, Curr_tol : 498.41007611178236
Iteration : 17, Curr_tol : 544.6881784267813
Iteration : 18, Curr_tol : 448.47122692725316
Iteration : 19, Curr_tol : 454.66326134061893
Iteration : 20, Curr_tol : 505.13219612331307
Iteration : 21, Curr_tol : 306.8271794963575
Iteration : 22, Curr_tol : 327.0434829821017
Iteration 

In [99]:
print("SSE of Euclidean is", computeSSE(centroids, classifications, train_x))

SSE of Euclidean is 20071133847.747944


In [100]:
cluster_labels_euclidean = getClusterLabels(centroids, classifications, train_y)
cluster_labels_euclidean

[4, 9, 9, 7, 4, 7, 7, 4, 9, 4, 7, 7, 7, 9, 4, 9, 4, 9, 7, 9, 4, 4, 4, 4, 9, 4, 9, 9, 7, 4, 7, 7, 4, 7, 4, 9, 7, 7, 7, 7, 9, 9, 7, 9, 7, 4, 7, 9, 7, 3, 7, 9, 7, 4, 3, 7, 9, 9, 7, 7, 9, 7, 9, 9, 5, 9, 7, 5, 9, 9, 4, 4, 4, 9, 9, 4, 8, 7, 9, 9, 4, 4, 9, 3, 7, 4, 9, 9, 4, 9, 7, 7, 7, 2, 4, 9, 4, 7, 7, 3, 7, 7, 9, 4, 4, 4, 7, 4, 7, 4, 7, 4, 9, 9, 4, 7, 9, 4, 9, 7, 7, 2, 7, 9, 7, 7, 9, 4, 4, 7, 4, 7, 7, 8, 5, 4, 9, 4, 9, 4, 7, 9, 9, 9, 7, 7, 9, 4, 9, 7, 3, 7, 4, 4, 4, 9, 9, 9, 9, 7, 4, 5, 4, 7, 8, 3, 7, 4, 8, 9, 7, 7, 9, 7, 7, 9, 3, 3, 7, 9, 9, 7, 9, 4, 4, 4, 4, 7, 9, 9, 5, 4, 4, 9, 9, 4, 7, 4, 7, 4, 9, 7, 2, 7, 7, 3, 9, 7, 4, 9, 9, 4, 9, 4, 9, 4, 7, 9, 5, 7, 7, 7, 9, 9, 7, 7, 4, 4, 9, 9, 7, 4, 7, 9, 7, 9, 7, 4, 5, 7, 9, 9, 9, 4, 4, 9, 2, 7, 4, 7, 9, 7, 9, 4, 9, 7, 9, 7, 7, 7, 7, 4, 8, 9, 9, 8, 4, 4, 9, 4, 9, 7, 9, 9, 7, 9, 9, 7, 4, 7, 4, 7, 9, 4, 4, 4, 9, 8, 4, 9, 9, 5, 4, 7, 7, 4, 9, 7, 5, 7, 9, 4, 4, 7, 4, 5, 7, 7, 9, 5, 4, 4, 9, 4, 9, 4, 4, 9, 7, 9, 4, 9, 7, 8, 4, 9, 9, 7, 4, 5, 7, 9, 4, 

[4, 5, 9, 6, 4, 7, 4, 4, 4, 6, 4, 9, 7, 4, 8, 4, 4, 9, 4, 9, 9, 4, 7, 9, 9, 4, 9, 9, 4, 9, 9, 4, 9, 4, 9, 4, 8, 4, 2, 9, 9, 6, 4, 4, 4, 4, 4, 4, 4, 9, 4, 7, 8, 9, 9, 9, 7, 4, 4, 4, 4, 4, 9, 3, 9, 9, 7, 9, 9, 9, 7, 8, 4, 4, 4, 4, 7, 9, 4, 4, 4, 4, 4, 4, 8, 4, 5, 4, 4, 4, 7, 4, 4, 9, 4, 9, 7, 9, 4, 9, 6, 4, 4, 4, 6, 4, 2, 9, 4, 4, 4, 9, 7, 9, 3, 9, 4, 4, 4, 4, 4, 8, 7, 0, 5, 4, 4, 4, 4, 6, 5, 5, 9, 4, 9, 4, 8, 9, 4, 2, 4, 6, 9, 4, 4, 9, 9, 4, 5, 4, 4, 9, 9, 6, 9, 9, 4, 9, 9, 4, 4, 4, 4, 7, 4, 9, 4, 6, 4, 9, 4, 9, 9, 5, 9, 4, 9, 9, 7, 4, 6, 4, 7, 4, 2, 4, 7, 4, 4, 7, 4, 9, 8, 9, 2, 9, 6, 9, 9, 4, 9, 9, 3, 6, 9, 7, 7, 4, 2, 4, 5, 4, 4, 4, 4, 9, 5, 4, 4, 9, 5, 7, 6, 9, 4, 4, 9, 7, 7, 3, 6, 4, 6, 4, 4, 9, 4, 9, 4, 4, 9, 9, 8, 7, 4, 4, 4, 9, 9, 9, 4, 9, 9, 7, 4, 9, 4, 4, 4, 9, 9, 5, 4, 4, 9, 4, 4, 4, 9, 4, 4, 5, 4, 9, 4, 9, 2, 4, 4, 7, 9, 6, 7, 7, 9, 4, 4, 2, 9, 4, 4, 7, 9, 5, 9, 7, 9, 5, 5, 7, 4, 7, 9, 9, 9, 4, 4, 4, 9, 4, 4, 9, 5, 6, 3, 9, 9, 4, 9, 2, 4, 9, 8, 7, 4, 4, 6, 2, 9, 4, 9, 7, 4, 

[2, 8, 3, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 7, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 7, 2, 2, 2, 2, 7, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 8, 2, 2, 7, 2, 2, 2, 2, 2, 2, 2, 2, 9, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 8, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 8, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 8, 2, 2, 2, 2, 2, 8, 2, 2, 6, 7, 2, 8, 3, 2, 2, 2, 6, 2, 6, 2, 5, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 9, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 6, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 8, 2, 

array([9, 3, 1, 0, 4, 1, 8, 6, 2, 7])

In [101]:
acc= getAccuracy(centroids, cluster_labels_euclidean,test_x,test_y)
print("Accuracy of K-Means using Euclidean Distance :", acc)

Accuracy of K-Means using Euclidean Distance : 0.634


In [102]:
centroids_cosine, classifications_cosine = kmeans(train_x, 'cosine')

Iteration : 1, Curr_tol : 27715.949602122015
Iteration : 2, Curr_tol : 7383.536765064848
Iteration : 3, Curr_tol : 4017.2317517005645
Iteration : 4, Curr_tol : 3458.5587410242247
Iteration : 5, Curr_tol : 2289.1504229017564
Iteration : 6, Curr_tol : 1151.978544382016
Iteration : 7, Curr_tol : 695.704718693285
Iteration : 8, Curr_tol : 567.8039816772375
Iteration : 9, Curr_tol : 427.8707334009936
Iteration : 10, Curr_tol : 465.0733247276356
Iteration : 11, Curr_tol : 430.0185326795915
Iteration : 12, Curr_tol : 338.05455264405396
Iteration : 13, Curr_tol : 394.88388600463463
Iteration : 14, Curr_tol : 331.49145055571364
Iteration : 15, Curr_tol : 305.9008140042623
Iteration : 16, Curr_tol : 222.15094286833428
Iteration : 17, Curr_tol : 157.4129986553116
Iteration : 18, Curr_tol : 148.10398144117556
Iteration : 19, Curr_tol : 134.85761702371883
Iteration : 20, Curr_tol : 139.98329710156534
Iteration : 21, Curr_tol : 104.2269026480673
Iteration : 22, Curr_tol : 82.7927018567323
Iteration 

In [103]:
print("SSE of Cosine is", computeSSE(centroids_cosine, classifications_cosine, train_x))

SSE of Cosine is 20326625940.722775


In [104]:
cluster_labels_cosine = getClusterLabels(centroids_cosine, classifications_cosine, train_y)
cluster_labels_cosine

[7, 7, 7, 7, 7, 7, 2, 7, 7, 7, 7, 7, 7, 7, 7, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 2, 3, 7, 9, 7, 7, 7, 7, 9, 7, 7, 7, 7, 7, 9, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 9, 7, 7, 7, 7, 7, 7, 7, 7, 7, 9, 7, 7, 2, 7, 7, 9, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 5, 7, 9, 7, 7, 7, 7, 7, 7, 7, 7, 2, 9, 7, 7, 7, 7, 7, 8, 7, 7, 7, 7, 7, 7, 7, 7, 9, 7, 5, 7, 9, 3, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 2, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 2, 7, 7, 7, 8, 7, 7, 9, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 9, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3, 7, 7, 7, 8, 7, 7, 7, 7, 7, 9, 7, 7, 7, 7, 7, 7, 7, 7, 7, 2, 7, 7, 7, 7, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 2, 7, 7, 2, 7, 2, 9, 7, 7, 9, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 9, 2, 7, 9, 7, 5, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 2, 7, 3, 9, 7, 9, 9, 7, 7, 7, 7, 7, 7, 8, 2, 7, 9, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 1, 1, 1, 1, 1, 7, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 7, 1, 3, 1, 1, 1, 1, 2, 1, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 6, 1, 1, 1, 1, 1, 1, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 1, 1, 1, 1, 7, 1, 1, 1, 6, 1, 1, 1, 1, 1, 1, 7, 1, 1, 4, 1, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 1, 1, 1, 1, 1, 9, 1, 1, 1, 1, 1, 1, 1, 1, 5, 6, 1, 1, 1, 1, 6, 1, 1, 1, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 1, 8, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 7, 1, 1, 1, 1, 1, 1, 1, 9, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 1, 1, 1, 1, 1, 1, 1, 3, 6, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8, 1, 1, 1, 1, 1, 1, 3, 1, 1, 9, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 1, 1, 9, 1, 1, 1, 1, 7, 1, 1, 1, 1, 9, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

[9, 9, 7, 5, 8, 3, 7, 9, 9, 9, 4, 4, 4, 7, 5, 5, 5, 7, 5, 6, 5, 8, 8, 5, 9, 5, 4, 6, 5, 4, 8, 4, 9, 4, 5, 8, 4, 5, 3, 8, 7, 5, 0, 5, 7, 4, 8, 4, 9, 8, 7, 5, 9, 4, 9, 8, 5, 5, 4, 5, 9, 4, 4, 7, 5, 9, 4, 5, 6, 7, 4, 7, 2, 9, 6, 4, 4, 8, 4, 9, 9, 8, 7, 8, 5, 4, 4, 4, 4, 5, 5, 7, 0, 4, 9, 8, 7, 6, 9, 7, 0, 8, 6, 7, 9, 0, 8, 9, 4, 9, 8, 5, 7, 9, 5, 4, 4, 4, 7, 4, 4, 0, 9, 4, 9, 9, 3, 8, 5, 8, 5, 7, 4, 7, 3, 4, 8, 4, 1, 5, 8, 5, 7, 9, 7, 4, 8, 4, 9, 5, 8, 4, 7, 4, 5, 8, 5, 7, 6, 4, 4, 9, 5, 9, 9, 8, 4, 4, 7, 7, 9, 7, 7, 9, 4, 4, 9, 7, 7, 5, 7, 9, 7, 8, 5, 9, 5, 5, 9, 5, 4, 9, 9, 4, 7, 5, 0, 7, 9, 9, 9, 8, 4, 4, 7, 9, 5, 9, 4, 9, 8, 5, 5, 0, 4, 8, 9, 5, 8, 4, 4, 8, 4, 5, 7, 5, 5, 4, 7, 4, 8, 7, 0, 6, 8, 6, 7, 8, 5, 5, 5, 4, 6, 0, 4, 4, 8, 4, 4, 7, 0, 7, 9, 3, 9, 9, 7, 4, 5, 9, 5, 8, 5, 7, 5, 5, 8, 8, 8, 5, 4, 4, 9, 8, 8, 9, 3, 5, 7, 2, 5, 5, 5, 5, 9, 9, 5, 5, 9, 5, 5, 5, 9, 0, 5, 9, 4, 5, 4, 8, 5, 8, 5, 7, 6, 5, 9, 5, 8, 5, 5, 9, 5, 5, 9, 4, 9, 9, 9, 4, 9, 8, 7, 4, 5, 4, 7, 7, 2, 4, 5, 8, 5, 

array([7, 3, 1, 0, 4, 1, 8, 9, 2, 4])

In [105]:
acc= getAccuracy(centroids_cosine, cluster_labels_cosine,test_x,test_y, metric = 'cosine' )
print("Accuracy of K-Means using Cosine Distance :", acc)

Accuracy of K-Means using Cosine Distance : 0.583


In [111]:
# [DUE TO VERY HIGH TRAINING TIME and my laptop started crashing I have interruped the training and after many 
# experimentation k-means with Jaccard, i felt Jaccard K-means is not good with this dataset.
centroids_jaccard, classification_jaccard = kmeans(train_x, 'jaccard',tol=8000)

In [106]:
print("SSE of Jaccard is ", , computeSSE(centroids_jaccard, classification_jaccard, train_x))


SSE of Jaccard is 3250671196.23207


In [107]:
cluster_labels_jaccard= getClusterLabels(centroids_jaccard, classification_jaccard, train_y)

In [109]:
acc= getAccuracy(centroids_jaccard, cluster_labels_jaccard,test_x,test_y, metric = 'cosine' )
print("Accuracy of K-Means using Jaccard Similarity: ",acc)

Accuracy of K-Means using Jaccard Similarity: 0.50110102
