In [17]:
import numpy as np
import pandas as pd
from sklearn.metrics import multilabel_confusion_matrix
import scipy 
from scipy import spatial
import sklearn
from collections import Counter


In [18]:
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data.csv')
labels = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/label.csv', header = None, names =['label'])

from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split( data, test_size=0.20, random_state=45)
train_labels, test_labels = train_test_split( labels, test_size=0.20, random_state=45)

In [19]:
class K_means:   
    
    def get_centroids(self,data,k):
        s = data.shape[0]
        centroid_d={}
        for i in range(k):
            r = np.random.randint(0, s-1)
            centroid_d[i] = data.iloc[r]
        return centroid_d
    
    def jaccard_similarity(self,centroid, dp):
        intersection = len(list(set(centroid).intersection(dp)))
        union = (len(set(centroid)) + len(set(dp))) - intersection
        return float(intersection) / union

    def SSE(self, centroid_d, centroids ,data):
        sse = 0
        for i in centroids:
            sse_cluster = 0
            for j in centroids[i]:
                dp = list(data.iloc[int(j)])
                for a,b in zip(centroid_d[i],dp):
                    sse_cluster = sse_cluster + (a-b)**2
            sse = sse + sse_cluster
        return sse

    def kmeans_clustering(self,data,k,max_i=20,cat='E',tol=15):
        centroid_d = self.get_centroids(data,k)
        centroids = {}
        count = 0
        cen_list = list(centroid_d.keys())
        con = False
        while((count<max_i) and not con):
            for i in cen_list:
                centroids[i] = []
            for i in range(data.shape[0]):
                p = data.iloc[i]
                if cat == 'E' :
                    distance = [np.linalg.norm(p-centroid_d[j])  for j in centroid_d]
                    index = np.argmin(distance)
                    centroids[index].append(i)
                elif cat == 'J' :
                    sim = [self.jaccard_similarity(list(p),centroid_d[j]) for j in centroid_d]
                    index = np.argmax(sim)
                    centroids[index].append(i)
                elif cat == 'C' :
                    sim = [1-scipy.spatial.distance.cosine(p,list(centroid_d[j]))  for j in centroid_d]
                    index = np.argmax(sim)
                    centroids[index].append(i)
                prev_centroids=dict(centroid_d)
      
            for i in centroids:
                if len(centroids[i]):
                    dps_centroid = centroids[i]
                    centroid_d[i] = np.average(data.iloc[dps_centroid],axis=0)
            
            tolerance = -1
            for i in centroid_d:
                previous_centroid = prev_centroids[i]
                new_centroid = centroid_d[i]
                change = np.sum(np.absolute(new_centroid - previous_centroid))
                tolerance = max(change, tolerance)
                
            print("Tolerance for the Iteration ",count,": ",tolerance)
            count = count +1
            if (tolerance < 10):
                con = True
                break
      
        return centroid_d,centroids
    

In [20]:
def predict_cluster_labels(C, S, labels):
    cluster_labels = np.zeros(10,dtype=int)
    for c in C:
        labels_of_points = []
        for point in S[c]:
            labels_of_points.extend(labels.iloc[point])
        counter = Counter(labels_of_points)
        try:
            cluster_labels[c] = max(counter, key=counter.get)
        except:
            cluster_labels[c] = np.random.randint(0,9)
    return cluster_labels

In [21]:
def jaccard_similarity(centroid, dp):
        intersection = len(list(set(centroid).intersection(dp)))
        union = (len(set(centroid)) + len(set(dp))) - intersection
        return float(intersection) / union

In [22]:
def accuracy(centroids, centroid_Labels, test_data, true_labels, cat = 'E'):
    y_real = list(true_labels['label']);
    y_pred = []
    for index in range(test_data.shape[0]):
        featureset = test_data.iloc[index]
        if cat == 'E':
            distances = [np.linalg.norm(featureset - centroids[centroid]) for centroid in centroids]
            c = distances.index(min(distances))
            y_pred.append(centroid_Labels[c])
        elif cat == 'J':
            similarity = [jaccard_similarity(featureset, centroids[centroid]) for centroid in centroids]
            c = similarity.index(max(similarity))
            y_pred.append(centroid_Labels[c]) 
        elif cat == 'C':
            similarity = [1 - spatial.distance.cosine(featureset, centroids[centroid]) for centroid in centroids]
            c = similarity.index(max(similarity))
            y_pred.append(centroid_Labels[c])
    x = test_data.shape[0]
    correctly_classified = 0
    for i in range(0,len(y_pred)):
        if y_real[i] == y_pred[i]:
            correctly_classified = correctly_classified + 1
    accuracy = correctly_classified/x
    return accuracy

In [23]:
euclidean_model = K_means()
centroids_e,clusters_e = euclidean_model.kmeans_clustering(data,10, max_i = 100,cat ='E')

Tolerance for the Iteration  0 :  26238.460035523978
Tolerance for the Iteration  1 :  6148.730251646768
Tolerance for the Iteration  2 :  3070.096059260837
Tolerance for the Iteration  3 :  1930.4717539401506
Tolerance for the Iteration  4 :  1393.6363163689748
Tolerance for the Iteration  5 :  1407.4343744159344
Tolerance for the Iteration  6 :  1763.7872835875198
Tolerance for the Iteration  7 :  1518.992776967112
Tolerance for the Iteration  8 :  1253.8067343684243
Tolerance for the Iteration  9 :  895.5261024607411
Tolerance for the Iteration  10 :  856.2961561356065
Tolerance for the Iteration  11 :  872.4218401710948
Tolerance for the Iteration  12 :  1119.551785909638
Tolerance for the Iteration  13 :  1378.5085941170037
Tolerance for the Iteration  14 :  1691.2000939183101
Tolerance for the Iteration  15 :  1939.5481830125382
Tolerance for the Iteration  16 :  1718.53830592158
Tolerance for the Iteration  17 :  1251.6445814427566
Tolerance for the Iteration  18 :  827.99133549

In [24]:
Euclidean_SSE = euclidean_model.SSE(centroids_e,clusters_e,data)
print('SSE for euclidean kmeans model is: ',Euclidean_SSE)

SSE for euclidean kmeans model is:  25498995227.148094


In [25]:
cluster_labels_e = predict_cluster_labels(centroids_e,clusters_e,labels)
Euclidean_Accuracy = accuracy(centroids_e, cluster_labels_e,test_data,test_labels)
print('accuracy of euclidean kmeans model is : ',Euclidean_Accuracy)

accuracy of euclidean kmeans model is :  0.094


In [26]:
cosine_model = K_means()
centroids_cos,clusters_cos = cosine_model.kmeans_clustering(data,10, max_i=100,cat = 'C')

Tolerance for the Iteration  0 :  27955.27397260274
Tolerance for the Iteration  1 :  3720.527741406293
Tolerance for the Iteration  2 :  3743.0370047842875
Tolerance for the Iteration  3 :  3057.001114146061
Tolerance for the Iteration  4 :  1971.9081526070472
Tolerance for the Iteration  5 :  1172.5276087228413
Tolerance for the Iteration  6 :  1175.27803248838
Tolerance for the Iteration  7 :  1355.5718814521763
Tolerance for the Iteration  8 :  1334.4869276892573
Tolerance for the Iteration  9 :  1255.1812076722708
Tolerance for the Iteration  10 :  965.8345205628888
Tolerance for the Iteration  11 :  783.250677516633
Tolerance for the Iteration  12 :  677.0044544276327
Tolerance for the Iteration  13 :  446.2750071619556
Tolerance for the Iteration  14 :  608.0821383477632
Tolerance for the Iteration  15 :  783.6866125327665
Tolerance for the Iteration  16 :  1004.0822918601464
Tolerance for the Iteration  17 :  825.9002557714666
Tolerance for the Iteration  18 :  944.823180575655

In [29]:
cosine_SSE = cosine_model.SSE(centroids_cos,clusters_cos,data)
print('SSE for cosine kmeans model is: ',cosine_SSE)

SSE for cosine kmeans model is:  25422194579.297318


In [31]:
cluster_labels_cos = predict_cluster_labels(centroids_cos,clusters_cos,labels)
Cosine_Accuracy = accuracy(centroids_cos, cluster_labels_cos,test_data,test_labels)
print('accuracy of cosine kmeans model is : ',Cosine_Accuracy)

accuracy of cosine kmeans model is :  0.106


In [38]:
jaccard_model = K_means()
centroids_j,clusters_j = jaccard_model.kmeans_clustering(data,10, max_i=100,cat = 'J')

Tolerance for the Iteration  0 :  34996.33285468122
Tolerance for the Iteration  1 :  10509.357789126905
Tolerance for the Iteration  2 :  4981.976533834232
Tolerance for the Iteration  3 :  1479.6478193872017
Tolerance for the Iteration  4 :  1403.476122140516
Tolerance for the Iteration  5 :  1477.744997896016
Tolerance for the Iteration  6 :  741.6138816193759
Tolerance for the Iteration  7 :  1272.7584416703476
Tolerance for the Iteration  8 :  1009.3705836100851
Tolerance for the Iteration  9 :  850.0669581995794
Tolerance for the Iteration  10 :  1662.1532372364177
Tolerance for the Iteration  11 :  351.6606701105569
Tolerance for the Iteration  12 :  256.61452299076063
Tolerance for the Iteration  13 :  0.0


In [39]:
jaccard_SSE = jaccard_model.SSE(centroids_j,clusters_j,data)
print('SSE for jaccard kmeans model is: ',jaccard_SSE)

SSE for jaccard kmeans model is:  34361687572.938736


In [41]:
cluster_labels_j = predict_cluster_labels(centroids_j,clusters_j,labels)
Jaccard_Accuracy = accuracy(centroids_j, cluster_labels_j,test_data,test_labels)
print('accuracy of jaccard kmeans model is : ',Jaccard_Accuracy)

accuracy of jaccard kmeans model is :  0.1165
