In [None]:
import numpy as np
import pandas as pd
import scipy 
import sklearn
from collections import Counter
from sklearn.metrics import multilabel_confusion_matrix
from scipy import spatial
!pip install scikit-surprise
from surprise import SVD

In [6]:
# Load the feature table and the ground-truth labels from the working directory.
# NOTE(review): data.csv is read with pandas' default header inference (its first
# row becomes the column names) while label.csv is read with header=None. Confirm
# data.csv really has a header row; otherwise rows and labels are offset by one.
data = pd.read_csv('data.csv')
label = pd.read_csv('label.csv',names=['label'],header=None)

In [7]:
class KMeans_Clustering:
    """K-means clustering with a selectable row-to-centroid assignment rule.

    ``mode_value`` selects how each row is matched to a centroid:

    * ``1`` - smallest Euclidean norm of the difference,
    * ``2`` - largest set-based Jaccard similarity of the values,
    * ``3`` - largest cosine similarity.

    Whatever the assignment rule, a centroid is always updated to the
    column-wise mean of the rows currently assigned to it.
    """

    def SSE_Calculation(self, centroid_value, centroid, data):
        """Return the total sum of squared errors of rows to their centroid.

        Parameters
        ----------
        centroid_value : dict
            Maps a cluster id to that cluster's centroid vector.
        centroid : dict
            Maps a cluster id to the positional row indices assigned to it.
        data : pandas.DataFrame
            The rows that the indices in ``centroid`` refer to (via ``iloc``).

        Returns
        -------
        Sum over all clusters of the squared differences between each member
        row and the cluster centroid (``0`` when no rows are assigned).
        """
        data_sse = 0
        for i, members in centroid.items():
            if len(members) == 0:
                continue
            # One vectorised subtraction per cluster instead of a Python loop
            # over every (row, feature) pair.
            rows = data.iloc[[int(j) for j in members]].to_numpy(dtype=float)
            centre = np.asarray(centroid_value[i], dtype=float)
            data_sse += float(np.sum((rows - centre) ** 2))
        return data_sse

    def Centroids_Initialization(self, data, K_value):
        """Pick ``K_value`` rows of ``data`` at random as the starting centroids.

        Rows are drawn with numpy's global random state, so ``np.random.seed``
        makes the choice reproducible. Every row (including the last one) is
        eligible, and rows are drawn without replacement whenever ``data`` has
        at least ``K_value`` rows so two clusters never start on the same row.

        Returns
        -------
        dict mapping cluster id ``0..K_value-1`` to a row of ``data``.

        Raises
        ------
        ValueError
            If centroids are requested from a data set with no rows.
        """
        m = data.shape[0]
        if m == 0 and K_value > 0:
            raise ValueError("cannot initialise centroids from an empty data set")
        # Fall back to sampling with replacement only when more clusters than
        # rows are requested; otherwise distinct rows avoid duplicate centroids.
        chosen_rows = np.random.choice(m, size=K_value, replace=K_value > m)
        return {i: data.iloc[int(r)] for i, r in enumerate(chosen_rows)}

    def calculate_jaccard_similarity(self, centroid_value, dp):
        """Return the Jaccard similarity of the distinct values of two sequences.

        Both arguments are reduced to sets of their elements; the result is
        ``|A & B| / |A | B|``. Two empty inputs are treated as identical and
        give ``1.0`` instead of dividing by zero.
        """
        left, right = set(centroid_value), set(dp)
        intersection_value = len(left.intersection(right))
        union_value = len(left) + len(right) - intersection_value
        if union_value == 0:
            return 1.0
        return float(intersection_value) / union_value

    def _closest_centroid(self, x, centroid_value, mode_value):
        """Return the position of the centroid that best matches row ``x``."""
        if mode_value == 1:
            scores = [np.linalg.norm(x - centroid_value[j]) for j in centroid_value]
            return int(np.argmin(scores))
        if mode_value == 2:
            row_values = list(x)
            scores = [self.calculate_jaccard_similarity(row_values, centroid_value[j])
                      for j in centroid_value]
            return int(np.argmax(scores))
        scores = [1 - scipy.spatial.distance.cosine(x, list(centroid_value[j]))
                  for j in centroid_value]
        return int(np.argmax(scores))

    def Kmeans_training(self, data, K_value, max_iteration=20, mode_value=1, tolerance=10):
        """Cluster the rows of ``data`` into ``K_value`` groups.

        Parameters
        ----------
        data : pandas.DataFrame
            One row per sample.
        K_value : int
            Number of clusters.
        max_iteration : int
            Upper bound on assign/update rounds.
        mode_value : int
            Assignment rule: 1 Euclidean, 2 Jaccard, 3 cosine.
        tolerance : float
            Training stops once the largest per-centroid L1 movement in a
            round is strictly below this value.

        Returns
        -------
        (centroid_value, centroid)
            ``centroid_value`` maps cluster id to its centroid vector and
            ``centroid`` maps cluster id to the list of positional row
            indices assigned to it in the final round.

        Raises
        ------
        ValueError
            If ``mode_value`` is not 1, 2 or 3.
        """
        if mode_value not in (1, 2, 3):
            raise ValueError("mode_value must be 1 (Euclidean), 2 (Jaccard) or 3 (cosine)")

        centroid_value = self.Centroids_Initialization(data, K_value)
        centroid = {i: [] for i in centroid_value}

        for count in range(max_iteration):
            # Snapshot once per round, before any centroid moves.
            prev_centroids = dict(centroid_value)

            # Assignment step: every row joins its best-matching centroid.
            centroid = {i: [] for i in centroid_value}
            for i in range(data.shape[0]):
                idx = self._closest_centroid(data.iloc[i], centroid_value, mode_value)
                centroid[idx].append(i)

            # Update step: an empty cluster keeps its previous centroid.
            for i, members in centroid.items():
                if members:
                    centroid_value[i] = np.average(data.iloc[members], axis=0)

            # Largest L1 movement of any centroid during this round.
            current_tolerance = -1
            for i in centroid_value:
                change = np.sum(np.absolute(centroid_value[i] - prev_centroids[i]))
                current_tolerance = max(change, current_tolerance)

            print("Iteration ",count," Tolerance: ",current_tolerance)

            # Honour the caller-supplied threshold instead of a fixed 10.
            if current_tolerance < tolerance:
                break
        return centroid_value, centroid

In [11]:
def cluster_labels_prediction(Centroids, Centroids_Set, labels):
    """Assign each cluster the most frequent true label among its members.

    Parameters
    ----------
    Centroids : any
        Unused; kept so existing calls keep working.
    Centroids_Set : dict
        Maps an integer cluster id to the positional row indices in it.
    labels : pandas.DataFrame or pandas.Series
        True labels, addressed positionally with ``iloc``.

    Returns
    -------
    numpy.ndarray of int
        Entry ``c`` is the majority label of cluster ``c``. The array has at
        least 10 entries (the historical fixed size) and grows when a cluster
        id of 10 or more is present. An empty cluster gets a random label.
    """
    cluster_ids = list(Centroids_Set)
    # Keep the legacy length of 10 for existing callers, but no longer overflow
    # when more than 10 clusters are supplied.
    size = max([10] + [int(c) + 1 for c in cluster_ids])
    cluster_label = np.zeros(size, dtype=int)

    for c in cluster_ids:
        members = list(Centroids_Set[c])
        if members:
            # Works for a one-column DataFrame and for a plain Series alike.
            points_labels = np.asarray(labels.iloc[members]).ravel().tolist()
        else:
            points_labels = []
        counter_value = Counter(points_labels)
        if counter_value:
            cluster_label[c] = max(counter_value, key=counter_value.get)
        else:
            # Empty cluster: fall back to a random label. randint excludes the
            # upper bound, so 10 is needed for label 9 to be reachable
            # (assumes labels are the digits 0-9 - TODO confirm).
            cluster_label[c] = np.random.randint(0, 10)
    return cluster_label

In [9]:
def calculate_jaccard_similarity(centroid_value, dp):
    """Return the Jaccard similarity of the distinct values of two sequences.

    Both arguments are reduced to sets of their elements and the result is
    the size of the intersection divided by the size of the union. Two empty
    inputs are treated as identical and give ``1.0`` instead of raising
    ``ZeroDivisionError``.
    """
    left, right = set(centroid_value), set(dp)
    intersection_value = len(left.intersection(right))
    union_value = len(left) + len(right) - intersection_value
    if union_value == 0:
        return 1.0
    return float(intersection_value) / union_value

In [10]:
def accuracy_value(centroids_set, centroid_labels, test_data, true_label, mode_value=1):
    """Return the fraction of rows whose nearest-centroid label is correct.

    Parameters
    ----------
    centroids_set : dict
        Maps cluster id to centroid vector. The position of a centroid in the
        dict's iteration order is used to index ``centroid_labels``.
    centroid_labels : sequence
        Label assigned to each cluster, indexed by centroid position.
    test_data : pandas.DataFrame
        One row per sample to classify.
    true_label : pandas.DataFrame
        Must contain a ``'label'`` column with the expected label per row.
    mode_value : int
        1 = smallest Euclidean norm, 2 = largest Jaccard similarity,
        3 = largest cosine similarity.

    Returns
    -------
    float
        Correct predictions divided by the number of rows in ``test_data``;
        ``0.0`` when ``test_data`` has no rows.

    Raises
    ------
    ValueError
        If ``mode_value`` is not 1, 2 or 3 (previously this silently
        produced an accuracy of 0).
    """
    if mode_value not in (1, 2, 3):
        raise ValueError("mode_value must be 1 (Euclidean), 2 (Jaccard) or 3 (cosine)")

    denominator_value = test_data.shape[0]
    if denominator_value == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    y_true = list(true_label['label'])
    y_pred = []
    for index in range(denominator_value):
        feature_set = test_data.iloc[index]
        if mode_value == 1:
            scores = [np.linalg.norm(feature_set - centroids_set[centroid])
                      for centroid in centroids_set]
            best = scores.index(min(scores))
        elif mode_value == 2:
            scores = [calculate_jaccard_similarity(feature_set, centroids_set[centroid])
                      for centroid in centroids_set]
            best = scores.index(max(scores))
        else:
            scores = [1 - spatial.distance.cosine(feature_set, centroids_set[centroid])
                      for centroid in centroids_set]
            best = scores.index(max(scores))
        y_pred.append(centroid_labels[best])

    correctly_classified_value = sum(
        1 for i in range(len(y_pred)) if y_true[i] == y_pred[i]
    )
    return correctly_classified_value / denominator_value

In [None]:
# Euclidean run: 10 clusters, mode_value=1, at most 100 iterations.
# Returns the centroid vectors and the row indices assigned to each cluster.
model_1 = KMeans_Clustering()
centroids_1,clusters_1 = model_1.Kmeans_training(data,10, max_iteration=100,mode_value=1)

In [None]:
# Sum of squared errors of every row to its assigned centroid for the Euclidean run.
SSE_of_Euclidean = model_1.SSE_Calculation(centroids_1,clusters_1,data)

In [None]:
# Report the SSE of the Euclidean run.
print("Euclidean-K-Means SSE value : ",SSE_of_Euclidean)

In [None]:
# Majority-vote label per cluster for the Euclidean run; the bare name on the
# last line makes the array the cell's displayed output.
cluster_labels_1 = cluster_labels_prediction(centroids_1,clusters_1,label)
cluster_labels_1

In [None]:
# Accuracy of the Euclidean run (accuracy_value defaults to mode_value=1).
# NOTE(review): test_data and test_label are not defined in any cell visible in
# this notebook - confirm they are created earlier, otherwise this raises
# NameError on a fresh kernel (Restart & Run All).
Euclidean_accuracy = accuracy_value(centroids_1, cluster_labels_1,test_data,test_label)
Euclidean_accuracy

In [None]:
# Jaccard run: 10 clusters, mode_value=2, at most 100 iterations, followed by
# the sum of squared errors of every row to its assigned centroid.
model_2 = KMeans_Clustering()
centroids_2,clusters_2 = model_2.Kmeans_training(data,10, max_iteration=100,mode_value=2)
SSE_of_Jaccard = model_2.SSE_Calculation(centroids_2,clusters_2,data)

In [None]:
# Report the SSE of the Jaccard run.
# NOTE(review): the printed label spells "Jaccard" as "Jarcard".
print("Jarcard-K-Means SSE value : ",SSE_of_Jaccard)

In [None]:
# Majority-vote label per cluster for the Jaccard run; the bare name on the
# last line makes the array the cell's displayed output.
cluster_labels_2 = cluster_labels_prediction(centroids_2,clusters_2,label)
cluster_labels_2

In [None]:
# Accuracy of the Jaccard run.
# NOTE(review): mode_value is not passed, so accuracy_value scores these
# centroids with its default mode_value=1 (Euclidean), not the Jaccard rule
# used for training - confirm this is intended.
# NOTE(review): test_data and test_label are not defined in any cell visible in
# this notebook - confirm they are created earlier.
Jaccard_accuracy = accuracy_value(centroids_2, cluster_labels_2,test_data,test_label)
Jaccard_accuracy

In [None]:
# Cosine run: 10 clusters, mode_value=3, at most 100 iterations.
model_3 = KMeans_Clustering()
centroids_3,clusters_3 = model_3.Kmeans_training(data,10, max_iteration = 100,mode_value=3)

In [None]:
# Sum of squared errors of every row to its assigned centroid for the cosine run.
SSE_of_Cosine = model_3.SSE_Calculation(centroids_3,clusters_3,data)

In [None]:
# Side-by-side SSE summary of the three runs.
# NOTE(review): the second label spells "Jaccard" as "Jarcard".
print("Euclidean-K-Means SSE value : ",SSE_of_Euclidean)
print("Jarcard-K-Means SSE value : ",SSE_of_Jaccard)
print("Cosine-K-Means SSE value : ",SSE_of_Cosine)

In [None]:
# Majority-vote label per cluster for the cosine run; the bare name on the
# last line makes the array the cell's displayed output.
cluster_labels_3 = cluster_labels_prediction(centroids_3,clusters_3,label)
cluster_labels_3

In [None]:
# Accuracy of the cosine run.
# NOTE(review): mode_value is not passed, so accuracy_value scores these
# centroids with its default mode_value=1 (Euclidean), not the cosine rule used
# for training - confirm this is intended.
# NOTE(review): test_data and test_label are not defined in any cell visible in
# this notebook - confirm they are created earlier.
Cosine_accuracy = accuracy_value(centroids_3, cluster_labels_3,test_data,test_label)
# Only the last bare expression of a cell is rendered, so the two names below
# are evaluated but not shown; Cosine_accuracy is the displayed output.
Euclidean_accuracy
Jaccard_accuracy
Cosine_accuracy

In [None]:
# Side-by-side accuracy summary of the three runs.
# NOTE(review): the second label spells "Jaccard" as "Jarcard".
print("Accuracy of Euclidean-K-Means : ",Euclidean_accuracy)
print("Accuracy of Jarcard-K-Means : ",Jaccard_accuracy)
print("Accuracy of Cosine-K-Means :",Cosine_accuracy)