In [20]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy import stats
from collections import Counter
import time

In [21]:
# Load data and labels
data = np.genfromtxt('/kaggle/input/hw3-data/data.csv', delimiter=',')
labels = np.genfromtxt('/kaggle/input/hw3-data/label.csv', delimiter=',')

In [41]:
class KMeans:
    def __init__(self, n_clusters=8, max_iter=500):
        self.n_clusters = n_clusters
        self.max_iter = max_iter

    def euclidean_distance(self, x1, x2):
        return np.sqrt(np.sum((x1 - x2) ** 2))
    
    def initialize_centroids(self, X):
        # ============= Randomly select centroids ===================
        
        centroids_indices = np.random.choice(len(X), size=self.n_clusters, replace=False)
        return X[centroids_indices]
    
    def closest_centroid(self, sample, centroids):

        # =========== Find the closest centroid to the sample ======================
        distances = [self.euclidean_distance(sample, centroid) for centroid in centroids]
        return np.argmin(distances)
    
    def compute_sse(self, X, centroids, labels):
        #  ============= Compute the sum of squared errors =================
        sse = 0
        for i, centroid in enumerate(centroids):
            cluster_points = X[labels == i]
            sse += np.sum((cluster_points - centroid) ** 2)
        return sse
    
    def fit(self, X):
        # ============ Initialize centroids ====================
        self.centroids = self.initialize_centroids(X)
        
        for _ in range(self.max_iter):
            # =========== Assign each dp to the nearest centroid ====================
            labels = [self.closest_centroid(sample, self.centroids) for sample in X]
            
            # ================ Update centroids ====================
            new_centroids = np.zeros((self.n_clusters, X.shape[1]))
            for i in range(self.n_clusters):
                cluster_points = X[labels == i]
                if len(cluster_points) > 0:
                    new_centroids[i] = cluster_points.mean(axis=0)
                    
            # ============== Check if convergence ( if same centroids)================
            if np.allclose(self.centroids, new_centroids):
                break
                
            self.centroids = new_centroids
            
        # ========= Compute SSE ===========
        self.labels = np.array(labels)
        self.sse = self.compute_sse(X, self.centroids, self.labels)
        return self.labels, self.centroids, self.sse


# ============ Set K ==============
K = len(np.unique(labels))

# ================ Initialize K-means ===========
kmeans = KMeans(n_clusters=K)

# ========Fit K-means model ===============
labels, centroids, sse = kmeans.fit(data)

print("Euclidean sse = ", sse)

Euclidean sse =  58095386156.0


In [23]:
class KMeans:
    def __init__(self, n_clusters=8, max_iter=500):
        self.n_clusters = n_clusters
        self.max_iter = max_iter

    
    def cosine_similarity_distance(self, x1, x2):
        similarity = cosine_similarity(x1.reshape(1, -1), x2.reshape(1, -1))[0, 0]
        return 1 - similarity
    
    def initialize_centroids(self, X):

        centroids_indices = np.random.choice(len(X), size=self.n_clusters, replace=False)
        return X[centroids_indices]
    
    def closest_centroid(self, sample, centroids):
        
        distances = [self.cosine_similarity_distance(sample, centroid) for centroid in centroids]
        return np.argmin(distances)
    
    def compute_sse(self, X, centroids, labels):
        sse = 0
        for i, centroid in enumerate(centroids):
            cluster_points = X[labels == i]
            sse += np.sum((cluster_points - centroid) ** 2)
        return sse
    
    def fit(self, X):
        self.centroids = self.initialize_centroids(X)
        
        for _ in range(self.max_iter):
            labels = [self.closest_centroid(sample, self.centroids) for sample in X]
            
            new_centroids = np.zeros((self.n_clusters, X.shape[1]))
            
            for i in range(self.n_clusters):
                cluster_points = X[labels == i]
                if len(cluster_points) > 0:
                    new_centroids[i] = cluster_points.mean(axis=0)

            if np.allclose(self.centroids, new_centroids):
                break
                
            self.centroids = new_centroids
            
        self.labels = np.array(labels)
        self.sse = self.compute_sse(X, self.centroids, self.labels)
        return self.labels, self.centroids, self.sse


K = len(np.unique(labels))
kmeans = KMeans(n_clusters=K)
labels, centroids, sse = kmeans.fit(data)

print("Cosine sse = ", sse)

Cosine sse =  58095386156.0


In [24]:
class KMeans:
    def __init__(self, n_clusters=8, max_iter=500):
        self.n_clusters = n_clusters
        self.max_iter = max_iter

    def jaccard_similarity_distance(self, x1, x2):
        intersection = np.sum(np.minimum(x1, x2))
        union = np.sum(np.maximum(x1, x2))
        return 1 - (intersection / union)
    
    def initialize_centroids(self, X):
        centroids_indices = np.random.choice(len(X), size=self.n_clusters, replace=False)
        return X[centroids_indices]
    
    def closest_centroid(self, sample, centroids):
        distances = [self.jaccard_similarity_distance(sample, centroid) for centroid in centroids]
        return np.argmin(distances)
    
    def compute_sse(self, X, centroids, labels):
        sse = 0
        for i, centroid in enumerate(centroids):
            cluster_points = X[labels == i]
            sse += np.sum((cluster_points - centroid) ** 2)
        return sse
    
    def fit(self, X):
        self.centroids = self.initialize_centroids(X)
        
        for _ in range(self.max_iter):
            labels = [self.closest_centroid(sample, self.centroids) for sample in X]
            
            new_centroids = np.zeros((self.n_clusters, X.shape[1]))
            
            for i in range(self.n_clusters):
                cluster_points = X[labels == i]
                if len(cluster_points) > 0:
                    new_centroids[i] = cluster_points.mean(axis=0)

            if np.allclose(self.centroids, new_centroids):
                break
                
            self.centroids = new_centroids

        self.labels = np.array(labels)
        self.sse = self.compute_sse(X, self.centroids, self.labels)
        return self.labels, self.centroids, self.sse


K = len(np.unique(labels))
kmeans = KMeans(n_clusters=K)
labels, centroids, sse = kmeans.fit(data)

print("jaccard sse = ", sse)

jaccard sse =  58095386156.0


In [25]:
from collections import Counter
class KMeans:
    def __init__(self, n_clusters=8, max_iter=500):
        self.n_clusters = n_clusters
        self.max_iter = max_iter

    
    def euclidean_distance(self, x1, x2):
        return np.sqrt(np.sum((x1 - x2) ** 2))
    
    def initialize_centroids(self, X):
        
        centroids_indices = np.random.choice(len(X), size=self.n_clusters, replace=False)
        return X[centroids_indices]
    
    def closest_centroid(self, sample, centroids):
        distances = [self.euclidean_distance(sample, centroid) for centroid in centroids]
        return np.argmin(distances)
    
    def compute_sse(self, X, centroids, labels):
        sse = 0
        for i, centroid in enumerate(centroids):
            cluster_points = X[labels == i]
            sse += np.sum((cluster_points - centroid) ** 2)
        return sse
    
    def fit(self, X):
        self.centroids = self.initialize_centroids(X)
        
        for _ in range(self.max_iter):
            labels = [self.closest_centroid(sample, self.centroids) for sample in X]
            
            new_centroids = np.zeros((self.n_clusters, X.shape[1]))
            
            for i in range(self.n_clusters):
                cluster_points = X[labels == i]
                if len(cluster_points) > 0:
                    new_centroids[i] = cluster_points.mean(axis=0)
            if np.allclose(self.centroids, new_centroids):
                break
                
            self.centroids = new_centroids
        self.labels = np.array(labels)
        self.sse = self.compute_sse(X, self.centroids, self.labels)
        return self.labels, self.centroids, self.sse



K = len(np.unique(labels))
kmeans = KMeans(n_clusters=K)
cluster_labels_ouput, centroids, sse = kmeans.fit(data)

def assign_majority_labels(cluster_labels, labels):
    majority_labels = []
    for i in range(K):
        cluster_indices = np.where(cluster_labels == i)[0]
        cluster_labels_true = labels[cluster_indices]
        majority_label = Counter(cluster_labels_true).most_common(1)[0][0]
        majority_labels.append(majority_label)
    return majority_labels

majority_labels = assign_majority_labels(cluster_labels_ouput, labels)
assigned_labels = [majority_labels[label] for label in cluster_labels_ouput]
accuracy = np.sum(assigned_labels == labels) / len(labels)

print("Euclidean accuracy = ", accuracy)

Euclidean accuracy =  1.0


In [26]:
class KMeans:
    def __init__(self, n_clusters=8, max_iter=500):
        self.n_clusters = n_clusters
        self.max_iter = max_iter

    
    def cosine_similarity_distance(self, x1, x2):
        similarity = cosine_similarity(x1.reshape(1, -1), x2.reshape(1, -1))[0, 0]
        return 1 - similarity
    
    def initialize_centroids(self, X):
        centroids_indices = np.random.choice(len(X), size=self.n_clusters, replace=False)
        return X[centroids_indices]
    
    def closest_centroid(self, sample, centroids):
        distances = [self.cosine_similarity_distance(sample, centroid) for centroid in centroids]
        return np.argmin(distances)
    
    def compute_sse(self, X, centroids, labels):
        sse = 0
        for i, centroid in enumerate(centroids):
            cluster_points = X[labels == i]
            sse += np.sum((cluster_points - centroid) ** 2)
        return sse
    
    def fit(self, X):
        self.centroids = self.initialize_centroids(X)
        
        for _ in range(self.max_iter):
            labels = [self.closest_centroid(sample, self.centroids) for sample in X]

            new_centroids = np.zeros((self.n_clusters, X.shape[1]))
            
            for i in range(self.n_clusters):
                cluster_points = X[labels == i]
                if len(cluster_points) > 0:
                    new_centroids[i] = cluster_points.mean(axis=0)

            if np.allclose(self.centroids, new_centroids):
                break
                
            self.centroids = new_centroids

        self.labels = np.array(labels)
        self.sse = self.compute_sse(X, self.centroids, self.labels)
        return self.labels, self.centroids, self.sse

K = len(np.unique(labels))

kmeans = KMeans(n_clusters=K)

cluster_labels_ouput, centroids, sse = kmeans.fit(data)

def assign_majority_labels(cluster_labels, labels):
    majority_labels = []
    for i in range(K):
        cluster_indices = np.where(cluster_labels == i)[0]
        cluster_labels_true = labels[cluster_indices]
        majority_label = Counter(cluster_labels_true).most_common(1)[0][0]
        majority_labels.append(majority_label)
    return majority_labels

majority_labels = assign_majority_labels(cluster_labels_ouput, labels)
assigned_labels = [majority_labels[label] for label in cluster_labels_ouput]
accuracy = np.sum(assigned_labels == labels) / len(labels)

print("Cosine accuracy = ", accuracy)

Cosine accuracy =  1.0


In [27]:
class KMeans:
    def __init__(self, n_clusters=8, max_iter=500):
        self.n_clusters = n_clusters
        self.max_iter = max_iter

    
    def jaccard_similarity_distance(self, x1, x2):
        intersection = np.sum(np.minimum(x1, x2))
        union = np.sum(np.maximum(x1, x2))
        return 1 - (intersection / union)
    
    def initialize_centroids(self, X):
        
        centroids_indices = np.random.choice(len(X), size=self.n_clusters, replace=False)
        return X[centroids_indices]
    
    def closest_centroid(self, sample, centroids):
        distances = [self.jaccard_similarity_distance(sample, centroid) for centroid in centroids]
        return np.argmin(distances)
    
    def compute_sse(self, X, centroids, labels):
        sse = 0
        for i, centroid in enumerate(centroids):
            cluster_points = X[labels == i]
            sse += np.sum((cluster_points - centroid) ** 2)
        return sse
    
    def fit(self, X):
        self.centroids = self.initialize_centroids(X)
        
        for _ in range(self.max_iter):
            labels = [self.closest_centroid(sample, self.centroids) for sample in X]
            
            new_centroids = np.zeros((self.n_clusters, X.shape[1]))
            
            for i in range(self.n_clusters):
                cluster_points = X[labels == i]
                if len(cluster_points) > 0:
                    new_centroids[i] = cluster_points.mean(axis=0)

            if np.allclose(self.centroids, new_centroids):
                break
                
            self.centroids = new_centroids

        self.labels = np.array(labels)
        self.sse = self.compute_sse(X, self.centroids, self.labels)
        return self.labels, self.centroids, self.sse



K = len(np.unique(labels))
kmeans = KMeans(n_clusters=K)
cluster_labels_ouput, centroids, sse = kmeans.fit(data)

def assign_majority_labels(cluster_labels, labels):
    majority_labels = []
    for i in range(K):
        cluster_indices = np.where(cluster_labels == i)[0]
        cluster_labels_true = labels[cluster_indices]
        majority_label = Counter(cluster_labels_true).most_common(1)[0][0]
        majority_labels.append(majority_label)
    return majority_labels

majority_labels = assign_majority_labels(cluster_labels_ouput, labels)
assigned_labels = [majority_labels[label] for label in cluster_labels_ouput]
accuracy = np.sum(assigned_labels == labels) / len(labels)

print("Jaccard accuracy = ", accuracy)

Jaccard accuracy =  1.0


In [28]:
class KMeans:
    def __init__(self, n_clusters=8, max_iter=500):
        self.n_clusters = n_clusters
        self.max_iter = max_iter

    def euclidean_distance(self, x1, x2):
        return np.sqrt(np.sum((x1 - x2) ** 2))
    
    def initialize_centroids(self, X):
  
        centroids_indices = np.random.choice(len(X), size=self.n_clusters, replace=False)
        return X[centroids_indices]
    
    def closest_centroid(self, sample, centroids):

        distances = [self.euclidean_distance(sample, centroid) for centroid in centroids]
        return np.argmin(distances)
    
    def compute_sse(self, X, centroids, labels):
        sse = 0
        for i, centroid in enumerate(centroids):
            cluster_points = X[labels == i]
            sse += np.sum((cluster_points - centroid) ** 2)
        return sse
    
    def fit(self, X):

        self.centroids = self.initialize_centroids(X)
        iter_count = 0
        for iteration in range(self.max_iter):
            
            
            start_time = time.time()
            
            labels = [self.closest_centroid(sample, self.centroids) for sample in X]
            
            prev_sse = float('inf')
            
            new_centroids = np.zeros((self.n_clusters, X.shape[1]))
            
            for i in range(self.n_clusters):
                iter_count += 1
                cluster_points = X[labels == i]
                if len(cluster_points) > 0:
                    new_centroids[i] = cluster_points.mean(axis=0)
                    
            new_sse = self.compute_sse(X, new_centroids, labels)       

            if np.allclose(self.centroids, new_centroids) or iter_count >= self.max_iter or new_sse > prev_sse:
                break
                
            self.centroids = new_centroids
            prev_sse = new_sse
            

        self.labels = np.array(labels)
        self.sse = self.compute_sse(X, self.centroids, self.labels)
        
        end_time = time.time()
        convergenceTime = end_time - start_time
        
        return self.labels, self.centroids, self.sse, iteration + 1, convergenceTime


K = len(np.unique(labels))
kmeans = KMeans(n_clusters=K)
labels, centroids, sse, iteration, ConvergenceTime = kmeans.fit(data)

print("Euclidean iteration = ", iteration)
print("Euclidean ConvergenceTime = ", ConvergenceTime)

Euclidean iteration =  2
Euclidean ConvergenceTime =  0.3355274200439453


In [29]:
class KMeans:
    def __init__(self, n_clusters=8, max_iter=500):
        self.n_clusters = n_clusters
        self.max_iter = max_iter

    
    def cosine_similarity_distance(self, x1, x2):
        similarity = cosine_similarity(x1.reshape(1, -1), x2.reshape(1, -1))[0, 0]
        return 1 - similarity
    
    def initialize_centroids(self, X):
        
        centroids_indices = np.random.choice(len(X), size=self.n_clusters, replace=False)
        return X[centroids_indices]
    
    def closest_centroid(self, sample, centroids):
        distances = [self.cosine_similarity_distance(sample, centroid) for centroid in centroids]
        return np.argmin(distances)
    
    def compute_sse(self, X, centroids, labels):
        sse = 0
        for i, centroid in enumerate(centroids):
            cluster_points = X[labels == i]
            sse += np.sum((cluster_points - centroid) ** 2)
        return sse
    
    def fit(self, X):
        self.centroids = self.initialize_centroids(X)
        
        iter_count = 0
        for iteration in range(self.max_iter):
            
            start_time = time.time()
            labels = [self.closest_centroid(sample, self.centroids) for sample in X]
            
            prev_sse = float('inf')

            new_centroids = np.zeros((self.n_clusters, X.shape[1]))
            
            for i in range(self.n_clusters):
                iter_count += 1
                cluster_points = X[labels == i]
                if len(cluster_points) > 0:
                    new_centroids[i] = cluster_points.mean(axis=0)
                    
            new_sse = self.compute_sse(X, new_centroids, labels)       

            if np.allclose(self.centroids, new_centroids) or iter_count >= self.max_iter or new_sse > prev_sse:
                break
                
            self.centroids = new_centroids
            prev_sse = new_sse
            
        self.labels = np.array(labels)
        self.sse = self.compute_sse(X, self.centroids, self.labels)
        
        end_time = time.time()
        convergenceTime = end_time - start_time
        
        return self.labels, self.centroids, self.sse, iteration + 1, convergenceTime

K = len(np.unique(labels))
kmeans = KMeans(n_clusters=K)
labels, centroids, sse, iteration, ConvergenceTime = kmeans.fit(data)

print("Cosine iteration = ", iteration)
print("Cosine ConvergenceTime = ", ConvergenceTime)

Cosine iteration =  2
Cosine ConvergenceTime =  3.618988037109375


In [30]:
class KMeans:
    def __init__(self, n_clusters=8, max_iter=500):
        self.n_clusters = n_clusters
        self.max_iter = max_iter

    def jaccard_similarity_distance(self, x1, x2):
        intersection = np.sum(np.minimum(x1, x2))
        union = np.sum(np.maximum(x1, x2))
        return 1 - (intersection / union)
    
    def initialize_centroids(self, X):

        centroids_indices = np.random.choice(len(X), size=self.n_clusters, replace=False)
        return X[centroids_indices]
    
    def closest_centroid(self, sample, centroids):
        distances = [self.jaccard_similarity_distance(sample, centroid) for centroid in centroids]
        return np.argmin(distances)
    
    def compute_sse(self, X, centroids, labels):
        sse = 0
        for i, centroid in enumerate(centroids):
            cluster_points = X[labels == i]
            sse += np.sum((cluster_points - centroid) ** 2)
        return sse
    
    def fit(self, X):
        self.centroids = self.initialize_centroids(X)
        iter_count = 0
        for iteration in range(self.max_iter):
           
            start_time = time.time()

            labels = [self.closest_centroid(sample, self.centroids) for sample in X]
            
            prev_sse = float('inf')

            new_centroids = np.zeros((self.n_clusters, X.shape[1]))
            
            for i in range(self.n_clusters):
                iter_count += 1
                cluster_points = X[labels == i]
                if len(cluster_points) > 0:
                    new_centroids[i] = cluster_points.mean(axis=0)
                    
            new_sse = self.compute_sse(X, new_centroids, labels)       

            if np.allclose(self.centroids, new_centroids) or iter_count >= self.max_iter or new_sse > prev_sse:
                break
                
            self.centroids = new_centroids
            prev_sse = new_sse

        self.labels = np.array(labels)
        self.sse = self.compute_sse(X, self.centroids, self.labels)
        
        end_time = time.time()
        convergenceTime = end_time - start_time
        
        return self.labels, self.centroids, self.sse, iteration + 1, convergenceTime

K = len(np.unique(labels))
kmeans = KMeans(n_clusters=K)
labels, centroids, sse, iteration, ConvergenceTime = kmeans.fit(data)

print("Jaccard iteration = ", iteration)
print("Jaccard ConvergenceTime = ", ConvergenceTime)

Jaccard iteration =  2
Jaccard ConvergenceTime =  0.42817163467407227


In [31]:
class KMeans:
    def __init__(self, n_clusters=8, max_iter=100):
        self.n_clusters = n_clusters
        self.max_iter = max_iter

    
    def euclidean_distance(self, x1, x2):
        return np.sqrt(np.sum((x1 - x2) ** 2))
    
    def initialize_centroids(self, X):
        
        centroids_indices = np.random.choice(len(X), size=self.n_clusters, replace=False)
        return X[centroids_indices]
    
    def closest_centroid(self, sample, centroids):
        distances = [self.euclidean_distance(sample, centroid) for centroid in centroids]
        return np.argmin(distances)
    
    def compute_sse(self, X, centroids, labels):
        sse = 0
        for i, centroid in enumerate(centroids):
            cluster_points = X[labels == i]
            sse += np.sum((cluster_points - centroid) ** 2)
        return sse
    
    def fit(self, X):
        self.centroids = self.initialize_centroids(X)
        iter_count = 0
        prev_sse = float('inf')
        for iteration in range(self.max_iter):
            
            start_time = time.time()
            labels = [self.closest_centroid(sample, self.centroids) for sample in X]
            
            new_centroids = np.zeros((self.n_clusters, X.shape[1]))
            
            for i in range(self.n_clusters):
                iter_count += 1
                #print(iter_count)
                cluster_points = X[labels == i]
                if len(cluster_points) > 0:
                    new_centroids[i] = cluster_points.mean(axis=0)
                    
            new_sse = self.compute_sse(X, new_centroids, labels)       

            if np.allclose(self.centroids, new_centroids) :
                break
            
            prev_sse = new_sse
            self.centroids = new_centroids
            
        self.labels = np.array(labels)
        self.sse = self.compute_sse(X, self.centroids, self.labels)
        
        end_time = time.time()
        convergenceTime = end_time - start_time
        
        return self.labels, self.centroids, self.sse, iteration + 1, convergenceTime

K = len(np.unique(labels))
kmeans = KMeans(n_clusters=K)
labels, centroids, sse, iteration, ConvergenceTime = kmeans.fit(data)

print("Euclidean Condition 1 sse = ", sse)

Euclidean Condition 1 sse =  58095386156.0


In [32]:
class KMeans:
    def __init__(self, n_clusters=8, max_iter=100):
        self.n_clusters = n_clusters
        self.max_iter = max_iter

    
    def euclidean_distance(self, x1, x2):
        return np.sqrt(np.sum((x1 - x2) ** 2))
    
    def initialize_centroids(self, X):
        centroids_indices = np.random.choice(len(X), size=self.n_clusters, replace=False)
        return X[centroids_indices]
    
    def closest_centroid(self, sample, centroids):
        distances = [self.euclidean_distance(sample, centroid) for centroid in centroids]
        return np.argmin(distances)
    
    def compute_sse(self, X, centroids, labels):
        sse = 0
        for i, centroid in enumerate(centroids):
            cluster_points = X[labels == i]
            sse += np.sum((cluster_points - centroid) ** 2)
        return sse
    
    def fit(self, X):
        self.centroids = self.initialize_centroids(X)
        iter_count = 0
        prev_sse = float('inf')
        for iteration in range(self.max_iter):
            
            start_time = time.time()
            labels = [self.closest_centroid(sample, self.centroids) for sample in X]
            
            new_centroids = np.zeros((self.n_clusters, X.shape[1]))
            
            for i in range(self.n_clusters):
                iter_count += 1
                #print(iter_count)
                cluster_points = X[labels == i]
                if len(cluster_points) > 0:
                    new_centroids[i] = cluster_points.mean(axis=0)
                    
            new_sse = self.compute_sse(X, new_centroids, labels)       
            
            if iter_count >= self.max_iter :
                break
            
            prev_sse = new_sse
            self.centroids = new_centroids

        self.labels = np.array(labels)
        self.sse = self.compute_sse(X, self.centroids, self.labels)
        
        end_time = time.time()
        convergenceTime = end_time - start_time
        
        return self.labels, self.centroids, self.sse, iteration + 1, convergenceTime


K = len(np.unique(labels))
kmeans = KMeans(n_clusters=K)
labels, centroids, sse, iteration, ConvergenceTime = kmeans.fit(data)

print("Euclidean Condition 2 sse = ", sse)

Euclidean Condition 2 sse =  58095386156.0


In [33]:
class KMeans:
    def __init__(self, n_clusters=8, max_iter=100):
        self.n_clusters = n_clusters
        self.max_iter = max_iter

    
    def euclidean_distance(self, x1, x2):
        return np.sqrt(np.sum((x1 - x2) ** 2))
    
    def initialize_centroids(self, X):

        centroids_indices = np.random.choice(len(X), size=self.n_clusters, replace=False)
        return X[centroids_indices]
    
    def closest_centroid(self, sample, centroids):

        distances = [self.euclidean_distance(sample, centroid) for centroid in centroids]
        return np.argmin(distances)
    
    def compute_sse(self, X, centroids, labels):
        sse = 0
        for i, centroid in enumerate(centroids):
            cluster_points = X[labels == i]
            sse += np.sum((cluster_points - centroid) ** 2)
        return sse
    
    def fit(self, X):
        self.centroids = self.initialize_centroids(X)
        iter_count = 0
        prev_sse = float('inf')
        for iteration in range(self.max_iter):
            
            start_time = time.time()
            labels = [self.closest_centroid(sample, self.centroids) for sample in X]
            
            new_centroids = np.zeros((self.n_clusters, X.shape[1]))
            
            for i in range(self.n_clusters):
                iter_count += 1
                #print(iter_count)
                cluster_points = X[labels == i]
                if len(cluster_points) > 0:
                    new_centroids[i] = cluster_points.mean(axis=0)
                    
            new_sse = self.compute_sse(X, new_centroids, labels)       

            if new_sse > prev_sse :
                break
            
            prev_sse = new_sse
            self.centroids = new_centroids
            
        self.labels = np.array(labels)
        self.sse = self.compute_sse(X, self.centroids, self.labels)
        
        end_time = time.time()
        convergenceTime = end_time - start_time
        
        return self.labels, self.centroids, self.sse, iteration + 1, convergenceTime

K = len(np.unique(labels))
kmeans = KMeans(n_clusters=K)
labels, centroids, sse, iteration, ConvergenceTime = kmeans.fit(data)

print("Euclidean Condition 3 sse = ", sse)

Euclidean Condition 3 sse =  58095386156.0


In [34]:
class KMeans:
    def __init__(self, n_clusters=8, max_iter=100):
        self.n_clusters = n_clusters
        self.max_iter = max_iter

    
    def cosine_similarity_distance(self, x1, x2):
        similarity = cosine_similarity(x1.reshape(1, -1), x2.reshape(1, -1))[0, 0]
        return 1 - similarity
    
    def initialize_centroids(self, X):

        centroids_indices = np.random.choice(len(X), size=self.n_clusters, replace=False)
        return X[centroids_indices]
    
    def closest_centroid(self, sample, centroids):
        
        distances = [self.cosine_similarity_distance(sample, centroid) for centroid in centroids]
        return np.argmin(distances)
    
    def compute_sse(self, X, centroids, labels):
        sse = 0
        for i, centroid in enumerate(centroids):
            cluster_points = X[labels == i]
            sse += np.sum((cluster_points - centroid) ** 2)
        return sse
    
    def fit(self, X):
        self.centroids = self.initialize_centroids(X)
        
        iter_count = 0
        prev_sse = float('inf')
        for iteration in range(self.max_iter):
            
            start_time = time.time()
            labels = [self.closest_centroid(sample, self.centroids) for sample in X]
            
            new_centroids = np.zeros((self.n_clusters, X.shape[1]))
            
            for i in range(self.n_clusters):
                iter_count += 1
                cluster_points = X[labels == i]
                if len(cluster_points) > 0:
                    new_centroids[i] = cluster_points.mean(axis=0)
                    
            new_sse = self.compute_sse(X, new_centroids, labels)       
                    
            if np.allclose(self.centroids, new_centroids):
                break
                
            prev_sse = new_sse
            self.centroids = new_centroids
            

        self.labels = np.array(labels)
        self.sse = self.compute_sse(X, self.centroids, self.labels)
        
        end_time = time.time()
        convergenceTime = end_time - start_time
        
        return self.labels, self.centroids, self.sse, iteration + 1, convergenceTime


K = len(np.unique(labels))
kmeans = KMeans(n_clusters=K)
labels, centroids, sse, iteration, ConvergenceTime = kmeans.fit(data)

print("Cosine Condition 1 sse = ", sse)

Cosine Condition 1 sse =  58095386156.0


In [35]:
class KMeans:
    def __init__(self, n_clusters=8, max_iter=100):
        self.n_clusters = n_clusters
        self.max_iter = max_iter

    
    def cosine_similarity_distance(self, x1, x2):
        similarity = cosine_similarity(x1.reshape(1, -1), x2.reshape(1, -1))[0, 0]
        return 1 - similarity
    
    def initialize_centroids(self, X):

        centroids_indices = np.random.choice(len(X), size=self.n_clusters, replace=False)
        return X[centroids_indices]
    
    def closest_centroid(self, sample, centroids):
        
        distances = [self.cosine_similarity_distance(sample, centroid) for centroid in centroids]
        return np.argmin(distances)
    
    def compute_sse(self, X, centroids, labels):
        sse = 0
        for i, centroid in enumerate(centroids):
            cluster_points = X[labels == i]
            sse += np.sum((cluster_points - centroid) ** 2)
        return sse
    
    def fit(self, X):
        self.centroids = self.initialize_centroids(X)
        
        iter_count = 0
        prev_sse = float('inf')
        for iteration in range(self.max_iter):
            
            start_time = time.time()
 
            labels = [self.closest_centroid(sample, self.centroids) for sample in X]
            
            
    
            new_centroids = np.zeros((self.n_clusters, X.shape[1]))
            
            for i in range(self.n_clusters):
                iter_count += 1
                cluster_points = X[labels == i]
                if len(cluster_points) > 0:
                    new_centroids[i] = cluster_points.mean(axis=0)
                    
            new_sse = self.compute_sse(X, new_centroids, labels)       
            #print(iter_count)
            if iter_count >= self.max_iter :
                break
                
            prev_sse = new_sse    
            self.centroids = new_centroids
            
            
        self.labels = np.array(labels)
        self.sse = self.compute_sse(X, self.centroids, self.labels)
        
        end_time = time.time()
        convergenceTime = end_time - start_time
        
        return self.labels, self.centroids, self.sse, iteration + 1, convergenceTime


K = len(np.unique(labels))
kmeans = KMeans(n_clusters=K)
labels, centroids, sse, iteration, ConvergenceTime = kmeans.fit(data)

print("Cosine Condition 2 sse = ", sse)

Cosine Condition 2 sse =  58095386156.0


In [36]:
class KMeans:
    def __init__(self, n_clusters=8, max_iter=100):
        self.n_clusters = n_clusters
        self.max_iter = max_iter

    
    def cosine_similarity_distance(self, x1, x2):
        similarity = cosine_similarity(x1.reshape(1, -1), x2.reshape(1, -1))[0, 0]
        return 1 - similarity
    
    def initialize_centroids(self, X):
        
        centroids_indices = np.random.choice(len(X), size=self.n_clusters, replace=False)
        return X[centroids_indices]
    
    def closest_centroid(self, sample, centroids):

        distances = [self.cosine_similarity_distance(sample, centroid) for centroid in centroids]
        return np.argmin(distances)
    
    def compute_sse(self, X, centroids, labels):
        sse = 0
        for i, centroid in enumerate(centroids):
            cluster_points = X[labels == i]
            sse += np.sum((cluster_points - centroid) ** 2)
        return sse
    
    def fit(self, X):
        self.centroids = self.initialize_centroids(X)
        
        iter_count = 0
        prev_sse = float('inf')
        for iteration in range(self.max_iter):
            
            start_time = time.time()
            
            labels = [self.closest_centroid(sample, self.centroids) for sample in X]
            
            new_centroids = np.zeros((self.n_clusters, X.shape[1]))
            
            for i in range(self.n_clusters):
                iter_count += 1
                cluster_points = X[labels == i]
                if len(cluster_points) > 0:
                    new_centroids[i] = cluster_points.mean(axis=0)
                    
            new_sse = self.compute_sse(X, new_centroids, labels)       
                    
            if new_sse > prev_sse:
                break
            
            prev_sse = new_sse
            self.centroids = new_centroids

        self.labels = np.array(labels)
        self.sse = self.compute_sse(X, self.centroids, self.labels)
        
        end_time = time.time()
        convergenceTime = end_time - start_time
        
        return self.labels, self.centroids, self.sse, iteration + 1, convergenceTime


K = len(np.unique(labels))
kmeans = KMeans(n_clusters=K)
labels, centroids, sse, iteration, ConvergenceTime = kmeans.fit(data)

print("Cosine Condition 3 sse = ", sse)

Cosine Condition 3 sse =  58095386156.0


In [37]:
class KMeans:
    def __init__(self, n_clusters=8, max_iter=100):
        self.n_clusters = n_clusters
        self.max_iter = max_iter

    def jaccard_similarity_distance(self, x1, x2):
        
        intersection = np.sum(np.minimum(x1, x2))
        union = np.sum(np.maximum(x1, x2))
        return 1 - (intersection / union)
    
    def initialize_centroids(self, X):

        centroids_indices = np.random.choice(len(X), size=self.n_clusters, replace=False)
        return X[centroids_indices]
    
    def closest_centroid(self, sample, centroids):
        
        distances = [self.jaccard_similarity_distance(sample, centroid) for centroid in centroids]
        return np.argmin(distances)
    
    def compute_sse(self, X, centroids, labels):
        sse = 0
        for i, centroid in enumerate(centroids):
            cluster_points = X[labels == i]
            sse += np.sum((cluster_points - centroid) ** 2)
        return sse
    
    def fit(self, X):
        self.centroids = self.initialize_centroids(X)
        
        iter_count = 0
        prev_sse = float('inf')
        for iteration in range(self.max_iter):
            
            start_time = time.time()
            
            labels = [self.closest_centroid(sample, self.centroids) for sample in X]
            

            new_centroids = np.zeros((self.n_clusters, X.shape[1]))
            
            for i in range(self.n_clusters):
                iter_count += 1
                cluster_points = X[labels == i]
                if len(cluster_points) > 0:
                    new_centroids[i] = cluster_points.mean(axis=0)
                    
            new_sse = self.compute_sse(X, new_centroids, labels)       
                    
            if np.allclose(self.centroids, new_centroids) :
                break
              
            prev_sse = new_sse
            self.centroids = new_centroids
            
            
        self.labels = np.array(labels)
        self.sse = self.compute_sse(X, self.centroids, self.labels)
        
        end_time = time.time()
        convergenceTime = end_time - start_time
        
        return self.labels, self.centroids, self.sse, iteration + 1, convergenceTime


K = len(np.unique(labels))
kmeans = KMeans(n_clusters=K)
labels, centroids, sse, iteration, ConvergenceTime = kmeans.fit(data)

print("jaccard Condition 1 sse = ", sse)

jaccard Condition 1 sse =  58095386156.0


In [38]:
class KMeans:
    def __init__(self, n_clusters=8, max_iter=100):
        self.n_clusters = n_clusters
        self.max_iter = max_iter

    def jaccard_similarity_distance(self, x1, x2):
        intersection = np.sum(np.minimum(x1, x2))
        union = np.sum(np.maximum(x1, x2))
        return 1 - (intersection / union)
    
    def initialize_centroids(self, X):

        centroids_indices = np.random.choice(len(X), size=self.n_clusters, replace=False)
        return X[centroids_indices]
    
    def closest_centroid(self, sample, centroids):
        
        distances = [self.jaccard_similarity_distance(sample, centroid) for centroid in centroids]
        return np.argmin(distances)
    
    def compute_sse(self, X, centroids, labels):
        sse = 0
        for i, centroid in enumerate(centroids):
            cluster_points = X[labels == i]
            sse += np.sum((cluster_points - centroid) ** 2)
        return sse
    
    def fit(self, X):
        
        self.centroids = self.initialize_centroids(X)
        
        iter_count = 0
        prev_sse = float('inf')
        for iteration in range(self.max_iter):
            
            start_time = time.time()
            
            # Assign each data point to the nearest centroid based on specified distance metric
            labels = [self.closest_centroid(sample, self.centroids) for sample in X]
            
            
            
            new_centroids = np.zeros((self.n_clusters, X.shape[1]))
            
            for i in range(self.n_clusters):
                iter_count += 1
                cluster_points = X[labels == i]
                if len(cluster_points) > 0:
                    new_centroids[i] = cluster_points.mean(axis=0)
                    
            new_sse = self.compute_sse(X, new_centroids, labels)       
                    
            if iter_count >= self.max_iter :
                break
               
            prev_sse = new_sse
            self.centroids = new_centroids
            
            
        self.labels = np.array(labels)
        self.sse = self.compute_sse(X, self.centroids, self.labels)
        
        end_time = time.time()
        convergenceTime = end_time - start_time
        
        return self.labels, self.centroids, self.sse, iteration + 1, convergenceTime



K = len(np.unique(labels))
kmeans = KMeans(n_clusters=K)
labels, centroids, sse, iteration, ConvergenceTime = kmeans.fit(data)

print("jaccard Condition 2 sse = ", sse)

jaccard Condition 2 sse =  58095386156.0


In [39]:
class KMeans:
    def __init__(self, n_clusters=8, max_iter=100):
        self.n_clusters = n_clusters
        self.max_iter = max_iter

    def jaccard_similarity_distance(self, x1, x2):
        intersection = np.sum(np.minimum(x1, x2))
        union = np.sum(np.maximum(x1, x2))
        return 1 - (intersection / union)
    
    def initialize_centroids(self, X):

        centroids_indices = np.random.choice(len(X), size=self.n_clusters, replace=False)
        return X[centroids_indices]
    
    def closest_centroid(self, sample, centroids):
        

        distances = [self.jaccard_similarity_distance(sample, centroid) for centroid in centroids]
        return np.argmin(distances)
    
    def compute_sse(self, X, centroids, labels):

        sse = 0
        for i, centroid in enumerate(centroids):
            cluster_points = X[labels == i]
            sse += np.sum((cluster_points - centroid) ** 2)
        return sse
    
    def fit(self, X):
        self.centroids = self.initialize_centroids(X)
        
        iter_count = 0
        
        prev_sse = float('inf')
        for iteration in range(self.max_iter):
           # print(iteration)
            start_time = time.time()
            
            labels = [self.closest_centroid(sample, self.centroids) for sample in X]
            
            new_centroids = np.zeros((self.n_clusters, X.shape[1]))
            
            for i in range(self.n_clusters):
                iter_count += 1
                cluster_points = X[labels == i]
                if len(cluster_points) > 0:
                    new_centroids[i] = cluster_points.mean(axis=0)
                    
            new_sse = self.compute_sse(X, new_centroids, labels)       
           
            if new_sse > prev_sse:
                break
            
            prev_sse = new_sse
            self.centroids = new_centroids
            
            
        self.labels = np.array(labels)
        self.sse = self.compute_sse(X, self.centroids, self.labels)
        
        end_time = time.time()
        convergenceTime = end_time - start_time
        
        return self.labels, self.centroids, self.sse, iteration + 1, convergenceTime



K = len(np.unique(labels))
kmeans = KMeans(n_clusters=K)
labels, centroids, sse, iteration, ConvergenceTime = kmeans.fit(data)

print("jaccard Condition 3 sse = ", sse)

jaccard Condition 3 sse =  58095386156.0
