In [55]:
# K-Means steps:
# 1. initialize:
# - k = number of clusers
# - Data = Data to cluster
# - iterations - no. of iterations

# 2. repeat:
# - initiate k random centroids from data
# - calculate euclidiean distance b/w each data point and centroids and assign that data point to centroid/cluster with minimum distance
# - calculate new centroid - mean of all datapoints
# - untill old centroid == new centroid or no. of iterations

# 3. Output:
# - final k clusters and their corresponding centroids


In [2]:
import numpy as np
import sklearn.datasets as ds
from sklearn.model_selection import train_test_split

In [None]:
def getLabels(self, clusters):
    n_labels = np.empty(self.n_samples)
    for i, cluster in enumerate(clusters):
        for sample_idx in cluster:
            n_labels[sample_idx] = i

    return n_labels

In [50]:
# https://www.youtube.com/watch?v=gnGyTRFLX-k

class KMeans:
    def __init__(self, k=3, iterations=2000):
        self.k = k
        self.iterations = iterations

    def euclideanDistance(self, v1, v2):
        return np.sqrt(np.sum((v1-v2)**2))

    def createClusters(self, centroids):
        clusters = [[] for _ in range(self.k)]
        for i, samples in enumerate(self.X):
            distances = [self.euclideanDistance(samples, c) for c in centroids]
            cluster_idx = np.argmin(distances)
            clusters[cluster_idx].append(i)

        return clusters

    def getNewCentroid(self, clusters):
        new_centroids = np.zeros((self.k, self.n_features), dtype=np.float64)
        for i, c in enumerate(clusters):
            cluster_mean = np.mean(self.X[c], axis=0)
            new_centroids[i] = cluster_mean

        return new_centroids

    def isConverged(self, old_centroids, new_centroids):
        distance = [self.euclideanDistance(old_centroids[i], new_centroids[i]) for i in range(self.k)]
        return np.sum(distance) == 0

    def getLabels(self, clusters):
        n_labels = np.empty(self.n_samples)
        for i, cluster in enumerate(clusters):
            for sample in cluster:
                n_labels[sample] = i

        return n_labels

    def predict(self, X):
        self.X = X
        self.n_samples, self.n_features = X.shape
        rand_idx = np.random.choice(self.n_samples, self.k, replace=False)
        centroids = [self.X[i] for i in rand_idx]

        for _ in range(self.iterations):
            clusters = self.createClusters(centroids)
            old_centroids = centroids
            centroids = self.getNewCentroid(clusters)

            if self.isConverged(old_centroids, centroids):
                break

        return self.getLabels(clusters)

In [51]:
X, y = ds.make_blobs(n_samples=150, n_features=2, centers=3, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [52]:
def accuracy(y_true, y_pred):
    return np.sum(y_true==y_pred)/len(y_true)

In [53]:
kmeans = KMeans()
y_pred = kmeans.predict(X_train)

In [54]:
accuracy(y_train, y_pred)

np.float64(0.35)

## Optimization
Here are some ways to optimize the k-means clustering algorithm:

Random initialization of centroids: Instead of initializing the centroids using the first k data points, we can randomly initialize them to improve the convergence of the algorithm. This can be done by selecting k random data points from the input dataset as the initial centroids.

Early stopping: We can stop the k-means algorithm if the cluster assignments and centroids do not change after a certain number of iterations. This helps to avoid unnecessary computation.

Vectorization: We can use numpy arrays and vectorized operations to speed up the computation. This avoids the need for loops and makes the code more efficient.