In [1]:
# K-means

# Choose K (the number of clusters).
# Randomly initialize K centroids (cluster centers).
# Assign each data point to the nearest centroid.
# Recalculate the centroids based on the assigned points.
# Repeat the process until centroids don't change significantly.

In [3]:
import numpy as np

def _nearest_centroid(X, centroids):
    """Return, for every row of X, the index of the closest centroid (Euclidean)."""
    # Broadcasting (n, 1, d) - (K, d) -> (n, K, d); the norm over the last axis
    # gives an (n, K) matrix of point-to-centroid distances.
    distances = np.linalg.norm(X[:, np.newaxis] - centroids, axis=2)
    return np.argmin(distances, axis=1)


def k_means(X, K, max_iters=100, tol=0.0):
    """Cluster the rows of X into K groups with Lloyd's K-means algorithm.

    Parameters
    ----------
    X : array-like, expected 2-D (one sample per row)
        Data to cluster. Converted with ``np.asarray``.
    K : int
        Number of clusters; must satisfy ``1 <= K <= number of rows of X``.
    max_iters : int, default 100
        Upper bound on the number of assign/update iterations.
    tol : float, default 0.0
        Absolute tolerance on centroid movement used as the stopping
        criterion. The default of 0.0 stops only when the centroids are
        exactly unchanged.

    Returns
    -------
    centroids : ndarray with K rows
        Final cluster centers.
    labels : ndarray with one entry per row of X
        Index of the nearest returned centroid for every sample.

    Raises
    ------
    ValueError
        If K is smaller than 1 or larger than the number of samples.

    Notes
    -----
    Initial centroids are K distinct rows of X drawn through the global
    ``np.random`` state; call ``np.random.seed`` beforehand for reproducible
    results. The data shape and the initial centroids are printed.
    """
    X = np.asarray(X)
    n_samples = X.shape[0]
    if not 1 <= K <= n_samples:
        raise ValueError(
            f"K must be between 1 and the number of samples ({n_samples}), got {K}"
        )

    # Step 1: Randomly initialize K centroids from distinct rows of X
    centroids = X[np.random.choice(n_samples, K, replace=False)]
    print(f"shape of the data is: {X.shape}")
    print(f"Initial Centroids are {centroids}")

    for _ in range(max_iters):
        # Step 2: Assign each point to the nearest centroid
        labels = _nearest_centroid(X, centroids)

        # Step 3: Recompute centroids as the mean of the assigned points.
        # A cluster that lost all of its points keeps its previous centroid;
        # taking the mean of an empty selection would produce NaN and poison
        # every later distance computation.
        new_centroids = np.array([
            X[labels == k].mean(axis=0) if np.any(labels == k) else centroids[k]
            for k in range(K)
        ])

        # Stop once no centroid moved by more than tol
        if np.allclose(new_centroids, centroids, rtol=0.0, atol=tol):
            break

        centroids = new_centroids
    else:
        # The loop ran out of iterations (or max_iters == 0): the labels are
        # either undefined or belong to the previous centroids, so compute
        # them against the centroids that are actually returned.
        labels = _nearest_centroid(X, centroids)

    return centroids, labels


# Demo: cluster 100 random 2-D points into 3 groups with k_means.
# No random seed is set in this cell, so the data, the initial centroids and
# the printed result change from run to run.
X = np.random.rand(100, 2)
centroids, labels = k_means(X, 3)
print(f"Final Centroids are: {centroids}")


shape of the data is: (100, 2)
Initial Centroids are [[0.57249521 0.32411916]
 [0.46733017 0.44358338]
 [0.14254871 0.57289113]]
Final Centroids are: [[0.64494801 0.16046111]
 [0.71111596 0.79134891]
 [0.22108557 0.47647313]]


In [8]:
# Show the first row of X (a single sample).
print(X[0])

[0.94180168 0.17270976]


In [9]:
import numpy as np

class KMeans:
    """K-means clustering (Lloyd's algorithm) with a fit/predict interface.

    Parameters
    ----------
    K : int
        Number of clusters.
    max_iters : int, default 100
        Upper bound on the number of assign/update iterations in ``fit``.
    tol : float, default 0.0
        Absolute tolerance on centroid movement used as the stopping
        criterion. The default of 0.0 stops only when the centroids are
        exactly unchanged.

    Attributes
    ----------
    centroids : ndarray or None
        Cluster centers found by ``fit``; None before fitting.
    labels : ndarray or None
        Cluster index of every training sample; None before fitting.
    """

    def __init__(self, K, max_iters=100, tol=0.0):
        self.K = K  # Number of clusters
        self.max_iters = max_iters  # Maximum number of iterations
        self.tol = tol  # Largest centroid movement still treated as converged
        self.centroids = None  # To store the centroids
        self.labels = None  # To store the cluster assignments

    @staticmethod
    def _nearest_centroid(X, centroids):
        """Return, for every row of X, the index of the closest centroid."""
        # Broadcasting (n, 1, d) - (K, d) -> (n, K, d); the norm over the last
        # axis gives an (n, K) matrix of point-to-centroid distances.
        distances = np.linalg.norm(X[:, np.newaxis] - centroids, axis=2)
        return np.argmin(distances, axis=1)

    def fit(self, X):
        """Compute the centroids for X and store them together with the labels.

        Initial centroids are K distinct rows of X drawn through the global
        ``np.random`` state. The data shape and the initial centroids are
        printed.

        Raises
        ------
        ValueError
            If K is smaller than 1 or larger than the number of rows of X.
        """
        X = np.asarray(X)
        n_samples = X.shape[0]
        if not 1 <= self.K <= n_samples:
            raise ValueError(
                f"K must be between 1 and the number of samples ({n_samples}), "
                f"got {self.K}"
            )

        # Step 1: Randomly initialize K centroids from distinct rows of X
        centroids = X[np.random.choice(n_samples, self.K, replace=False)]
        print(f"shape of the data is: {X.shape}")
        print(f"Initial Centroids are {centroids}")

        for _ in range(self.max_iters):
            # Step 2: Assign each point to the nearest centroid
            labels = self._nearest_centroid(X, centroids)

            # Step 3: Recompute centroids as the mean of the assigned points.
            # A cluster that lost all of its points keeps its previous
            # centroid; the mean of an empty selection would be NaN.
            new_centroids = np.array([
                X[labels == k].mean(axis=0) if np.any(labels == k) else centroids[k]
                for k in range(self.K)
            ])

            # Stop once no centroid moved by more than tol
            if np.allclose(new_centroids, centroids, rtol=0.0, atol=self.tol):
                break

            centroids = new_centroids
        else:
            # The loop ran out of iterations (or max_iters == 0): assign the
            # labels against the centroids that are actually stored so that
            # get_labels() agrees with predict() on the training data.
            labels = self._nearest_centroid(X, centroids)

        self.centroids = centroids
        self.labels = labels

    def predict(self, X):
        """Assign each row of X to the nearest fitted centroid.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.centroids is None:
            raise RuntimeError("KMeans.predict() called before fit()")
        return self._nearest_centroid(np.asarray(X), self.centroids)

    def get_centroids(self):
        """Return the final centroids (None before ``fit``)."""
        return self.centroids

    def get_labels(self):
        """Return the cluster assignments of the training data (None before ``fit``)."""
        return self.labels

# Demo of the class-based API on a fresh random data set.
# No random seed is set in this cell, so the printed centroids change between runs.
X = np.random.rand(100, 2)  # Generate random 2D data
kmeans = KMeans(K=3)  # Create a KMeans instance with 3 clusters
kmeans.fit(X)  # Fit the model to the data
print(f"Final Centroids are: {kmeans.get_centroids()}")


shape of the data is: (100, 2)
Initial Centroids are [[0.62858744 0.90034202]
 [0.99865033 0.9489701 ]
 [0.02275996 0.01538559]]
Final Centroids are: [[0.35729255 0.80049632]
 [0.88367998 0.57822229]
 [0.35682423 0.23522763]]
