In [9]:
"""Kmeans
1. assign centroids (start with random)
2. Assign points to closest centroid
3. find new centroid
4. Measure sum of squares

based on: https://gist.github.com/ImadDabbura/6e2230b33373991aa3ccdbff6ebb3fd7#file-kmeans-py
"""


'Kmeans\n1. assign centroids (start with random)\n2. Assign points to closest centroid\n3. find new centroid\n4. Measure sum of squares\n5. \n'

In [7]:
import numpy.typing as npt
import numpy as np
from sklearn.datasets import make_blobs

In [8]:
class Kmeans:
    def __init__(self, max_iter=100, n_clusters=3, random_state=0, n_init=3):
        self.n_clusters = n_clusters
        self.random_state = random_state
        self.max_iter = max_iter
        self.n_init

    def _initialize_centroids(self, X):
        np.random.RandomState(self.random_state)
        random_idx = np.random.permutation(X.shape[0])
        return X[random_idx[:self.n_clusters]]

    def _calculate_centroids(self, X, labels):
        """Given points with labels, calculate their centroids

        calculate means of all points with a label
        this comes after the initialization part that makes initial
        labels

        Use axis=0 because at the end we want a vector of means from 
        each dimensions.
        """

        centroids = np.zeros((self.n_clusters, X.shape[1]))
        for i in range(self.n_clusters):
            centroids[i, :] = np.mean(X[labels == i], axis=0)
        return centroids
    
    def _calculate_distance(self, X, centroids):
        """keep track distance from point to each cluster
        axis 0 is the index of the point
        axis 1 is the index of the cluster
        value at i j is the distance for point i to cluster j (in indices)
        """
        self.distances = np.zeros((X.shape[0], self.n_clusters))
        for i in range(self.n_clusters):
            self.distances[:, i] = np.linalg.norm(X-centroids[i, :], axis=1)
        return self.distances

    def _find_closest_cluster(self, distance):
        return np.argmin(distance, axis=1)

    def _update_centroids(self, X, centroids) -> None:
        dist = self._calculate_distance(X, self.centroids)
        labels = self._find_closest_cluster(dist)
        centroids = self._calculate_centroids(X, labels)
        return centroids

    def _check_stopping_criterion(self, old_centroids, new_centroids):
        return np.all(old_centroids == new_centroids)

    def _compute_sse(self, X, labels):
        distance = np.zeros(X.shape[0])
        for i in range(self.n_clusters):
            distance[labels == i] = np.linalg.norm(X[labels == i] - centroids[i], axis=1)
        return np.sum(np.square(distance))

    def fit(self, X: npt.ArrayLike):
        self.centroids = self._initialize_centroids(X)
        for i in range(self.max_iter):
            new_centroids = self._one_iteration(X, self.centroids)
            if self._check_stopping_criterion(self.centroids, new_centroids):
                break
            self.centroids = new_centroids
        self._error = self._compute_sse(X, self.centroids, self.labels)
    
    def predict(self):
        distance = self._calculate_distance(X, self.centroids)
        return self._find_closest_cluster(distance)

In [62]:
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=20, n_features=2, centers=3)

In [63]:
# X = np.array([[1, 2], [1, 4], [1, 0],
#               [10, 2], [10, 4], [10, 0]])

In [64]:
z = np.zeros((X.shape[0], X.shape[1]))

In [65]:
X

array([[ 3.71484359, -4.4255989 ],
       [ 3.70237994, -6.39217025],
       [ 8.05893211, -0.14708853],
       [ 8.0894992 ,  0.49054904],
       [ 3.79074203,  8.15207942],
       [ 3.83502883,  9.00940876],
       [ 8.52225541,  2.07436062],
       [ 9.28516481,  0.26519391],
       [ 2.10390175, -6.37276099],
       [ 8.26827564,  2.47078827],
       [ 3.10314681, -7.6509839 ],
       [ 8.70491892,  0.71898582],
       [ 3.81603864, 11.07256391],
       [ 6.49861228,  9.98923315],
       [ 3.58542397,  9.19161513],
       [ 1.43307457, -7.36466298],
       [-0.80906783, -6.43390231],
       [ 0.67071901, -8.49046643],
       [ 6.08596313, 10.04060336],
       [ 3.89785137,  9.18125094]])

In [66]:
y

array([1, 1, 2, 2, 0, 0, 2, 2, 1, 2, 1, 2, 0, 0, 0, 1, 1, 1, 0, 0])

In [67]:
# use the index that changes is the 1st axis
a = np.linalg.norm(X, axis=1)
print(a)

# use the index that changes is the 0th axis
b = np.linalg.norm(X, axis=0)
print(b)

# Treats every value in the np.array as a vector
c = np.linalg.norm(X) # defaults to axis=-1
print(c)

[ 5.77806097  7.38697893  8.0602743   8.10435905  8.99033504  9.79167465
  8.771078    9.28895115  6.71107184  8.62955252  8.25633543  8.7345609
 11.71169596 11.91707768  9.86615699  7.50279705  6.48457321  8.51691753
 11.74106738  9.97439793]
[25.08086928 31.27588945]
40.09028891000603


Compute centroids

make them random

In [82]:
np.random.RandomState(0)
idxs = np.random.permutation(X.shape[0])[:3]
n_clusters = 3
centroids = X[idxs]
print(centroids)

[[ 8.52225541  2.07436062]
 [ 3.58542397  9.19161513]
 [ 1.43307457 -7.36466298]]


In [83]:
distances = np.zeros((X.shape[0], n_clusters))
for i in range(centroids.shape[0]):
    distances[:,i] = np.linalg.norm(X - centroids[i,:], axis=1)

In [84]:
distances

array([[ 8.08459536, 13.61782903,  3.72082888],
       [ 9.742348  , 15.58422425,  2.4689044 ],
       [ 2.26925203, 10.35488585,  9.79772265],
       [ 1.64186998,  9.7977163 , 10.29622964],
       [ 7.70232983,  1.05961785, 15.69483644],
       [ 8.370483  ,  0.30903357, 16.54930844],
       [ 0.        ,  8.66184832, 11.80473005],
       [ 1.96344466, 10.59094148, 10.94851759],
       [10.60891734, 15.63472776,  1.19744673],
       [ 0.47080846,  8.19137433, 11.97731502],
       [11.1332414 , 16.84950247,  1.69443824],
       [ 1.36762817,  9.89922603, 10.87313654],
       [10.1546117 ,  1.89503336, 18.59058507],
       [ 8.16947603,  3.02040736, 18.07809125],
       [ 8.66184832,  0.        , 16.69559681],
       [11.80473005, 16.69559681,  0.        ],
       [12.62791081, 16.23170827,  2.42765689],
       [13.1629098 , 17.92070069,  1.35963944],
       [ 8.33045878,  2.6407342 , 18.0164555 ],
       [ 8.47897416,  0.31259925, 16.72849043]])

In [88]:
# assign point indices to centroid
points_to_centroids = np.argmin(distances, axis=1)
new_centroids = np.zeros((centroids.shape[0], centroids.shape[1]))
#calcultate new centroids
for i in range(n_clusters):
    new_centroids[i,:] = np.mean(X[points_to_centroids == i], axis = 0)

In [89]:
centroids

array([[ 8.52225541,  2.07436062],
       [ 3.58542397,  9.19161513],
       [ 1.43307457, -7.36466298]])

In [90]:
new_centroids

array([[ 8.48817435,  0.97879819],
       [ 4.50138004,  9.51953638],
       [ 1.98842826, -6.73293511]])

In [None]:
points_to_centroids

In [73]:
X[points_to_centroids == 0]

array([[ 3.79074203,  8.15207942],
       [ 3.83502883,  9.00940876],
       [ 8.52225541,  2.07436062],
       [ 8.26827564,  2.47078827],
       [ 3.81603864, 11.07256391],
       [ 6.49861228,  9.98923315],
       [ 3.58542397,  9.19161513],
       [ 6.08596313, 10.04060336],
       [ 3.89785137,  9.18125094]])

In [74]:
n_clusters = 3

In [75]:
d = np.zeros((X.shape[0], n_clusters))

In [76]:
centroids = np.zeros((X.shape[0]))

In [27]:
X

array([[ 1,  2],
       [ 1,  4],
       [ 1,  0],
       [10,  2],
       [10,  4],
       [10,  0]])

In [28]:
X - np.array([0, 1])

array([[ 1,  1],
       [ 1,  3],
       [ 1, -1],
       [10,  1],
       [10,  3],
       [10, -1]])

In [19]:
np.linalg.norm(X, axis=0)

array([17.40689519,  6.32455532])

In [20]:
np.linalg.norm(X, axis=-1)

array([ 2.23606798,  4.12310563,  1.        , 10.19803903, 10.77032961,
       10.        ])

In [18]:
np.linalg.norm([1, 2])

2.23606797749979

In [None]:
indices = np.random.permutation(X.shape[0])[:3]
X[indices]

array([[-6.15567947,  6.34747374],
       [ 1.54051699, 11.07247095],
       [ 2.79473606,  9.54914352]])

In [None]:
n_clusters = 3
centroids = np.zeros([n_clusters, X.shape[1]])

In [None]:
centroids = np.zeros([n_clusters, X.shape[1]])
for i in range(n_clusters):
    centroids[i, :] = np.mean(X[y==i,:])

In [None]:
centroids

array([[ 5.79586995,  5.79586995],
       [ 0.98315303,  0.98315303],
       [-2.39656233, -2.39656233]])

In [None]:
for k in range(3):
    row_norm = np.linalg.norm(X - centroids[k,:], axis = 1)

In [None]:
centroids[k,:]

array([0., 0.])

In [None]:
X - centroids[k,:]

array([[ 1.20332391, -7.79000064],
       [-4.59152584, -1.25603297],
       [-1.18352557,  1.61185612],
       [-2.0702635 ,  0.85061488],
       [-6.2392547 , -2.36277651],
       [-1.10316914,  1.40987289],
       [-6.18760376, -0.51220793],
       [-5.8397628 , -2.11773283],
       [ 3.21787204, -8.42143222],
       [ 2.55957135, -8.49931821]])

In [None]:
centroids

array([[0., 0.],
       [0., 0.],
       [0., 0.]])

In [None]:
centroids[1,:]

array([0., 0.])

In [None]:
np.mean(X[y == 1] - centroids[1])

-1.681179242137522

In [None]:
centroids = np.zeros([n_clusters, X.shape[1]])
for i in range(n_clusters):
    centroids[i, :] = np.mean(X[y==i,:])

In [None]:
distances = np.zeros([X.shape[0], n_clusters])
distances

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [None]:
for i in range(n_clusters):
    row_norm = np.linalg.norm(X-centroids[i, :], axis=1)
    print(row_norm)
    distances[:, i] = np.square(row_norm)

[4.86626359 6.09534653 5.7713897  3.12791481 6.38040577 3.73175087
 4.8936788  2.83744157 2.90422091 4.86187287]
[4.65893471 6.06235781 5.7580809  3.08627434 6.21231399 3.76089245
 4.79850669 2.80205649 2.90075223 4.95302046]
[3.2820453  6.43996045 6.35134411 3.95746636 5.12324541 5.00266682
 4.74978976 3.81797369 4.11393754 6.49165994]


In [None]:
X-centroids[i, :]

array([[ 2.66894769, -1.91011507],
       [-5.85584277,  2.67996195],
       [-5.96803287,  2.17305214],
       [ 0.40501895, -3.93668637],
       [ 4.13642169, -3.02285612],
       [ 0.27081232, -4.99533142],
       [ 2.28929042, -4.16168861],
       [ 0.13585938, -3.8155557 ],
       [-4.11391248, -0.01435964],
       [-6.4912157 ,  0.07594429]])

In [None]:
distances

array([[23.68052136, 21.70567261, 10.77182133],
       [37.15324931, 36.75218227, 41.47309056],
       [33.30893905, 33.15549567, 40.33957196],
       [ 9.78385106,  9.52508929, 15.66153997],
       [40.70957773, 38.59284514, 26.24764357],
       [13.92596454, 14.14431202, 25.02667535],
       [23.94809215, 23.02566641, 22.56050273],
       [ 8.05107466,  7.85152056, 14.57692309],
       [ 8.43449908,  8.41436347, 16.92448205],
       [23.63780781, 24.53241167, 42.14164882]])