In [2]:
from __future__ import print_function
import numpy as np
from scipy.spatial.distance import cdist
import random

# Seed NumPy's global RNG so the synthetic dataset below is reproducible.
np.random.seed(18)

# Three 2-D Gaussian blobs sharing an identity covariance, N samples each.
N = 500
cov = [[1, 0], [0, 1]]
means = [[2, 2], [8, 3], [3, 6]]

# Draw the blobs in the order of `means`; the draw order fixes the sampled values.
X0, X1, X2 = (np.random.multivariate_normal(mu, cov, N) for mu in means)

# Stack the blobs row-wise into a single data matrix.
X = np.concatenate((X0, X1, X2), axis = 0)
K = 3
# Ground-truth cluster id for each row of X: N zeros, then N ones, then N twos.
original_label = np.repeat([0, 1, 2], N)

In [6]:
# Display the ground-truth label vector as this cell's output.
# The commented-out names below are other arrays that can be put on the
# last line instead to inspect them (first blob / full data matrix).
# X0
# X
original_label

array([0, 0, 0, ..., 2, 2, 2])

Functions for K-means clustering
1. **kmeans_init_centroids** -> Initialize the first centroids
2. **kmeans_assign_labels** -> Find new labels for the data once the centroids are fixed
3. **kmeans_update_centroids** -> Update the centroids once the label of each data point is known
4. **has_converged** -> Check the stopping condition

In [7]:
def kmeans_init_centroids(X, k):
    """Pick k distinct rows of X to serve as the initial centroids.

    The row indices are drawn without replacement from NumPy's global RNG,
    so the result depends on the current `np.random` seed/state.

    Parameters
    ----------
    X : ndarray
        Data matrix with one sample per row.
    k : int
        Number of centroids to select.

    Returns
    -------
    ndarray
        The k selected rows of X.
    """
    n_samples = X.shape[0]
    chosen_rows = np.random.choice(n_samples, size=k, replace=False)
    return X[chosen_rows]

def kmeans_assign_labels(X, centroids):
    """Label every sample with the index of its nearest centroid.

    Distances are computed with `cdist` using its default metric.

    Parameters
    ----------
    X : ndarray
        Data matrix with one sample per row.
    centroids : ndarray
        Centroid matrix with one centroid per row.

    Returns
    -------
    ndarray
        For each row of X, the row index in `centroids` of the closest centroid.
    """
    # distances[i, j] is the distance between sample i and centroid j
    distances = cdist(X, centroids)
    nearest = distances.argmin(axis=1)
    return nearest

def has_converged(centroids, new_centroids):
    """Tell whether two centroid collections hold exactly the same points.

    Each centroid is turned into a tuple and the two collections are compared
    as sets, so the comparison ignores row order and uses exact equality of
    the coordinates (no tolerance).

    Returns
    -------
    bool
        True when both collections contain the same set of centroids.
    """
    previous = {tuple(row) for row in centroids}
    current = {tuple(row) for row in new_centroids}
    return previous == current

def kmeans_update_centroids(X, labels, K):
    """Recompute the K centroids as the mean of the samples assigned to each.

    A cluster that received no samples cannot be averaged (the mean of an
    empty selection is NaN, which would also prevent any later equality-based
    convergence check from ever succeeding). Such clusters are instead
    re-seeded on the samples that lie farthest from their own centroid, one
    sample per empty cluster, which keeps the update deterministic.

    Parameters
    ----------
    X : ndarray
        Data matrix with one sample per row.
    labels : array-like of int
        Cluster index in ``range(K)`` for each row of X.
    K : int
        Number of clusters.

    Returns
    -------
    ndarray of shape (K, X.shape[1])
        The updated centroids, one per row.
    """
    labels = np.asarray(labels)
    centroids = np.zeros((K, X.shape[1]))
    empty_clusters = []

    for k in range(K):
        # collect all points that are assigned to the k-th cluster
        members = X[labels == k, :]
        if members.shape[0] == 0:
            # nothing to average; handled after the populated clusters are known
            empty_clusters.append(k)
            continue
        centroids[k, :] = np.mean(members, axis=0)   # then take avg

    if empty_clusters:
        # Distance from every sample to the centroid of the cluster it belongs to.
        own_centroid_dist = np.linalg.norm(X - centroids[labels], axis=1)
        # Worst-fitted samples first; each empty cluster takes the next one.
        farthest_first = np.argsort(-own_centroid_dist)
        for k, sample_idx in zip(empty_clusters, farthest_first):
            centroids[k, :] = X[sample_idx]

    return centroids

K-means clustering

In [8]:
def kmeans(X, K, max_iter=1000):
    """Cluster the rows of X into K groups by alternating assign/update steps.

    Starting from centroids chosen by ``kmeans_init_centroids``, the loop
    assigns each sample to its nearest centroid, recomputes the centroids,
    and stops once ``has_converged`` reports that they no longer change.

    Parameters
    ----------
    X : ndarray
        Data matrix with one sample per row.
    K : int
        Number of clusters.
    max_iter : int or None, optional
        Upper bound on the number of centroid updates. The loop is otherwise
        unbounded and would never return if the centroids never compare equal
        (for example when one of them becomes NaN). ``None`` disables the
        bound.

    Returns
    -------
    tuple
        ``(centroids, labels, it)`` where ``centroids`` is the list of
        centroid arrays (``centroids[0]`` initial, ``centroids[-1]`` final),
        ``labels`` is the list of label arrays with ``labels[i]`` computed
        from ``centroids[i]``, and ``it`` is the number of centroid updates
        that were performed.
    """
    import warnings

    centroids = [kmeans_init_centroids(X, K)]
    labels = []
    it = 0
    while True:
        labels.append(kmeans_assign_labels(X, centroids[-1]))
        new_centroids = kmeans_update_centroids(X, labels[-1], K)
        if has_converged(centroids[-1], new_centroids):
            break
        if max_iter is not None and it >= max_iter:
            # Stop without appending so labels[-1] still matches centroids[-1].
            warnings.warn(
                "kmeans stopped after reaching max_iter=%d without converging" % max_iter,
                RuntimeWarning,
            )
            break
        centroids.append(new_centroids)
        it += 1
    return (centroids, labels, it)


In [10]:
# Cluster the synthetic data with the K-means implementation from this notebook.
# centroids[-1] is the last entry of the centroid history, i.e. the final centers.
centroids, labels, it = kmeans(X, K)

print('Centers found by our algorithm:\n', centroids[-1])

Centers found by our algorithm:
 [[1.9834967  1.96588127]
 [3.02702878 5.95686115]
 [8.07476866 3.01494931]]


# Using libs

In [11]:
from sklearn.cluster import KMeans

# Fit scikit-learn's K-means on the same data for comparison.
# n_clusters reuses K so both runs always use the same number of clusters.
# n_init is pinned because its default changed between scikit-learn releases;
# fixing it keeps the result stable and avoids the related FutureWarning.
model = KMeans(n_clusters=K, n_init=10, random_state=0).fit(X)

print('Center found:')
print(model.cluster_centers_)

# Cluster index predicted by the fitted model for every sample in X.
pred_labels = model.predict(X)



Center found:
[[8.07476866 3.01494931]
 [3.02429957 5.95334038]
 [1.98417154 1.96141961]]


# Classify handwriting (should be run on Colab)

In [None]:
from __future__ import print_function 
import numpy as np 
from sklearn.datasets import fetch_openml

data_dir = 'Self-Learning/Data'
# path to your data folder 
# On OpenML the MNIST dataset is published as 'mnist_784'; 'mnist-original'
# was the name used by the retired mldata.org service and is not found there.
# The version is pinned so the same copy of the dataset is fetched every time.
mnist = fetch_openml('mnist_784', version=1, data_home=data_dir) 
print("Shape of minst data:", mnist.data.shape)