# K-Means Algorithm for Image Clustering ( Digit MNIST )

 * Implementing K-Means on a dataset with 5610 samples of numbers 1 to 5
 * Testing Different numbers of K from 3 to 7 to measure its effect on results
 * Implemented by Mohammad Hassan Heydari


In [None]:
import numpy as np
from PIL import Image

* In this part, we read the images from the `images` directory: five images, one for each number from 1 to 5. Each image is a grid of 34 * 33 small images stuck together, and each small image shows that specific number in 16*16 pixels. We slice every image into its small images and convert them into a numpy.ndarray for our computations.


In [None]:
def data_redaer():
    """Load the five digit sheets and slice them into flattened 16x16 tiles.

    Reads ``images/usps_1.jpg`` through ``images/usps_5.jpg``. Each sheet is
    cut into non-overlapping 16x16 tiles, scanned row by row and left to
    right, and every tile is flattened into a 256-element vector.

    Returns:
        numpy.ndarray: one row per tile, in the order the sheets were read.
    """
    tile_size = 16
    dataset = []

    for image_number in range(1, 6):  # five sheets: usps_1 ... usps_5
        # The context manager closes the file once the pixels are copied out.
        with Image.open(f'images/usps_{image_number}.jpg') as img:
            # Force single-channel 8-bit pixels so that every tile flattens
            # to exactly tile_size * tile_size values, even if the JPEG was
            # stored with colour channels.
            img_array = np.array(img.convert('L'))

        height, width = img_array.shape

        # Stop before any trailing partial tile: a sheet whose size is not a
        # multiple of tile_size would otherwise yield shorter vectors and
        # make the final array ragged.
        for top in range(0, height - tile_size + 1, tile_size):
            for left in range(0, width - tile_size + 1, tile_size):
                tile = img_array[top:top + tile_size, left:left + tile_size]

                # flatten the 16*16 tile into a 256-element vector
                dataset.append(tile.flatten())

    return np.array(dataset)


* In the part below, we assign each member of the dataset to its nearest centroid

In [None]:

def assign_labels(dataset, centroids):
    """Return the index of the nearest centroid for every sample.

    Args:
        dataset: array of shape (n_samples, n_features).
        centroids: array of shape (n_centroids, n_features).

    Returns:
        numpy.ndarray: shape (n_samples,), where entry ``j`` is the row index
        in ``centroids`` with the smallest Euclidean distance to sample ``j``.
    """
    # Cast to float before subtracting. With unsigned integer inputs (e.g.
    # uint8 pixel data) the difference wraps around (10 - 20 -> 246), which
    # corrupts the distances and therefore the chosen labels.
    samples = np.asarray(dataset, dtype=float)
    centers = np.asarray(centroids, dtype=float)

    # Broadcasting (n_samples, 1, n_features) against (n_centroids, n_features)
    # yields every sample-to-centroid difference in one array.
    distances = np.linalg.norm(samples[:, np.newaxis] - centers, axis=2)

    return np.argmin(distances, axis=1)

In [None]:
def update_centroids(dataset, labels, K):
    """Recompute each centroid as the mean of the samples assigned to it.

    Args:
        dataset: array of shape (n_samples, n_features).
        labels: array of shape (n_samples,) holding a cluster index per sample.
        K: number of clusters.

    Returns:
        numpy.ndarray: float array of shape (K, n_features). A cluster that
        received no samples keeps an all-zero centroid.
    """
    n_features = dataset.shape[1]
    # Use the parameter K here; the previous lowercase `k` only resolved when
    # a global of that name happened to exist.
    updated_centroids = np.zeros((K, n_features))

    for cluster_index in range(K):
        cluster_points = dataset[labels == cluster_index]

        # Skip empty clusters: the mean of zero rows is undefined (NaN).
        if len(cluster_points) > 0:
            updated_centroids[cluster_index] = np.mean(cluster_points, axis=0)

    return updated_centroids

* In the main function of the project, we run the K-Means algorithm on our dataset for 100 iterations by default

In [None]:
def k_means_image_clustering(dataset, k, init_centroids, max_iters=100):
    """Run K-Means on ``dataset`` for ``max_iters`` iterations.

    Every iteration assigns each sample to its nearest centroid and then
    recomputes the centroids from those assignments. Progress is printed
    once per iteration.

    Args:
        dataset: array of shape (n_samples, n_features).
        k: number of clusters.
        init_centroids: starting centroids, one row per cluster.
        max_iters: number of assign/update iterations to run.

    Returns:
        tuple: ``(centroids, labels)``. ``labels`` are the assignments made
        in the last iteration, i.e. against the centroids as they were
        before the final update.
    """
    centroids = init_centroids
    labels = None

    for i in range(max_iters):
        # for showing the process , we simply add this part of code
        print(f'Epoch : {i} | K = {k}')

        labels = assign_labels(dataset, centroids)
        centroids = update_centroids(dataset, labels, k)

    # With max_iters <= 0 the loop never runs; label against the initial
    # centroids instead of failing on an unbound variable.
    if labels is None:
        labels = assign_labels(dataset, centroids)

    return centroids, labels

* In the main part of the project, we initialize K centroids for each value of K and run K-Means with them to measure how K affects the results; then we save the resulting centroids as images in the directory for that K

In [None]:
import os

# Load every 16x16 tile as a flattened row.
dataset = data_redaer()

for k in [3, 4, 5, 6, 7]:

    # Sample without replacement so the same sample cannot be picked twice
    # as an initial centroid.
    init_indices = np.random.choice(len(dataset), k, replace=False)
    init_centroids = dataset[init_indices]

    # Run k-means clustering
    centroids, labels = k_means_image_clustering(dataset, k, init_centroids=init_centroids)

    # Reshape the centroids into images
    centroids = centroids.reshape((k, 16, 16))

    # Make sure the per-K output directory exists before saving into it.
    output_dir = f"centroids/{k}"
    os.makedirs(output_dir, exist_ok=True)

    for i in range(k):

        # Centroids are float means of pixel values; convert to 8-bit pixels.
        image_data = np.uint8(centroids[i])
        image = Image.fromarray(image_data)

        image.save(f"{output_dir}/centroid{i + 1}.png")