In [3]:
import numpy as np
import matplotlib.pyplot as plt

In [16]:
def choose_initial_cluster_centers(points, clusters):
    us = np.zeros((clusters,784))         # centers of each cluster
    indicies = np.zeros(clusters)

    index = np.random.choice(points.shape[0], 1)[0]         # choose first point randomly
    indicies[0] = index
    us[0] =  points[index]

    for i in range(1,clusters):
        max_distance = 0
        for j in range(points.shape[0]):    # choose the center with the biggest distancce form last center
            if j not in indicies:           # check if the the point is already a center to cluster
                point = np.array(points[j])
                last_point = us[i-1]
                distance = np.linalg.norm(point-last_point)     # Eucledian distance

                if distance > max_distance:
                    max_distance = distance
                    index = j
                    u = point
        
        
        indicies[i] = index
        us[i] = u

    return us

In [25]:
def compute_membership(centers, point, rnks):
    min_distance = np.linalg.norm(point-centers[0])

    index = 0
    for i in range(1,centers.shape[0]):
        
        center = centers[i]
        distance = np.linalg.norm(point-center)

        if distance < min_distance and distance > 0:
            min_distance = distance
            index = i
        elif distance == min_distance and distance > 0:      # if the distances are the same between two centers, choose the one with more members
            if rnks[i] > rnks[index]:
                index = i

    return index, min_distance

In [30]:
def kmeans(points, clusters):

    old_centers = np.zeros((clusters,784))
    centers = choose_initial_cluster_centers(points, clusters)
    points_sum = np.zeros((10,784))
    distances = np.zeros(clusters)  # sum of distances in each cluster from the points to the cluster centers

    
    for limit in range(1000):
        
        if (centers == old_centers).all():      # stop if the algorithm converges
            break

        print("Iteration Number : ")
        print(limit)

        points_sum = np.zeros((10,784))
        rnks = np.zeros(clusters)       # used for counting number of members in each cluster
        distances = np.zeros(clusters)  # sum of distances in each cluster from the points to the cluster centers
        old_centers = centers

        for i in range(points.shape[0]):
            point = points[i]
            rnk, distance = compute_membership(centers, point, rnks)    # returns the cluster number of the point and its distance from the mean of the cluster
            rnks[rnk] += 1
            distances[rnk] += distance
            points_sum[rnk] += point
        
        for i in range(clusters): 
            centers[i] = points_sum[i] / rnks[i]         # Adjust the cluster centers

    sum_distances = np.sum(distances)
    
    return centers, sum_distances, rnks

In [7]:
images = []

for i in range(2400):
    img = plt.imread("Images/" + str((i+1)) + ".jpg")
    pixels = []
    for j in range(28):
        for k in range(28):
            pixels.append(img[j,k])
    
    images.append(pixels)
    
images = np.asarray(images)
print(images.shape)

(2400, 784)


In [8]:
imagesT = np.transpose(images)
print(imagesT.shape)

(784, 2400)


In [9]:
labels = np.genfromtxt('Images/Training Labels.txt')
print(labels)

[0. 0. 0. ... 9. 9. 9.]


In [32]:
centers_arr = np.zeros((30,10,784))
distances_arr = np.zeros(30)
rnks = np.zeros((30,10))

for i in range(30):
    centers_arr[i], distances_arr[i], rnks[i] = kmeans(images,10)
    
print(rnks)

Iteration Number : 
0
Iteration Number : 
0
Iteration Number : 
0
Iteration Number : 
0
Iteration Number : 
0
Iteration Number : 
0
Iteration Number : 
0
Iteration Number : 
0
Iteration Number : 
0
Iteration Number : 
0
Iteration Number : 
0
Iteration Number : 
0
Iteration Number : 
0
Iteration Number : 
0
Iteration Number : 
0
Iteration Number : 
0
Iteration Number : 
0
Iteration Number : 
0
Iteration Number : 
0
Iteration Number : 
0
Iteration Number : 
0
Iteration Number : 
0
Iteration Number : 
0
Iteration Number : 
0
Iteration Number : 
0
Iteration Number : 
0
Iteration Number : 
0
Iteration Number : 
0
Iteration Number : 
0
Iteration Number : 
0
[[ 806.   19.  466.    6.  200.   59.  262.   52.  140.  390.]
 [ 198.    7.  197.   76.  379.  238.  131.   31. 1049.   94.]
 [1455.   67.  118.   20.  111.  103.  102.   41.  366.   17.]
 [ 444.   49. 1342.   37.   92.   15.  114.  128.  154.   25.]
 [ 466.    6.  194.   69.  322.  177.  117.   14.  977.   58.]
 [ 911.   19.  351.    6.