## K-Means Clustering

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from utils import *
from public_tests import *

The `K-means` algorithm is a method for automatically grouping similar
data points together into a few cohesive `clusters`.

In [None]:
def find_closest_centroids(x, centroids):
  """Assign every sample in ``x`` to its nearest centroid.

  Args:
    x: Array with one sample per row.
    centroids: Array with one centroid per row.

  Returns:
    1-D integer array with one entry per row of ``x``. Each entry is the
    row index of the centroid at the smallest Euclidean distance; ties go
    to the lowest centroid index because ``np.argmin`` reports the first
    minimum it finds.
  """
  num_samples = x.shape[0]
  num_centroids = centroids.shape[0]
  assignments = np.zeros(num_samples, dtype=int)

  for row, sample in enumerate(x):
    # Euclidean distance from this sample to each centroid, in centroid order.
    gaps = [
      np.linalg.norm(sample - centroids[c])
      for c in range(num_centroids)
    ]
    assignments[row] = np.argmin(gaps)

  return assignments

In [None]:
# Load the training samples. load_data is not defined in this notebook;
# presumably it arrives through the star import of utils — verify.
x_train=load_data()

In [None]:
# Preview the first five entries of the training data.
print(f"First 5 elements of x_train:\n{x_train[:5]}")

In [None]:
# Show the dimensions of the training array.
print(x_train.shape)

In [None]:
# Three hand-picked starting centroids, one two-coordinate point per row.
initial_centroids = np.array([[3, 3], [6, 2], [8, 5]])

In [None]:
# Assign each training sample to the nearest of the hand-picked centroids.
idx=find_closest_centroids(x_train, initial_centroids)

In [None]:
# Show the centroid indices chosen for the first three samples.
print(f"First 3 elements of idx: {idx[:3]}")

In [None]:
# Run the provided checks against find_closest_centroids. The test function
# is not defined in this notebook; presumably it comes from public_tests.
find_closest_centroids_test(find_closest_centroids)

In [None]:
def compute_centroids(x, idx, K):
  """Return the mean of the samples assigned to each of the K centroids.

  Args:
    x: 2-D array of shape (m, n) with one sample per row.
    idx: Length-m sequence; ``idx[i]`` is the centroid index assigned to
      ``x[i]``.
    K: Number of centroids.

  Returns:
    Float array of shape (K, n). Row ``k`` is the mean of the rows of ``x``
    whose assignment equals ``k``. A centroid with no assigned samples
    keeps an all-zero row.
  """
  _, n = x.shape
  idx = np.asarray(idx)
  computed_centroids = np.zeros((K, n))

  for k in range(K):
    points = x[idx == k]
    # Averaging an empty selection yields NaN (plus a RuntimeWarning), and
    # a NaN centroid corrupts every later distance computation. Skip empty
    # clusters so their row stays at the finite initial value.
    if points.shape[0] == 0:
      continue
    computed_centroids[k] = np.mean(points, axis=0)

  return computed_centroids

In [None]:
# One centroid per row of initial_centroids.
K=initial_centroids.shape[0]
# Move each centroid to the mean of the samples currently assigned to it.
centroids=compute_centroids(x_train, idx, K)

print(f"Recomputed centroids are:\n{centroids}")

In [None]:
# Run the provided checks against compute_centroids. The test function is
# not defined in this notebook; presumably it comes from public_tests.
compute_centroids_test(compute_centroids)

### K-Means on a Sample Dataset

In [None]:
def run_k_means(x, initial_centroids, max_iters=10, plot_progress=False):
  """Run K-means for a fixed number of iterations.

  Each iteration assigns every sample to its nearest centroid and then
  moves each centroid to the mean of its assigned samples.

  Args:
    x: 2-D array with one sample per row.
    initial_centroids: 2-D array with one starting centroid per row; the
      number of rows sets the number of clusters.
    max_iters: Number of assign/update iterations to run.
    plot_progress: When true, call ``plot_progress_kMeans`` once per
      iteration with the current and previous centroids.

  Returns:
    Tuple ``(centroids, idx)``: the final centroid positions and the
    integer assignment of each sample computed in the last iteration. The
    assignments are all zeros when ``max_iters`` is 0.
  """
  m = x.shape[0]
  K = initial_centroids.shape[0]
  centroids = initial_centroids
  previous_centroids = centroids
  # Integer placeholder so the returned dtype matches real assignments even
  # when the loop body never runs.
  idx = np.zeros(m, dtype=int)

  for i in range(max_iters):
    print(f"K-Means iteration: {i}/{max_iters-1}")

    idx = find_closest_centroids(x, centroids)

    if plot_progress:
      plot_progress_kMeans(x, centroids, previous_centroids, idx, K, i)
      previous_centroids = centroids

    updated_centroids = compute_centroids(x, idx, K)

    # A centroid that attracted no samples has no meaningful mean. Keep it
    # where it was, so an empty cluster cannot inject NaN (or an arbitrary
    # position) into the following iterations.
    empty = np.bincount(idx, minlength=K) == 0
    if empty.any():
      updated_centroids[empty] = centroids[empty]

    centroids = updated_centroids

  plt.show()
  return centroids, idx

In [None]:
# Number of assign/update iterations to run on the sample dataset.
max_iters=10

# Run K-means from the hand-picked centroids, plotting every iteration.
centroids, idx=run_k_means(x_train, initial_centroids, max_iters, plot_progress=True)

### Random Initialization

In [None]:
def k_means_init_centroids(x, K):
  """Pick starting centroids by sampling rows of ``x`` at random.

  Args:
    x: Array with one sample per row.
    K: Number of centroids wanted.

  Returns:
    The rows of ``x`` at the first ``K`` positions of a random permutation
    of its row indices, so no row index is selected twice. Fewer than ``K``
    rows come back when ``x`` has fewer than ``K`` rows.
  """
  num_samples = x.shape[0]
  shuffled_rows = np.random.permutation(num_samples)
  chosen_rows = shuffled_rows[:K]
  return x[chosen_rows]

In [None]:
# Draw three starting centroids at random from the training samples.
K=3
init_centroids=k_means_init_centroids(x_train, K)

print(init_centroids)

### Image Compression with K-Means

In [None]:
# Read the image from disk and display it as loaded.
img=plt.imread("data/bird_small.png")
plt.imshow(img)

In [None]:
# Show the dimensions of the loaded image array.
print(img.shape)

In [None]:
# NOTE(review): Matplotlib documents that imread returns PNG data as floats
# already in the range 0-1, so this division shrinks the values to roughly
# 0-1/255 instead of normalising them. The side-by-side plotting cell
# multiplies by 255 again before display, so the two must be changed
# together. Re-running this cell divides img again — confirm whether the
# division is wanted at all.
img=img/255

# Flatten the image into one row per pixel. The hard-coded 3 assumes exactly
# three colour channels (no alpha channel) — verify against the input file.
x_img=np.reshape(img, (img.shape[0]*img.shape[1], 3))

In [None]:
# Reduce the image to 16 colours: K-means is run on the pixel rows with
# K=16, giving 16 centroid colours.

K=16
max_iters=10

# Start from K pixels sampled at random from the image.
initial_centroids=k_means_init_centroids(x_img, K)

# NOTE(review): plot_progress=True hands three-channel pixel data to
# plot_progress_kMeans on every iteration; that helper is not defined in
# this notebook, so confirm it is meant for data of this shape.
centroids, idx=run_k_means(x_img, initial_centroids, max_iters, plot_progress=True)

In [None]:
# Show the dimensions of the per-pixel assignment array.
print(idx.shape)

### Compressing the Image

In [None]:
# Represent Image in terms of indices
x_recovered=centroids[idx, :]
x_recovered=np.reshape(x_recovered, img.shape)

In [None]:
# Show the original and the compressed image side by side.
fig, ax=plt.subplots(1, 2, figsize=(8, 8))
# Acts on pyplot's current axes only; each panel is also switched off
# explicitly below.
plt.axis('off')

# Multiplying by 255 undoes the earlier img/255 scaling before display.
ax[0].imshow(img*255)
ax[0].set_title("Original Image")
ax[0].set_axis_off()

# The centroids were fitted to the scaled pixels, so the reconstruction is
# scaled back the same way.
ax[1].imshow(x_recovered*255)
ax[1].set_title(f"Compressed with {K} colours")
ax[1].set_axis_off()