# Unsupervised Learning: K-Means Clustering

In this notebook you will learn how to code a k-means clustering algorithm by hand. The example is taken from the scikit-learn documentation, available at: https://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_iris.html

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

In [None]:
# Fetch the iris dataset bundled with the sklearn 'datasets' module:
iris_dataset = datasets.load_iris()

# Show which fields the returned 'iris_dataset' object exposes:
print(iris_dataset.keys())

In [None]:
# Extract everything we need from the dataset up front.
# Features: keep only the first two columns of the data matrix.
X = iris_dataset['data'][:, [0, 1]]
feature_names = iris_dataset['feature_names'][:2]
# Ground-truth (GT) labels and their human-readable names.
y = iris_dataset['target']
label_names = iris_dataset['target_names']

# Summarise the features:
print('Data features shape: ', X.shape)
print('Data feature names: ', feature_names)

print('----------------------------------')

# Summarise the ground-truth labels:
print('Data GT labels shape: ', y.shape)
print('Data unique labels: ', np.unique(y))
print('Data label names: ', label_names)

In [None]:
# Plot the raw data next to the ground-truth (GT) clusters.

# Remap the GT label ids (0 -> 2, 1 -> 0, 2 -> 1) and convert them to floats
# so they can be used as colour values.
# NOTE: the builtin `float` is used instead of `np.float`; that alias was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
y_cl = np.choose(y, [2, 0, 1]).astype(float)

fig, (ax_raw, ax_gt) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: all samples in a single colour.
ax_raw.scatter(X[:, 0], X[:, 1], edgecolor='k')
ax_raw.set_title('Data without clustering')

# Right panel: samples coloured by their (remapped) GT label.
ax_gt.scatter(X[:, 0], X[:, 1], c=y_cl, edgecolor='k')
ax_gt.set_title('Data with GT clustering')

# Both panels share the same two feature axes.
for ax in (ax_raw, ax_gt):
    ax.set_xlabel(feature_names[0])
    ax.set_ylabel(feature_names[1])

plt.show()

In [None]:
# Fix the seed of NumPy's global random generator so the runs are repeatable:
np.random.seed(200)
# Quick check of the generator: draw five random integers from range(50):
print(np.random.choice(50, size=5))

In [None]:
# Number of clusters (centroids) to look for:
k = 3

# Build the initial centroids one feature at a time: for every feature column
# of X, draw k values at random from the values observed in that column, so
# each centroid coordinate lies within the data's range for that feature.
n_features = X.shape[1]
centroids = np.empty((k, n_features))
for dim, column in enumerate(X.T):
    centroids[:, dim] = np.random.choice(column, k)

print('centroids matrix:\n', centroids, '\ncentroids shape: ', centroids.shape)



In [None]:
# Run n iterations of k-means, refining the cluster centroids each time and
# plotting the current assignment after every iteration.
n = 10
for i in range(n):
    # Expand X to (samples, 1, features) and centroids to (1, k, features) so
    # that broadcasting yields every sample-to-centroid difference:
    X_dists = np.expand_dims(X, axis=1) - np.expand_dims(centroids, axis=0)
    # Euclidean distance from every sample to every centroid:
    X_dists_euc = np.sqrt(np.sum(X_dists**2, axis=-1))
    # Assign each sample to its nearest centroid:
    samples_assignments = np.argmin(X_dists_euc, axis=-1)
    # Move each centroid to the mean of the samples assigned to it:
    for ki in range(k):
        members = X[samples_assignments == ki, :]
        # A cluster can end up with no samples (e.g. two centroids start at
        # the same position). The mean of an empty selection is NaN, which
        # would stay NaN for all later iterations, so in that case the
        # centroid is left where it was.
        if len(members) > 0:
            centroids[ki, :] = members.mean(axis=0)

    # Plot data before and after clustering in two separate panels:
    fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(12, 5))

    ax_before.set_title('Data before clustering')
    ax_before.scatter(X[:, 0], X[:, 1])

    ax_after.set_title('Data after iteration ' + str(i + 1))
    # Pin the colour scale to the cluster ids 0..k-1 so samples and centroid
    # markers of the same cluster share a colour even if a cluster is empty.
    # (The builtin `float` replaces `np.float`, removed in NumPy 1.24.)
    n_centroids = centroids.shape[0]
    ax_after.scatter(X[:, 0], X[:, 1], c=samples_assignments.astype(float),
                     vmin=0, vmax=n_centroids - 1)
    ax_after.scatter(centroids[:, 0], centroids[:, 1],
                     c=np.arange(n_centroids).astype(float),
                     vmin=0, vmax=n_centroids - 1,
                     edgecolor='k', marker='X', s=100)

    for ax in (ax_before, ax_after):
        ax.set_xlabel(feature_names[0])
        ax.set_ylabel(feature_names[1])

    plt.show()
    # Release the figure so the loop does not accumulate open figures:
    plt.close(fig)

In this exercise you will write a function that takes a feature matrix (X), the number of clusters (k), and the number of iterations (n), and runs the K-means algorithm to cluster the data.

Edit the file 'Kmeans.py' and then run the following cell:

In [None]:
# Import the exercise function from the local 'Kmeans' module (the
# 'Kmeans.py' file edited for this exercise).
from Kmeans import run_kmeans

# Run the exercise implementation on the two-feature matrix X with 4 clusters
# and 5 iterations. This rebinds 'centroids', replacing the array produced by
# the manual loop earlier in the notebook.
# NOTE(review): the return value of run_kmeans is not visible here --
# presumably the final centroids; confirm against Kmeans.py.
centroids = run_kmeans(X, k=4, n=5)

K-means clustering using scikit-learn:

In [None]:
from sklearn.cluster import KMeans

# Create the k-means clustering estimator.
# - random_state makes the clustering reproducible regardless of how many
#   random numbers earlier cells have consumed.
# - n_init is set explicitly because its default differs between
#   scikit-learn versions.
kmeans_estimator = KMeans(n_clusters=3, n_init=10, random_state=200)

# Feed data into the estimator and read back the cluster id of each sample.
# The ids are converted to floats for use as colour values (the builtin
# `float` replaces `np.float`, which was removed in NumPy 1.24).
kmeans_estimator.fit(X)
labels = kmeans_estimator.labels_.astype(float)

# Plot data coloured by the labels from the estimator:
fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(X[:, 0], X[:, 1], c=labels, edgecolor='k')
ax.set_title('sklearn kmeans clustering')
ax.set_xlabel(feature_names[0])
ax.set_ylabel(feature_names[1])
plt.show()