In [2]:
import numpy as np

In [19]:
from sklearn.model_selection import train_test_split
from sklearn import datasets
import numpy as np
import time


def toy_dataset(total_clusters=2, sample_per_cluster=50):
    # TODO: add sample size ease
    np.random.seed(int(time.time()))
    N = total_clusters * sample_per_cluster
    y = np.zeros(N)
    np.random.seed(43)
    x = np.random.standard_normal(size=(N, 2))
    for i in range(total_clusters):
        theta = 2*np.pi*i / total_clusters
        x[i*sample_per_cluster:(i+1)*sample_per_cluster] = x[i*sample_per_cluster:(i+1)*sample_per_cluster] + \
            (total_clusters * np.cos(theta), total_clusters * np.sin(theta))
        y[i*sample_per_cluster:(i+1)*sample_per_cluster] = i

    
    return x, y


def load_digits():
    digits = datasets.load_digits()
    x = digits.data/16
    x = x.reshape([x.shape[0], -1])
    y = digits.target
    
    return train_test_split(x, y, random_state=42, test_size=0.25)

In [20]:
samples_per_cluster = 50
n_cluster = 9
x, y = toy_dataset(n_cluster, samples_per_cluster)

In [39]:
def get_k_means_plus_plus_center_indices(n, n_cluster, x, generator=np.random):
    first_center = x[generator.randint(0, n - 1)]
    centers = [first_center]
    for i in range(n_cluster - 1):
        distance = []
        for point in x:
            local_min = float('inf')
            for center in centers:
                local_min = min(np.sqrt(np.sum(np.power((point - center), 2))), local_min)
            distance.append(local_min)
        centers.append(x[np.argmax(distance)])
    return centers

In [43]:
a = get_k_means_plus_plus_center_indices(len(x), n_cluster, x)

In [53]:
a

[array([-8.60519634, -3.9717283 ]),
 array([9.51184262, 7.29385686]),
 array([ 6.74524742, -7.82711022]),
 array([-4.60302021,  9.29916152]),
 array([11.08905957, -1.01270925]),
 array([ -0.6433554 , -10.48227706]),
 array([-9.71095466,  3.62798132]),
 array([2.51166286, 9.62309962]),
 array([4.78520657, 4.26597288])]

In [69]:
b = x[np.random.choice(500, 5, replace=False)]
b = np.expand_dims(b, axis=1)
b

array([[[-9.0373166 ,  4.0837211 ]],

       [[ 0.39073705,  7.10299333]],

       [[ 8.9098984 ,  4.75364035]],

       [[ 7.02423461,  5.71985062]],

       [[ 9.39490664,  0.83720502]]])

In [70]:
c = a -b

In [74]:
c = np.power(c, 2)

In [75]:
d = np.sum(c, axis=2)

In [76]:
d

array([[ 65.07699297, 354.37627941, 390.95722978,  46.86380324,
        431.04462031, 282.62688687,   0.66148697, 164.06364112,
        191.09536261],
       [203.57627673,  83.2309956 , 263.28779383,  29.76076632,
        180.31873311, 310.31108163, 114.11988386,  10.84926203,
         27.36004734],
       [382.91060201,   6.81503674, 162.96099882, 203.2607321 ,
         37.99953121, 323.39783763, 348.00327701,  64.64905161,
         17.25090228],
       [338.2058139 ,   8.66568923, 183.59798177, 148.00452125,
         61.8501644 , 321.3008779 , 284.44247715,  35.5986566 ,
          7.12700705],
       [347.12954685,  41.70202699,  82.09105255, 267.5466637 ,
          6.292337  , 228.89737932, 372.82236819, 124.57098877,
         33.00578373]])

In [77]:
np.argmin(d, axis=0)

array([0, 2, 4, 1, 4, 4, 0, 1, 3], dtype=int64)

In [79]:
def get_lloyd_k_means(n, n_cluster, x, generator):
    return generator.choice(n, size=n_cluster)

In [80]:
class KMeans():

    '''
        Class KMeans:
        Attr:
            n_cluster - Number of cluster for kmeans clustering (Int)
            max_iter - maximum updates for kmeans clustering (Int)
            e - error tolerance (Float)
            generator - random number generator from 0 to n for choosing the first cluster at random
            The default is np.random here but in grading, to calculate deterministic results,
            We will be using our own random number generator.
    '''
    def __init__(self, n_cluster, max_iter=100, e=0.0001, generator=np.random):
        self.n_cluster = n_cluster
        self.max_iter = max_iter
        self.e = e
        self.generator = generator

    def fit(self, x, centroid_func=get_lloyd_k_means):

        '''
            Finds n_cluster in the data x
            params:
                x - N X D numpy array
                centroid_func - To specify which algorithm we are using to compute the centers(Lloyd(regular) or Kmeans++)
            returns:
                A tuple
                (centroids a n_cluster X D numpy array, y a length (N,) numpy array where cell i is the ith sample's assigned cluster, number_of_updates a Int)
            Note: Number of iterations is the number of time you update the assignment
        '''
        assert len(x.shape) == 2, "fit function takes 2-D numpy arrays as input"
        
        N, D = x.shape

        self.centers = centroid_func(len(x), self.n_cluster, x, self.generator)

        # TODO:
        # - comment/remove the exception.
        # - Initialize means by picking self.n_cluster from N data points
        # - Update means and membership until convergence or until you have made self.max_iter updates.
        # - return (means, membership, number_of_updates)

        # DONOT CHANGE CODE ABOVE THIS LINE
            
        r = np.zeros(N, dtype=int)
        J = np.inf
        
        for i in range(max_iter):
            l2 = np.sum( np.power((x-np.expand_dims(self.centers, axis=1)), 2),  axis=2)
            r = np.argmin(l2, axis=0)
            # calculate distortion
            J_new = np.sum([np.sum((x[r == k] - self.centers[k]) ** 2) for k in range(self.n_cluster)]) / N
            
            if np.abs(J - J_new) < self.e:
                break
            J = J_new
            # update mean
            self.centers = [np.mean(x[r == k], axis=0) for k in range(self.n_cluster)]        
        
        
        centroids = self.centers
        y = [  x[r == k] for k in range(self.n_cluster)]
        self.max_iter = i
        
        # DO NOT CHANGE CODE BELOW THIS LINE
        return centroids, y, self.max_iter




    

In [None]:
class KMeansClassifier():

    '''
        Class KMeansClassifier:
        Attr:
            n_cluster - Number of cluster for kmeans clustering (Int)
            max_iter - maximum updates for kmeans clustering (Int)
            e - error tolerance (Float)
            generator - random number generator from 0 to n for choosing the first cluster at random
            The default is np.random here but in grading, to calculate deterministic results,
            We will be using our own random number generator.
    '''

    def __init__(self, n_cluster, max_iter=100, e=1e-6, generator=np.random):
        self.n_cluster = n_cluster
        self.max_iter = max_iter
        self.e = e
        self.generator = generator


    def fit(self, x, y, centroid_func=get_lloyd_k_means):
        '''
            Train the classifier
            params:
                x - N X D size  numpy array
                y - (N,) size numpy array of labels
                centroid_func - To specify which algorithm we are using to compute the centers(Lloyd(regular) or Kmeans++)

            returns:
                None
            Stores following attributes:
                self.centroids : centroids obtained by kmeans clustering (n_cluster X D numpy array)
                self.centroid_labels : labels of each centroid obtained by
                    majority voting (N,) numpy array)
        '''

        assert len(x.shape) == 2, "x should be a 2-D numpy array"
        assert len(y.shape) == 1, "y should be a 1-D numpy array"
        assert y.shape[0] == x.shape[0], "y and x should have same rows"

        self.generator.seed(42)
        N, D = x.shape
        # TODO:
        # - comment/remove the exception.
        # - Implement the classifier
        # - assign means to centroids
        # - assign labels to centroid_labels

        # DONOT CHANGE CODE ABOVE THIS LINE
        raise Exception(
             'Implement fit function in KMeansClassifier class')

        

        
        # DONOT CHANGE CODE BELOW THIS LINE

        self.centroid_labels = centroid_labels
        self.centroids = centroids

        assert self.centroid_labels.shape == (
            self.n_cluster,), 'centroid_labels should be a numpy array of shape ({},)'.format(self.n_cluster)

        assert self.centroids.shape == (
            self.n_cluster, D), 'centroid should be a numpy array of shape {} X {}'.format(self.n_cluster, D)

    def predict(self, x):
        '''
            Predict function
            params:
                x - N X D size  numpy array
            returns:
                predicted labels - numpy array of size (N,)
        '''

        assert len(x.shape) == 2, "x should be a 2-D numpy array"

        self.generator.seed(42)
        N, D = x.shape
        # TODO:
        # - comment/remove the exception.
        # - Implement the prediction algorithm
        # - return labels

        # DONOT CHANGE CODE ABOVE THIS LINE
        raise Exception(
             'Implement predict function in KMeansClassifier class')
        
        
        # DO NOT CHANGE CODE BELOW THIS LINE
        return np.array(labels)

In [87]:
import numpy as np
from data_loader import toy_dataset, load_digits
from kmeans import KMeans, KMeansClassifier, get_k_means_plus_plus_center_indices as k_plus, get_lloyd_k_means as k_vanilla, transform_image
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from utils import Figure
from sklearn.metrics import mean_squared_error

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

In [85]:
def kmeans_toy():
    print("[+] K-Means on Toy Dataset")

    print("[+] K-Means Vanilla")
    kmeans_builder(k_vanilla)
    print()

    print("[+] K-Means Plus Plus")
    kmeans_builder(k_plus)
    print()


def kmeans_builder(centroid_func):
    samples_per_cluster = 50
    n_cluster = 9

    x, y = toy_dataset(n_cluster, samples_per_cluster)
    fig = Figure()
    fig.ax.scatter(x[:, 0], x[:, 1], c=y)
    fig.savefig('plots/toy_dataset_real_labels.png')

    fig.ax.scatter(x[:, 0], x[:, 1])
    fig.savefig('plots/toy_dataset.png')

    k_means = KMeans(n_cluster=n_cluster, max_iter=100, e=1e-8)

    centroids, membership, i = k_means.fit(x, centroid_func)



    assert centroids.shape == (n_cluster, 2), \
        ('centroids for toy dataset should be numpy array of size {} X 2'
            .format(n_cluster))

    assert membership.shape == (samples_per_cluster * n_cluster,), \
        'membership for toy dataset should be a vector of size {}'.format(len(membership))

    assert type(i) == int and i > 0,  \
        'Number of updates for toy datasets should be integer and positive'

    print('[success] : kmeans clustering done on toy dataset')
    print('Toy dataset K means clustering converged in {} steps'.format(i))

    fig = Figure()
    fig.ax.scatter(x[:, 0], x[:, 1], c=membership)
    fig.ax.scatter(centroids[:, 0], centroids[:, 1], c='red')
    fig.savefig('plots/toy_dataset_predicted_labels.png')



################################################################################
# KMeans for image compression
# Here we use k-means for compressing an image
# We load an image 'baboon.tiff',  scale it to [0,1] and compress it.
# The problem can be rephrased as --- "each pixel is a 3-D data point (RGB) and we want to map each point to N points or N clusters.
################################################################################


def kmeans_image_compression():

    print("[+] K-Means Image Compression")
    im = plt.imread('baboon.tiff')
    N, M = im.shape[:2]
    im = im / 255

    # convert to RGB array
    data = im.reshape(N * M, 3)

    k_means = KMeans(n_cluster=16, max_iter=100, e=1e-6)
    centroids, _, i = k_means.fit(data)

    print('[+] RGB centroids computed in {} iteration'.format(i))
    new_im = transform_image(im, centroids)

    assert new_im.shape == im.shape, \
        'Shape of transformed image should be same as image'

    mse = np.sum((im - new_im)**2) / (N * M)
    print('[+] Mean square error per pixel is {}\n'.format(mse))
    plt.imsave('plots/compressed_baboon.png', new_im)




################################################################################
# Kmeans for classification
# Here we use k-means for classifying digits
# We find N clusters in the data and label each cluster with the maximal class that belongs to that cluster.
# Test samples are labeled based on which cluster they belong to
################################################################################


def kmeans_classification():
    print("[+] K-Means Classification")

    x_train, x_test, y_train, y_test = load_digits()

    print("[+] K-Means Vanilla")
    kmeans_classification_builder(k_vanilla, x_train, x_test, y_train, y_test)
    print()

    print("[+] K-Means Plus Plus")
    kmeans_classification_builder(k_plus, x_train, x_test, y_train, y_test)

    linear_classifier = LogisticRegression()
    linear_classifier.fit(x_train, y_train)
    y_hat_test = linear_classifier.predict(x_test)
    print('[*] Accuracy of logistic regression classifier is {}'
          .format(np.mean(y_hat_test == y_test)))

    KNNClassifier = KNeighborsClassifier()
    KNNClassifier.fit(x_train, y_train)
    y_hat_test = KNNClassifier.predict(x_test)
    print('[*] Accuracy of Nearest Neighbour classifier is {}'
          .format(np.mean(y_hat_test == y_test)))


def kmeans_classification_builder(centroid_func, x_train, x_test, y_train, y_test):

    # plot some train data
    N = 25
    l = int(np.ceil(np.sqrt(N)))

    im = np.zeros((10 * l, 10 * l))
    for m in range(l):
        for n in range(l):
            if (m * l + n < N):
                im[10 * m:10 * m + 8, 10 * n:10 * n +
                    8] = x_train[m * l + n].reshape([8, 8])
    plt.imsave('plots/digits.png', im, cmap='Greys')

    n_cluster = 10
    classifier = KMeansClassifier(n_cluster=n_cluster, max_iter=100, e=1e-6)

    classifier.fit(x_train, y_train, centroid_func)
    y_hat_test = classifier.predict(x_test)

    assert y_hat_test.shape == y_test.shape, \
        'y_hat_test and y_test should have same shape'

    print('[*] Prediction accuracy of K-means classifier with {} cluster is {}'.
          format(n_cluster, np.mean(y_hat_test == y_test)))

In [86]:
kmeans_toy()
kmeans_image_compression()
kmeans_classification()

[+] K-Means on Toy Dataset
[+] K-Means Vanilla


Exception: Implement fit function in KMeans class