In [1]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import jaccard_score

In [99]:
### First need to load in an image and convert it to three column matrix.
# Each row is a pixel, each column is one of either R, G, and B.

image_filepath = 'homework2_data_code/bruins_banner.bmp'
image = plt.imread(image_filepath)

rows = image.shape[0]
cols = image.shape[1]

# input matrix, pixels:
pixels = np.zeros((rows*cols, 3))
for i in range(rows):
    for j in range(cols):
        pixels[j*rows+i,:] = image[i,j,:]

pixels.shape

(102400, 3)

In [100]:
### Model for how the function should work:
def ex_kmeans(image_data, K):
    kmeans = KMeans(n_clusters=K).fit(image_data)
    label = kmeans.labels_
    centroid = kmeans.cluster_centers_
    return label, centroid

ex_kmeans(pixels, K = 2)

(array([1, 1, 1, ..., 1, 1, 1], dtype=int32),
 array([[123.29370091, 101.22458851,  64.25117185],
        [250.51669391, 247.16776523, 241.97769356]]))

In [87]:
### First create a euclidean distance function
def euclidean(vec1, vec2):
    return np.linalg.norm(vec1-vec2)

### K-means function
def my_kmeans(image_data, K, max_iter = 300):
    
    print('Max Iterations: {}'.format(max_iter))
    
    ### initialize initial assignment
    labels_prev = np.random.randint(0, K, image_data.shape[0])
    centroids = np.array([np.mean(image_data[np.equal(labels_prev, i)], axis=0) for i in range(K)])
    
    
    ### initialize large difference
    difference = 0
    
    ### initialize iteration
    iteration = 1
    
    ### Repeat algorithm until convergence (when jaccard similarity = 1)
    while difference < 1 or iteration == max_iter:
        
        print('Iteration {}'.format(iteration))
        
        # assign each point to the cluster with the nearest centroid
        distances = np.zeros((image_data.shape[0], K))
        for i in range(image_data.shape[0]):
            for j in range(K):
                distances[i, j] = euclidean(centroids[j], image_data[i, :])
                
        # assign each pixel to closer centroid
        labels_new = np.array([np.argmin(centroid) for centroid in distances])
        
        # Calculate new cluster centers
        centroids = np.array([np.mean(image_data[np.equal(labels_new, i)], axis=0) for i in range(K)])
        
        # calculate difference between old cluster centers and new cluster centers
        if K == 2:
            difference = jaccard_score(labels_prev, labels_new)
        else:
            difference = jaccard_score(labels_prev, labels_new, average='macro')
        
        print('Current Jaccard Similarity: {}'.format(difference))
        
        # pass on labels (if we terminate, these will be equal in the end)
        labels_prev = labels_new
        
        # increment iteration
        iteration += 1
    
    ### Check empty clusters and reduce K if they are present
    clusters = np.array([image_data[np.where(labels_new == k)] for k in range(K)])
    empty_clusters = np.array([np.linalg.norm(cluster) == 0 for cluster in clusters])
    if np.sum(empty_clusters > 0):
        K-=1
        print('Found empty clusters. Reducing K to {}'.format(K))
        labels_new, centroids = my_kmeans(image_data, K)
    
    ### return final labels and centroids
    return labels_new, centroids

In [101]:
test = my_kmeans(pixels, K = 2)

Max Iterations: 300
Iteration 1
Current Jaccard Similarity: 0.32347676191739044
Iteration 2
Current Jaccard Similarity: 0.986219662690251
Iteration 3
Current Jaccard Similarity: 0.9934098018769552
Iteration 4
Current Jaccard Similarity: 0.9943947600453458
Iteration 5
Current Jaccard Similarity: 0.997107691008508
Iteration 6
Current Jaccard Similarity: 0.9978615286893924
Iteration 7
Current Jaccard Similarity: 0.99876933522884
Iteration 8
Current Jaccard Similarity: 0.9987678188268786
Iteration 9
Current Jaccard Similarity: 0.9991279007933975
Iteration 10
Current Jaccard Similarity: 0.9995742144256152
Iteration 11
Current Jaccard Similarity: 0.9998509115692622
Iteration 12
Current Jaccard Similarity: 0.9999573969538822
Iteration 13
Current Jaccard Similarity: 1.0


In [102]:
test

(array([0, 0, 0, ..., 0, 0, 0]),
 array([[250.55843266, 247.41015562, 242.52557838],
        [123.84604733, 101.6284217 ,  64.44441131]]))

In [34]:
K = 2

labels = np.random.randint(0, 2, pixels.shape[0])
means = np.array([np.mean(pixels[np.equal(labels, i)], axis=0) for i in range(2)])

# datapoints = pixels[np.where(labels == 0)]

# distances = np.zeros(datapoints.shape[0])

# for i, datapoint in enumerate(datapoints):
#     distances[i] = euclidean(datapoints[i], means[0])
        
# datapoints[np.argmin(distances)]
print(labels)
print(means)

[1 0 0 ... 0 1 1]
[[126.69878714 124.16724889 114.33400322]
 [126.49670064 124.0719925  114.16199289]]


In [35]:

centroids = np.zeros((K, 3))
for k in range(K):
    distances = np.array([euclidean(datapoint, means[k]) for datapoint in pixels])
    centroids[k] = pixels[np.argmin(distances)]

centroids

array([[127., 124., 115.],
       [127., 124., 115.]])

In [94]:
### this looks good; now tackle k-medoids
# should essentially be the same code except cluster
# centers are determined by median point rather than
# the average.

### Solving true median is computationally expensive
### -Instead i'll find the geometric mean, and select the closest datapoint

def my_kmedoids(image_data, K, max_iter = 300):
    print('Max Iterations: {}'.format(max_iter))
    
    ### initialize initial assignment
    labels_prev = np.random.randint(0, K, image_data.shape[0])
    geom_means = np.array([np.mean(image_data[np.equal(labels_prev, i)], axis=0) for i in range(K)])
    
    # find datapoints nearest to each centroid
    centroids = np.zeros((K, 3))
    for k in range(K):
        datapoints = image_data[np.where(labels_prev == k)]
        distances = np.array([euclidean(datapoint, geom_means[k]) for datapoint in image_data])
        centroids[k] = image_data[np.argmin(distances)]
    
    ### initialize large difference
    difference = 0
    
    ### initialize iteration
    iteration = 1
    
    ### Repeat algorithm until convergence (when jaccard similarity = 1)
    while difference < 1 or iteration == max_iter:
        
        print('Iteration {}'.format(iteration))
        
        # assign each point to the cluster with the nearest centroid
        distances = np.zeros((image_data.shape[0], K))
        for i in range(image_data.shape[0]):
            for j in range(K):
                distances[i, j] = euclidean(centroids[j], image_data[i, :])
                
        # assign each pixel to closer centroid
        labels_new = np.array([np.argmin(centroid) for centroid in distances])
        
        # Calculate new cluster centers
        # I will select the datapoint that is nearest to the geometric mean
        geom_means = np.array([np.mean(image_data[np.equal(labels_new, i)], axis=0) for i in range(K)])
        
        centroids = np.zeros((K, 3))
        for k in range(K):
            datapoints = image_data[np.where(labels_new == k)]
            distances = np.array([euclidean(datapoint, geom_means[k]) for datapoint in image_data])
            centroids[k] = image_data[np.argmin(distances)]
        
        # calculate difference between old cluster centers and new cluster centers
        if K == 2:
            difference = jaccard_score(labels_prev, labels_new)
        else:
            difference = jaccard_score(labels_prev, labels_new, average='macro')
        
        print('Current Jaccard Similarity: {}'.format(difference))
        
        # pass on labels (if we terminate, these will be equal in the end)
        labels_prev = labels_new
        
        # increment iteration
        iteration += 1
        
    ### Check empty clusters and reduce K if they are present
    clusters = np.array([image_data[np.where(labels_new == k)] for k in range(K)])
    empty_clusters = np.array([np.linalg.norm(cluster) == 0 for cluster in clusters])
    if np.sum(empty_clusters > 0):
        K-=1
        print('Found empty clusters. Reducing K to {}'.format(K))
        labels_new, centroids = my_kmedoids(image_data, K)
    
    ### return final labels and centroids
    return labels_new, centroids

In [103]:
my_kmedoids(pixels, K = 5)

Max Iterations: 300
Iteration 1


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)


Current Jaccard Similarity: 0.06703929503241896
Iteration 2
Current Jaccard Similarity: 0.2851952229502949
Iteration 3
Current Jaccard Similarity: 0.5068841476236638
Iteration 4
Current Jaccard Similarity: 0.5678993572012789
Iteration 5
Current Jaccard Similarity: 0.7485968132182885
Iteration 6
Current Jaccard Similarity: 0.8555628146814469
Iteration 7
Current Jaccard Similarity: 0.8895634151293885
Iteration 8
Current Jaccard Similarity: 0.8962241385926714
Iteration 9
Current Jaccard Similarity: 0.9163645539888903
Iteration 10
Current Jaccard Similarity: 0.918978697455481
Iteration 11
Current Jaccard Similarity: 0.8996334951103624
Iteration 12
Current Jaccard Similarity: 0.8965260535434018
Iteration 13
Current Jaccard Similarity: 0.8999576654465418
Iteration 14
Current Jaccard Similarity: 0.905783449982423
Iteration 15
Current Jaccard Similarity: 0.9162367090254943
Iteration 16
Current Jaccard Similarity: 0.9173803725101097
Iteration 17
Current Jaccard Similarity: 0.9117731367161086
It

(array([1, 1, 1, ..., 1, 1, 1]), array([[ 32.,  30.,  31.],
        [253., 253., 253.],
        [ 82.,  78.,  79.],
        [213., 198., 177.],
        [229., 174.,  74.]]))