In [37]:
%matplotlib inline
import random
from base64 import b64decode
from json import loads
import numpy as np
import matplotlib.pyplot as plt

In [38]:
def read_in_data(json_line):
    json_object = loads(json_line)
    array = np.fromstring(b64decode(json_object["data"]),  dtype=np.ubyte)
    return (json_object["label"], array)

In [56]:
with open("digits.base64.json", "r") as f:
    digits = list(map(read_in_data, f.readlines()))

# pick a ratio for splitting the digits list
# into a training and a validation set.
training_size = int(len(digits)*0.25)
validation = digits[:training_size]
training = digits[training_size:]

  This is separate from the ipykernel package so we can avoid doing imports until


60000


In [40]:
def display_digit(digit, labeled = True, title = ""):
    if labeled:
        digit = digit[1]
    image = digit
    plt.figure()
    fig = plt.imshow(image.reshape(28, 28))
    fig.set_cmap('gray_r')
    fig.axes.get_xaxis().set_visiable(False)
    fig.axes.get_yaxis().set_visiable(False)
    if title != "":
        plt.title("Inferred label: " + str(title))

In [76]:
def init_centroids(labelled_data, k):
    return map(lambda x: x[1], random.sample(labelled_data, k))

def sum_cluster(labelled_cluster):
    sum_ = labelled_cluster[0][1].copy()
    for (label, vector) in labelled_cluster[1:]:
        sum_ += vector
    return sum_

def mean_cluster(labelled_cluster):
    sum_of_points = sum_cluster(labelled_cluster)
    mean_of_points = sum_of_points * (1.0 / len(labelled_cluster))
    return mean_of_points

In [88]:
def from_clusters(labelled_data, unlabelled_centroids):
    centroids_indices = rande(len(unlabelled_centroids))
    clusters = {c: [] for c in centroids_indices}
    
    for (label, Xi) in labelled_data:
        smallest_distance = float("inf")
        for cj_index in centroids_indices:
            cj = unlabelled_centroids[cj_index]
            distance = np.linalg.norm(Xi - cj)
            if distance < smallest_distance:
                closest_centroid_index = cj_index
                smallest_distance = distance
        clusters[closest_centroid_index].append((label, Xi))
    return clusters.values()

def move_centroids(labelled_data):
    return [mean(cluster for cluster in labelled_data]

In [89]:
def repeat_until_convergence(labelled_data, labelled_clusters, unlabelled_centroids):
    previous_max_difference = 0
    while True:
        unlabelled_old_centroids = unlabelled_centroids
        unlabelled_centroids = move_centroids(labelled_clusters)
        labelled_clusters = from_clusters(labelled_data, unlabelled_centroids)
        differences = map(lambda a, b: np.linalg.norm(a-b), unlabelled_old_centroids, unlabelled_centroids)

        max_difference = max(list(differences))
        difference_change = abs((max_difference-previous_max_difference)/np.mean([previous_max_difference, max_difference])) * 100
        previous_max_difference = max_difference
        if np.isnan(difference_change):
            break
    return labelled_clusters, unlabelled_centroids

In [90]:
def cluster(labelled_data, k):
    centroids = init_centroids(labelled_data, k)
    print(len(list(centroids)))

    clusters = from_clusters(labelled_data, list(centroids))
    final_clusters, final_centroids = repeat_until_convergence(labelled_data, clusters, centroids)
    return final_clusters, final_centroids

In [91]:
def assign_labels_to_centroids(clusters, centroids):
    labelled_centroids = []
    for i in range(len(clusters)):
        labels = map(lambda x: x[0], clusters[i])
        most_common = max(set(labels), key=labels.count)
        centroid = (most_common, centroids[i])
        labelled_centroids.append(centroid)
    return labelled_centroids

In [92]:
def classify_digit(digit, labelled_centroids):
    mindistance = float("inf")
    for (label, centroid) in labelled_centroids:
        distance = np.linalg.norm(centroid - digit)
        if distance < mindstance:
            mindstance = distance
            closest_centroid_label = label
    return closest_centroid_label

def get_error_rate(labelled_digits, labelled_centroids):
    classified_incorrect = 0
    for (label, digit) in labelled_digits:
        classified_label = classify_digit(digit, labelled_centroids)
        if classified_label != label:
            classified_incorrect += 1
    error_rate = classified_incorrect / float(len(digits))
    return error_rate

In [93]:
k = 16
clusters, centroids = cluster(training, k)
labelled_centroids = assign_labels_to_centroids(clusters, centroids)

for (label, digit) in labelled_centroids:
    display_digit(digit, labeled=False, title=label)

16


KeyError: 1