In [3]:
from linearalgebra import Vector

In [4]:
def num_differences(v1: Vector, v2: Vector) -> int:
    '''count the positions at which two equal-length sequences hold different values

    Both arguments must have the same length (checked with an assertion).
    '''
    assert len(v1) == len(v2)
    mismatches = 0
    for left, right in zip(v1, v2):
        if left != right:
            mismatches += 1
    return mismatches

In [5]:
from typing import List
from linearalgebra import vector_mean

In [19]:
def cluster_means(k: int,
                  inputs: List[Vector],
                  assignments: List[int]) -> List[Vector]:
    '''return one representative vector per cluster, in cluster-index order

    k:           number of clusters; the result always has k entries
    inputs:      the data points
    assignments: assignments[i] is the cluster index of inputs[i]

    Entry j of the result is vector_mean(...) of the points assigned to
    cluster j (presumably their component-wise mean -- vector_mean lives in
    linearalgebra). If cluster j received no points, entry j is one of the
    input points picked with random.choice, so the object returned for that
    entry is an element of inputs itself, not a copy.

    Raises ValueError if inputs and assignments differ in length, and
    IndexError if an assignment is >= k. Negative assignments are not
    rejected; they index from the end of the cluster list.
    '''
    # zip() would silently stop at the shorter sequence and drop points,
    # so reject mismatched lengths up front.
    if len(inputs) != len(assignments):
        raise ValueError(
            f"inputs and assignments must have the same length "
            f"(got {len(inputs)} and {len(assignments)})"
        )

    # Bucket every point under the cluster index it is assigned to.
    clusters = [[] for _ in range(k)]
    for point, assignment in zip(inputs, assignments):
        clusters[assignment].append(point)

    # An empty cluster has no mean; fall back to a random input point so the
    # caller still gets exactly k vectors.
    return [vector_mean(cluster) if cluster else random.choice(inputs)
            for cluster in clusters]

In [20]:
import itertools
import random
import tqdm
from linearalgebra import squared_distance

In [21]:
class Kmeans:
    '''k-means clusterer: train() learns k cluster means, classify() maps a
    point to the index of its closest mean'''

    def __init__(self, k: int) -> None:
        self.k = k          # number of clusters
        self.means = None   # list of k mean vectors; None until train() runs

    def classify(self, input: Vector) -> int:
        '''return the index of the cluster closest to the input

        Closeness is measured with squared_distance against each entry of
        self.means, so train() must have been called first (self.means is
        None until then).
        '''
        return min(range(self.k), key=lambda i: squared_distance(input, self.means[i]))

    def train(self, inputs: List[Vector]) -> None:
        '''fit self.means to inputs

        Starts from a random assignment of points to clusters, then repeats
        "compute means, reassign every point to its closest mean" until a
        pass changes no assignment. There is no iteration cap. Progress is
        reported through a tqdm bar.
        '''
        # Random starting assignment: one cluster index per input point.
        assignments = [random.randrange(self.k) for _ in inputs]

        with tqdm.tqdm(itertools.count()) as t:
            for _ in t:
                # Means for the current assignments, then reassign each point.
                self.means = cluster_means(self.k, inputs, assignments)
                new_assignments = [self.classify(point) for point in inputs]

                # Converged: self.means already matches the final assignments.
                num_changed = num_differences(assignments, new_assignments)
                if num_changed == 0:
                    return

                # Keep the new assignments. The means are recomputed at the
                # top of the next pass, so recomputing them here as well would
                # only repeat the work (and draw extra random fallbacks for
                # empty clusters) for a value that is never read.
                assignments = new_assignments
                t.set_description(f"change: {num_changed} / {len(inputs)}")
                


In [22]:
# Twenty 2-D sample points, one [x, y] pair each, fed to the clusterer.
inputs: List[List[float]] = [
    [-14, -5], [13, 13], [20, 23], [-19, -11], [-9, -16],
    [21, 27], [-49, 15], [26, 13], [-46, 5], [-34, -1],
    [11, 15], [-49, 0], [-22, -16], [19, 28], [-12, -8],
    [-13, -19], [-41, 8], [-11, -6], [-25, -9], [-18, -3],
]

In [23]:
# Seed the RNG so the random starting assignment made by train() is repeatable.
random.seed(12)

clusterer = Kmeans(k=3)
clusterer.train(inputs)

# Copy the learned means and sort the copy, so each mean can be referred to
# by a stable index afterwards.
means = list(clusterer.means)
means.sort()

change: 5 / 20: : 1it [00:00, 334.34it/s]


In [25]:
# Squared distance between the first mean in sorted order and the reference
# point [-44, 5]; left as the last expression so the cell displays it.
first_mean = means[0]
squared_distance(first_mean, [-44, 5])

0.19999999999999857