In [None]:
import random
import math
from typing import List, Tuple

# A point is a list of float coordinates, one entry per dimension.
Point = List[float]

def euclidean(a: Point, b: Point) -> float:
    """Return the straight-line (L2) distance between points ``a`` and ``b``.

    Coordinates are paired with ``zip``, so when the two points have
    different lengths the extra trailing coordinates of the longer one are
    ignored rather than reported as an error.
    """
    squared_diffs = [(u - v) ** 2 for u, v in zip(a, b)]
    return math.sqrt(sum(squared_diffs))

def centroid(points: List[Point]) -> Point:
    """Return the coordinate-wise mean of ``points``.

    The dimensionality is taken from the first point, so an empty
    ``points`` list raises ``IndexError``.
    """
    count = len(points)
    mean: Point = []
    for axis in range(len(points[0])):
        column = [pt[axis] for pt in points]
        mean.append(sum(column) / count)
    return mean

def kmeans_pp_init(data: List[Point], k: int) -> List[Point]:
    """Pick ``k`` initial cluster centers from ``data`` with k-means++ seeding.

    The first center is drawn uniformly at random. Each further center is
    drawn with probability proportional to the squared distance between a
    point and its nearest already-chosen center, which spreads the initial
    centers out across the data.

    Args:
        data: Candidate points. Centers are elements of this list (the same
            objects, not copies).
        k: Number of centers to pick. May exceed the number of distinct
            points, in which case duplicates are returned.

    Returns:
        A list of exactly ``k`` points taken from ``data``.

    Raises:
        ValueError: If ``k`` is smaller than 1.
        IndexError: If ``data`` is empty (raised by ``random.choice``).
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    def sq_dist(p: Point, q: Point) -> float:
        # Squared distance directly: no square root that is squared again.
        return sum((x - y) ** 2 for x, y in zip(p, q))

    first = random.choice(data)
    centers = [first]
    # nearest_d2[i] is the squared distance from data[i] to its closest
    # center so far; it is updated incrementally as centers are added.
    nearest_d2 = [sq_dist(p, first) for p in data]

    while len(centers) < k:
        threshold = random.random() * sum(nearest_d2)

        chosen = None
        acc = 0.0
        for p, w in zip(data, nearest_d2):
            if w <= 0.0:
                # Coincides with an existing center: never a candidate.
                continue
            acc += w
            chosen = p
            if acc >= threshold:
                break
        # If rounding left acc just below the threshold, `chosen` is the last
        # positive-weight point. If every point coincides with a center there
        # is no candidate at all, so fall back to the first data point.
        if chosen is None:
            chosen = data[0]

        centers.append(chosen)
        nearest_d2 = [
            min(old, sq_dist(p, chosen)) for p, old in zip(data, nearest_d2)
        ]

    return centers

def _nearest_center(point: Point, centers: List[Point]) -> int:
    """Return the index of the center closest to ``point``.

    Ties go to the lowest index, because ``min`` returns the first minimal
    element it encounters.
    """
    return min(range(len(centers)), key=lambda c: euclidean(point, centers[c]))


def kmeans(
    data: List[Point],
    k: int,
    max_iters: int = 100,
    tol: float = 1e-4,
    init: str = "kmeans++",
    seed: int = 42,
) -> Tuple[List[int], List[Point]]:
    """Cluster ``data`` into ``k`` groups by alternating assignment and update steps.

    Args:
        data: Points to cluster.
        k: Number of clusters; must satisfy ``1 <= k <= len(data)``.
        max_iters: Upper bound on the number of assignment/update rounds.
        tol: Stop once the mean distance moved by the centers in one round
            falls below this value.
        init: ``"kmeans++"`` to seed with ``kmeans_pp_init`` or ``"random"``
            to sample ``k`` distinct points uniformly.
        seed: Passed to ``random.seed``. Note that this reseeds the global
            ``random`` generator as a side effect.

    Returns:
        ``(labels, centers)`` where ``labels[i]`` is the index of the center
        in ``centers`` that is nearest to ``data[i]``.

    Raises:
        ValueError: If ``k`` is out of range or ``init`` is not recognised.
    """
    if k <= 0 or k > len(data):
        raise ValueError("k must be in [1, len(data)]")
    random.seed(seed)

    # Initialize centers
    if init == "kmeans++":
        centers = kmeans_pp_init(data, k)
    elif init == "random":
        centers = random.sample(data, k)
    else:
        raise ValueError("init must be 'kmeans++' or 'random'")

    labels = [0] * len(data)

    for _ in range(max_iters):
        # Assignment step
        new_labels = [_nearest_center(p, centers) for p in data]
        changed = new_labels != labels
        labels = new_labels

        # Update step
        members: List[List[Point]] = [[] for _ in range(k)]
        for lbl, p in zip(labels, data):
            members[lbl].append(p)

        moved = 0.0
        reseeded = False
        for c in range(k):
            if members[c]:  # normal case
                new_c = centroid(members[c])
            else:  # empty cluster: re-seed to a random point
                new_c = random.choice(data)
                reseeded = True
            moved += euclidean(centers[c], new_c)
            centers[c] = new_c

        # A re-seeded center has not had a chance to attract any points yet,
        # so the run cannot be declared converged in this round.
        if reseeded:
            continue

        # Convergence checks
        if not changed or moved / k < tol:
            break

    # The centers may have moved after the last assignment step (tol break,
    # iteration cap, or max_iters == 0), so re-assign once to guarantee the
    # returned labels match the returned centers.
    labels = [_nearest_center(p, centers) for p in data]

    return labels, centers

# Example usage: cluster nine 2-D points (three visually separate groups of
# three) into k=3 clusters and print the result.
if __name__ == "__main__":
    data = [
        [1.0, 2.0], [1.2, 1.9], [0.8, 2.2],
        [8.0, 9.0], [8.2, 8.8], [7.7, 9.3],
        [4.0, 4.0], [3.8, 4.2], [4.2, 3.9],
    ]
    # kmeans passes `seed` to random.seed(), so this call is repeatable.
    labels, centers = kmeans(data, k=3, max_iters=100, tol=1e-4, init="kmeans++", seed=0)
    print("Labels:", labels)
    print("Centers:", centers)

In [5]:
import random
import math

# Euclidean distance
def dist(a, b):
    """Return the Euclidean distance between points ``a`` and ``b``.

    Coordinates are paired with ``zip``; surplus coordinates on the longer
    point are ignored.
    """
    squared_gaps = [(p - q) ** 2 for p, q in zip(a, b)]
    total = sum(squared_gaps)
    return total ** 0.5

# Compute centroid of a cluster
def get_centroid(points):
    """Return the per-dimension average of ``points`` as a list.

    The number of dimensions is read from the first point, so an empty
    ``points`` sequence raises ``IndexError``.
    """
    n_points = len(points)

    def axis_mean(axis):
        # Plain left-to-right accumulation, starting from integer zero.
        running = 0
        for pt in points:
            running += pt[axis]
        return running / n_points

    return [axis_mean(axis) for axis in range(len(points[0]))]


def _closest_center(point, centers):
    """Return the index of the center nearest to ``point``.

    Ties go to the lowest index. Returns -1 when ``centers`` is empty.
    """
    best_idx = -1
    best_dist = float('inf')
    for i, center in enumerate(centers):
        d = dist(point, center)
        if d < best_dist:
            best_dist = d
            best_idx = i
    return best_idx


def kmeans(data, k, max_iters=100, tol=1e-4):
    """Cluster ``data`` into ``k`` groups with randomly chosen initial centers.

    NOTE: defining this function rebinds the name ``kmeans``; when an earlier
    cell of the notebook has already defined ``kmeans``, this one replaces it.

    Args:
        data: Sequence of points (each a sequence of numbers).
        k: Number of clusters. ``random.sample`` raises ``ValueError`` when
            ``k`` is negative or larger than ``len(data)``.
        max_iters: Upper bound on the number of assignment/update rounds.
        tol: A round counts as converged when every center moved by less
            than this distance.

    Returns:
        ``(labels, centers)`` where ``labels[i]`` is the index of the center
        in ``centers`` that is nearest to ``data[i]``.
    """
    # 1. Randomly pick k initial centers
    centers = random.sample(data, k)

    for _ in range(max_iters):
        # 2. Assign each point to nearest center
        clusters = [[] for _ in range(k)]
        for p in data:
            clusters[_closest_center(p, centers)].append(p)

        # 3. Update centers; an empty cluster is re-seeded with a random point
        new_centers = [
            get_centroid(c) if c else random.choice(data)
            for c in clusters
        ]

        # 4. Check convergence, but adopt the fresh centroids first so the
        #    most recent update is what gets returned.
        converged = all(
            dist(old, new) < tol for old, new in zip(centers, new_centers)
        )
        centers = new_centers
        if converged:
            break

    # Final assignment against the centers that are actually returned
    labels = [_closest_center(p, centers) for p in data]

    return labels, centers

# Example: cluster nine 2-D points into k=3 groups and print the result.
# This cell does not call random.seed() itself, so which index (0, 1 or 2)
# each group receives depends on the state of the global `random` generator
# at the time the cell runs.
data = [
    [1, 2], [1.2, 1.9], [0.8, 2.2],
    [8, 9], [8.2, 8.8], [7.7, 9.3],
    [4, 4], [3.8, 4.2], [4.2, 3.9]
]
labels, centers = kmeans(data, k=3)
print("Labels:", labels)
print("Centers:", centers)

Labels: [2, 2, 2, 1, 1, 1, 0, 0, 0]
Centers: [[4.0, 4.033333333333333], [7.966666666666666, 9.033333333333333], [1.0, 2.033333333333333]]
