In [19]:
import numpy as np
import pandas as pd
import kagglehub
from kagglehub import KaggleDatasetAdapter
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [15]:
class K_means:
    def __init__(self, k, init, max_itrns=100, tolerance=1e-4, random_state=None):
        self.k = k
        self.init = init
        self.max_itrns = max_itrns
        self.tolerance = tolerance
        self.random_state = random_state
        self.centroids = None
        self.labels = None
        self.inertia_history = []
    
    def _init_random(self, x):
        num_of_samples = x.shape[0] # rows
        indices = np.random.choice(num_of_samples, size=self.k, replace=False)
        return  x[indices]
    
    def _init_Kpp(self, x):
        num_of_samples = x.shape[0] # rows
        index = np.random.randint(num_of_samples) # only one sample
        centroids = []
        centroids.append(x[index])

        for _ in (1, self.k):
            dists_sq = np.min(
                    [np.sum((x - c) ** 2, axis=1) for c in centroids],
                    axis=0
                ) # for each point in x:
                # compute distance to every centroid
                # keep the smallest one
        probs = dists_sq / np.sum(dists_sq)
        index = np.random.choice(num_of_samples, p=probs) 
        centroids.append(x[index])

        return np.array(centroids)

    def assign_clusters(self, x):
        distances = np.linalg.norm(x[:, None] - self.centroids, axis=2)
        return np.argmin(distances, axis=1)

    def compute_centroids(self, x):
        new_centroids = np.array([
            x[self.labels == i].mean(axis=0) if np.any(self.labels == i)
            else self.centroids[i]  # handle empty cluster
            for i in range(self.k)
        ])
        return new_centroids

    def compute_inertia(self, x):
        return np.sum((x - self.centroids[self.labels]) ** 2)
    
    def fit(self, x):
        if self.random_state is not None:
            np.random.seed(self.random_state)

        if self.init == "random":
            self.centroids = self._init_random(x)
        elif self.init == "kmeans++":
            self.centroids = self._init_Kpp(x)
        else:
            raise ValueError("init must be 'random' or 'kmeans++'")

        for _ in range(self.max_itrns):
            self.labels = self.assign_clusters(x)
            inertia = self.compute_inertia(x)
            self.inertia_history.append(inertia)

            new_centroids = self.compute_centroids(x)

            # tolerance-based convergence
            shift = np.linalg.norm(self.centroids - new_centroids)
            if shift < self.tolerance:
                break

            self.centroids = new_centroids
            return self

    def predict(self, x):
        return self.assign_clusters(x)

In [2]:
df = kagglehub.dataset_load(KaggleDatasetAdapter.PANDAS,
                            "uciml/breast-cancer-wisconsin-data",
                            "data.csv"
    )

print(df.shape)

(569, 33)


In [5]:
y = df["diagnosis"].map({"M": 1, "B": 0}).values

# drop non-feature columns
X = df.drop(columns=["id", "diagnosis", "Unnamed: 32"], errors="ignore").values

# standardize features
scaler = StandardScaler()
X = scaler.fit_transform(X)

print(X.shape)  

(569, 30)


In [16]:
k = 2

kmeans_rand = K_means(k=k, init="random", random_state=42)
kmeans_rand.fit(X)

kmeans_pp = K_means(k=k, init="kmeans++", random_state=42)
kmeans_pp.fit(X)

print("Random inertia:", kmeans_rand.inertia_history)
print("KMeans++ inertia:", kmeans_pp.inertia_history)

Random inertia: [np.float64(14858.8699757902)]
KMeans++ inertia: [np.float64(16420.447280984325)]
