In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import struct

In [2]:
np.set_printoptions(threshold=60000)

pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 5000)

In [3]:
train_images_path = 'dataset/train-images.idx3-ubyte'
train_labels_path = 'dataset/train-labels.idx1-ubyte'
test_images_path = 'dataset/t10k-images.idx3-ubyte'
test_labels_path = 'dataset/t10k-labels.idx1-ubyte'

def load_idx(filename):
    with open(filename, 'rb') as f:
        zero, data_type, dims = struct.unpack('>HBB', f.read(4))
        shape = tuple(struct.unpack('>I', f.read(4))[0] for d in range(dims))
        return np.frombuffer(f.read(), dtype=np.uint8).reshape(shape)

In [4]:
from sklearn.decomposition import PCA
pca = PCA(n_components=20)
X_train = load_idx(train_images_path).reshape(60000, 784)
y_train = load_idx(train_labels_path)
X_test = load_idx(test_images_path).reshape(10000, 784)
y_test = load_idx(test_labels_path)

X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)


In [5]:
print('X_train shape:', X_train_pca.shape)
print('X_test shape:', X_test_pca.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

X_train shape: (60000, 20)
X_test shape: (10000, 20)
y_train shape: (60000,)
y_test shape: (10000,)


In [6]:
X_train_pca

array([[ 123.93258788, -312.67424419,  -24.51406838, ...,  308.5890876 ,
         278.05128029,  163.27684841],
       [1011.71837533, -294.85701741,  596.33954532, ...,   39.57384325,
          52.96289747, -102.97723732],
       [ -51.84960873,  392.17315428, -188.50976325, ..., -122.37998602,
         -11.00721993, -435.3068495 ],
       ...,
       [-178.05344948,  160.07821073, -257.6130816 , ...,  -37.33350111,
         101.97751982,  240.76596341],
       [ 130.60607123,   -5.59193388,  513.85867477, ..., -141.95338899,
         -75.06672135,  196.34573267],
       [-173.43595135,  -24.71880663,  556.01890908, ...,  269.33091639,
          93.14246603, -208.41044729]])

In [7]:
def kmeans(X, k, max_iter=100, tol=1e-4):
    centroids = X[np.random.choice(len(X), k, replace=False)]
    
    for _ in range(max_iter):
        # calculate euclidean distances between each point and the centroids
        distances = np.sqrt(((X[:, np.newaxis] - centroids)**2).sum(axis=2))
        #Assign each point to the nearest centroid
        clusters = np.argmin(distances, axis=1)
        new_centroids = np.array([X[clusters == i].mean(axis=0) for i in range(k)])
        
        # convergence check if centroids change less than tolerance
        if np.linalg.norm(new_centroids - centroids) < tol:
            break
        
        centroids = new_centroids
    
    return centroids, clusters

In [8]:
def cluster_consistency(labels, true_labels, k):
    Qs = []
    for i in range(k):
        cluster_points = true_labels[labels == i]
        mi = np.bincount(cluster_points).max()
        Ni = len(cluster_points)
        Qs.append(mi / Ni)
    return np.mean(Qs)

In [9]:
k_values = [5, 10, 20, 40]
results = {}

for k in k_values:
    centroids, clusters = kmeans(X_train_pca, k)
    # print(clusters)
    Q = cluster_consistency(clusters, y_train, k)
    results[k] = Q
    print(f"Cluster consistency for k={k}: {Q}")

Cluster consistency for k=5: 0.3868197667130647
Cluster consistency for k=10: 0.6008799260014361
Cluster consistency for k=20: 0.7229718345385645
Cluster consistency for k=40: 0.8181196634215381
