In [22]:
import numpy as np
from tensorflow.keras.datasets import mnist
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Fetch the MNIST train/test splits (images plus integer digit labels)
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Keep only the samples whose label is one of the digits under study
digits_of_interest = [2, 3, 8, 9]
mask = np.isin(y_train, digits_of_interest)
x_train = x_train[mask]
y_train = y_train[mask]
mask = np.isin(y_test, digits_of_interest)
x_test = x_test[mask]
y_test = y_test[mask]

# Flatten every image to a 784-long float64 row and rescale pixels by 1/255
x_train = x_train.reshape(-1, 784).astype(np.float64) / 255.0
x_test = x_test.reshape(-1, 784).astype(np.float64) / 255.0

# Standardize the data
# The scaler is fitted on the training split only; the test split is
# transformed with the training statistics.
scaler = StandardScaler()
x_train_standardized = scaler.fit_transform(x_train)
x_test_standardized = scaler.transform(x_test)

# Apply PCA
n_components = 50
# random_state is pinned because PCA may pick a randomized SVD solver
# (the automatic solver choice depends on the scikit-learn version and the
# input shape); without it the projection can differ between runs.
pca = PCA(n_components=n_components, random_state=42)
# Components are learned from the training split and reused for the test split.
x_train_pca = pca.fit_transform(x_train_standardized)
x_test_pca = pca.transform(x_test_standardized)

In [23]:
def initialize_centroids(X, k, seed=42):
    """Pick ``k`` distinct rows of ``X`` to serve as starting centroids.

    Parameters
    ----------
    X : np.ndarray
        Data matrix with one sample per row.
    k : int
        Number of rows to draw; must not exceed ``X.shape[0]``.
    seed : int, optional
        Seed of the private random generator (default 42). The same
        ``(X, k, seed)`` always yields the same rows.

    Returns
    -------
    np.ndarray
        The ``k`` selected rows of ``X``, in the order they were drawn.

    Raises
    ------
    ValueError
        Propagated from ``choice`` when ``k`` exceeds the number of rows.
    """
    # A private RandomState produces the same draw as seeding the global
    # generator with the same value, but does not overwrite NumPy's global
    # random state as a side effect of every call.
    rng = np.random.RandomState(seed)
    indices = rng.choice(X.shape[0], k, replace=False)
    return X[indices]

def compute_distances(X, centroids):
    """Return the Euclidean distance from every row of ``X`` to every centroid.

    The result has shape ``(X.shape[0], centroids.shape[0])``; entry
    ``[r, c]`` is the L2 norm of ``X[r] - centroids[c]``.
    """
    n_samples, n_centroids = X.shape[0], centroids.shape[0]
    result = np.zeros((n_samples, n_centroids))
    for col in range(n_centroids):
        # Broadcasting subtracts this single centroid from all rows at once.
        offsets = X - centroids[col]
        result[:, col] = np.linalg.norm(offsets, axis=1)
    return result

def assign_clusters(distances):
    """Map each row of ``distances`` to the column index of its smallest entry.

    Ties resolve to the lowest column index, as ``argmin`` does.
    """
    nearest = np.asarray(distances).argmin(axis=1)
    return nearest

def update_centroids(X, labels, k):
    """Recompute each centroid as the mean of the samples assigned to it.

    Parameters
    ----------
    X : np.ndarray
        Data matrix with one sample per row.
    labels : np.ndarray
        Cluster index (``0 .. k-1``) for each row of ``X``.
    k : int
        Number of clusters.

    Returns
    -------
    np.ndarray
        Float array of shape ``(k, X.shape[1])``. A cluster with no members
        is re-seeded with a row of ``X`` drawn from a generator seeded
        with 42, so the result is deterministic.
    """
    centroids = np.zeros((k, X.shape[1]))
    empty = []
    for i in range(k):
        members = X[labels == i]  # select once, reuse for the test and the mean
        if len(members) == 0:
            empty.append(i)
        else:
            centroids[i] = members.mean(axis=0)

    if empty:
        # Draw the replacements for all empty clusters in a single call so
        # they are distinct rows. Re-seeding per cluster would hand every
        # empty cluster the same row, leaving all but one of them empty again
        # on the next assignment step.
        rng = np.random.RandomState(42)
        n_samples = X.shape[0]
        picks = rng.choice(
            n_samples, len(empty), replace=len(empty) > n_samples
        )
        centroids[empty] = X[picks]
    return centroids

def kmeans(X, k, max_iters=100, tol=1e-4):
    """Cluster the rows of ``X`` into ``k`` groups with Euclidean k-means.

    Parameters
    ----------
    X : np.ndarray
        Data matrix with one sample per row.
    k : int
        Number of clusters.
    max_iters : int, optional
        Upper bound on assign/update rounds (default 100).
    tol : float, optional
        Stop once the norm of the centroid change between two consecutive
        rounds drops below this value (default 1e-4).

    Returns
    -------
    tuple of (np.ndarray, np.ndarray)
        ``labels`` (cluster index per row) and ``centroids``. After at least
        one round, ``centroids`` are the ones recomputed from exactly these
        ``labels``.
    """
    centroids = initialize_centroids(X, k)
    labels = None
    for _ in range(max_iters):
        old_centroids = centroids
        labels = assign_clusters(compute_distances(X, old_centroids))
        centroids = update_centroids(X, labels, k)
        # Norm over the whole (k, n_features) difference matrix.
        if np.linalg.norm(centroids - old_centroids) < tol:
            break
    if labels is None:
        # max_iters <= 0: no round ran, so label against the initial centroids
        # instead of failing on an unbound local.
        labels = assign_clusters(compute_distances(X, centroids))
    return labels, centroids

# Run k-means on the flattened pixel vectors, asking for four clusters
k = 4
labels, centroids = kmeans(X=x_train, k=k)

# Calculate SSE (Sum of Squared Errors)
def compute_sse(X, labels, centroids):
    """Sum of squared differences between each sample and its assigned centroid.

    ``labels[r]`` selects the row of ``centroids`` that sample ``X[r]`` is
    compared against; all squared component differences are added up into
    one scalar.
    """
    assigned = centroids[labels]
    residuals = X - assigned
    return np.sum(np.square(residuals))

# Within-cluster squared error of the flattened-image clustering
sse = compute_sse(X=x_train, labels=labels, centroids=centroids)
print(f"SSE (flattened images): {sse}")


SSE (flattened images): 1042456.69919748


In [24]:
# Repeat the clustering on the PCA-projected training features (same k),
# then report the squared error measured in that projected space
labels_pca, centroids_pca = kmeans(X=x_train_pca, k=k)
sse_pca = compute_sse(X=x_train_pca, labels=labels_pca, centroids=centroids_pca)
print(f"SSE (PCA features): {sse_pca}")

SSE (PCA features): 7917818.728368093


In [25]:
from sklearn.metrics import adjusted_rand_score

# Score both clusterings against the true digit labels with the
# Adjusted Rand Index (stored under the "accuracy" names used below)
accuracy_flat = adjusted_rand_score(y_train, labels)
print(f"Clustering accuracy (flattened images): {accuracy_flat}")

accuracy_pca = adjusted_rand_score(y_train, labels_pca)
print(f"Clustering accuracy (PCA features): {accuracy_pca}")


Clustering accuracy (flattened images): 0.4921245479418843
Clustering accuracy (PCA features): 0.4170084635128988


In [26]:
from sklearn.metrics.pairwise import cosine_distances

def compute_cosine_distances(X, centroids):
    """Return the matrix of cosine distances between rows of ``X`` and ``centroids``.

    Thin wrapper around ``sklearn.metrics.pairwise.cosine_distances`` so the
    cosine variant mirrors the signature of the Euclidean distance helper.
    """
    pairwise = cosine_distances(X, centroids)
    return pairwise

def kmeans_cosine(X, k, max_iters=100, tol=1e-4):
    """Cluster the rows of ``X`` into ``k`` groups, assigning by cosine distance.

    Samples are assigned to the centroid with the smallest cosine distance;
    centroids are still updated as the arithmetic mean of their members, and
    convergence is still judged by the norm of the centroid change.

    Parameters
    ----------
    X : np.ndarray
        Data matrix with one sample per row.
    k : int
        Number of clusters.
    max_iters : int, optional
        Upper bound on assign/update rounds (default 100).
    tol : float, optional
        Stop once the norm of the centroid change between two consecutive
        rounds drops below this value (default 1e-4).

    Returns
    -------
    tuple of (np.ndarray, np.ndarray)
        ``labels`` (cluster index per row) and ``centroids``. After at least
        one round, ``centroids`` are the ones recomputed from exactly these
        ``labels``.
    """
    centroids = initialize_centroids(X, k)
    labels = None
    for _ in range(max_iters):
        old_centroids = centroids
        labels = assign_clusters(compute_cosine_distances(X, old_centroids))
        centroids = update_centroids(X, labels, k)
        # Norm over the whole (k, n_features) difference matrix.
        if np.linalg.norm(centroids - old_centroids) < tol:
            break
    if labels is None:
        # max_iters <= 0: no round ran, so label against the initial centroids
        # instead of failing on an unbound local.
        labels = assign_clusters(compute_cosine_distances(X, centroids))
    return labels, centroids

# Cosine-distance clustering of the flattened images.
# Note: compute_sse measures squared Euclidean error, also for these labels.
labels_cosine, centroids_cosine = kmeans_cosine(X=x_train, k=k)
sse_cosine = compute_sse(
    X=x_train, labels=labels_cosine, centroids=centroids_cosine
)
print(f"SSE (flattened images, cosine): {sse_cosine}")

# Cosine-distance clustering of the PCA features
labels_pca_cosine, centroids_pca_cosine = kmeans_cosine(X=x_train_pca, k=k)
sse_pca_cosine = compute_sse(
    X=x_train_pca, labels=labels_pca_cosine, centroids=centroids_pca_cosine
)
print(f"SSE (PCA features, cosine): {sse_pca_cosine}")

# Adjusted Rand Index of each cosine clustering against the true digit labels
accuracy_flat_cosine = adjusted_rand_score(y_train, labels_cosine)
accuracy_pca_cosine = adjusted_rand_score(y_train, labels_pca_cosine)
print(f"Clustering accuracy (flattened images, cosine): {accuracy_flat_cosine}")
print(f"Clustering accuracy (PCA features, cosine): {accuracy_pca_cosine}")

SSE (flattened images, cosine): 1043827.4055159354
SSE (PCA features, cosine): 7926491.812979403
Clustering accuracy (flattened images, cosine): 0.45591412906393075
Clustering accuracy (PCA features, cosine): 0.4822316874986062
