In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import struct
from collections import Counter

In [25]:
train_images_path = 'dataset/train-images.idx3-ubyte'
train_labels_path = 'dataset/train-labels.idx1-ubyte'
test_images_path = 'dataset/t10k-images.idx3-ubyte'
test_labels_path = 'dataset/t10k-labels.idx1-ubyte'

def load_idx(filename):
    with open(filename, 'rb') as f:
        zero, data_type, dims = struct.unpack('>HBB', f.read(4))
        shape = tuple(struct.unpack('>I', f.read(4))[0] for d in range(dims))
        return np.frombuffer(f.read(), dtype=np.uint8).reshape(shape)

In [26]:
from sklearn.decomposition import PCA
pca = PCA(n_components=20)
X_train = load_idx(train_images_path).reshape(60000, 784)
y_train = load_idx(train_labels_path)
X_test = load_idx(test_images_path).reshape(10000, 784)
y_test = load_idx(test_labels_path)

X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)


from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train_pca = sc.fit_transform(X_train_pca)
X_test_pca = sc.transform(X_test_pca)

In [27]:
print('X_train shape:', X_train_pca.shape)
print('X_test shape:', X_test_pca.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

X_train shape: (60000, 20)
X_test shape: (10000, 20)
y_train shape: (60000,)
y_test shape: (10000,)


In [28]:
X_train_pca

array([[ 0.21485559, -0.63392676, -0.05330361, ...,  1.46802653,
         1.27204642,  0.83839742],
       [ 1.75396433, -0.59780343,  1.29668494, ...,  0.17272836,
         0.24901099, -0.51095848],
       [-0.08988901,  0.79510551, -0.40989693, ..., -0.58033766,
        -0.04847295, -2.17738855],
       ...,
       [-0.30868215,  0.32454814, -0.56015578, ..., -0.19089356,
         0.49895653,  1.20682925],
       [ 0.22642506, -0.01133728,  1.11733794, ..., -0.68586875,
        -0.36871049,  0.9867435 ],
       [-0.30067703, -0.05011574,  1.20901148, ...,  1.29312992,
         0.52952658, -1.03281044]])

In [29]:
def cGMM(X, K, max_iter=100, tol=1e-5):
    n_samples, n_features = X.shape
    pi = np.ones(K) / K 
    means = np.random.rand(K, n_features)
    variances = np.ones((K, n_features))
    responsibilities = np.zeros((n_samples, K))

    prev_log_likelihood = -np.inf  # track the log likelihood for convergence
    
    for iteration in range(max_iter):
        # e step, update responsibilities
        log_responsibilities = np.zeros((n_samples, K))
        
        for k in range(K):
            for j in range(n_features):
                log_responsibilities[:, k] += (
                    -0.5 * np.log(2 * np.pi * variances[k, j])
                    -0.5 * ((X[:, j] - means[k, j]) ** 2) / variances[k, j]
                )
        
        # Normalize responsibilities
        log_responsibilities = np.exp(log_responsibilities - np.max(log_responsibilities, axis=1, keepdims=True))
        responsibilities = log_responsibilities / log_responsibilities.sum(axis=1, keepdims=True)
        
        # m step, update parameters (means, variances, and pi)
        for k in range(K):
            Nk = responsibilities[:, k].sum()
            pi[k] = Nk / n_samples
            
            for j in range(n_features):
                means[k, j] = (responsibilities[:, k] * X[:, j]).sum() / Nk
                variances[k, j] = (responsibilities[:, k] * (X[:, j] - means[k, j])**2).sum() / Nk
        
        # log-likelihood for convergence check
        log_likelihood = np.sum(np.log(responsibilities.sum(axis=1)))
        
        if iteration > 1 and np.abs(log_likelihood - prev_log_likelihood) < tol:
            break
        
        prev_log_likelihood = log_likelihood 
    
    return means, variances, pi, responsibilities

In [30]:
K = 10
means, variances, pi, responsibilities = cGMM(X_train_pca, K)

clusters = np.argmax(responsibilities, axis=1)


def cluster_consistency_cgmm(clusters, labels, k):
    Q_total = 0
    valid_clusters = 0
    
    for cluster in range(k):
        cluster_points = labels[clusters == cluster]
        if len(cluster_points) == 0:
            continue
        
        label_counts = Counter(cluster_points)
        mi = max(label_counts.values()) 
        Ni = len(cluster_points) 
        
        if Ni > 0:
            Qi = mi / Ni
            Q_total += Qi
            valid_clusters += 1
    
    return Q_total / valid_clusters if valid_clusters > 0 else 0

Q_cgmm = cluster_consistency_cgmm(clusters, y_train, K)
print(f"Cluster consistency for cGMM with K={K}: {Q_cgmm}")

Cluster consistency for cGMM with K=10: 0.4347207636499542
