In [58]:
import pandas as pd
import numpy as np
import math
from matplotlib import pyplot as plt
from scipy.stats import ortho_group

In [59]:
data = pd.read_csv("/Users/lucastucker/REU-2023/archive/mnist_train.csv")
data = np.array(data)
np.random.shuffle(data)

In [60]:
data.shape

(60000, 785)

In [61]:
m = 2000 # number of MNIST images sampled
n = 784 # number of pixels per MNIST image
t = 50 # number of neighbors measured
sample = data[:m]
X = sample[:, 1:].T / 255
X.shape

(784, 2000)

In [62]:
def get_random_projection(k, X):
    R = np.random.normal(size = (k, n)) 
    frob_norm = np.linalg.norm(R.T.dot(R).dot(X) - X)
    print(f"frobenius norm for random pca is {frob_norm}")
    return R.dot(X)

In [63]:
def get_pca(k, X):
    mean_centered_data = X - np.mean(X, axis=1, keepdims=True)
    covariance_matrix = np.cov(mean_centered_data) 
    eigenvalues, eigenvectors = np.linalg.eig(covariance_matrix)
    sorted_indices = np.argsort(eigenvalues)[::-1]
    sorted_eigenvectors = eigenvectors[:, sorted_indices]
    proj = sorted_eigenvectors[:, :k] # top k minimize the frobenius norm
    frob_norm = np.linalg.norm(proj.dot(proj.T).dot(X) - X)
    print(f"frobenius norm for normal pca is {frob_norm}")
    reduced_data = np.dot(proj.T, mean_centered_data)
    return reduced_data

In [88]:
# This is performed according to the gradient calculated in the paper
def gradient_pca(k, X, tolerance, step):

    X = X - np.mean(X, axis=1, keepdims=True)
    XXT = X.dot(X.T)
    diff = math.inf
    proj = ortho_group.rvs(dim=n)[:, :k] # random n x k orthogonal matrix
    norm = np.linalg.norm(proj)
    proj = proj / norm
    frob_norm = math.inf

    while diff > tolerance:
        DF = -2 * XXT.dot(proj)
        proj_new = (proj - step * DF) # gradient descent step
        proj_new = proj_new / (np.linalg.norm(proj_new)) # keep normalized -- essential to prevent exploding gradient!
        diff = np.linalg.norm(proj - proj_new)
        proj = proj_new
        # print(f"diff is {diff}") # how far we have moved 
        # step = step * (1 + np.exp(-frob_norm / 500)) optional to improve performance
    frob_norm = np.linalg.norm(proj.dot(proj.T).dot(X) - X)
    print(f"frobenius norm is {frob_norm}") # the objective function
    reduced_data = np.dot(proj.T, X)
    return reduced_data

In [65]:
def nearest_t_nbrs(t, X):
    t_nearest = np.ones((m, t), dtype=int) * 1
    for id, row in enumerate(X.T):
        dif = X.T - row # get vector representation-wise differences
        norm_indices = np.argsort(np.linalg.norm(dif, axis = 1))
        t_nearest[id] = norm_indices[1: t + 1]
    return t_nearest # returns m x t matrix representing k_nearest

In [66]:
def t_similarity_score(t, X, X_reduced):
    pixelwise_t_nearest = nearest_t_nbrs(t, X)
    reduced_t_nearest = nearest_t_nbrs(t, X_reduced)
    shared_elems_list = []
    for row, row_tilde in zip(pixelwise_t_nearest, reduced_t_nearest):
        set_1 = set(row)
        set_2 = set(row_tilde)
        shared_elem_count = len(set_1.intersection(set_2))
        shared_elems_list.append(shared_elem_count)
    shared_elems = np.array(shared_elems_list) # row-wise intersection counts
    avg_shared = (1/m) * np.sum(shared_elems)
    return (avg_shared / t)

In [67]:
k = 50

In [68]:
X_pca = get_pca(k, X)

frobenius norm for normal pca is 135.33232068230524


In [69]:
X_random = get_random_projection(k, X)

frobenius norm for random pca is 80473.60524843165


In [70]:
t_similarity_score(t, X, X_random)

0.56561

In [90]:
tol = 0.001
step = 0.01
X_gradient_pca = gradient_pca(k, X, tol, step)

diff is 1.2199195109599732
diff is 0.4187713310779984
diff is 0.22937159487463063
diff is 0.15528108689135592
diff is 0.10481287525564816
diff is 0.07017700654901846
diff is 0.04706541456486305
diff is 0.03176185375786837
diff is 0.021587372017553102
diff is 0.014768419228317797
diff is 0.010159614477641729
diff is 0.007020966404123286
diff is 0.004869858328497382
diff is 0.0033878508357724118
diff is 0.0023624833854472556
diff is 0.0016506201556971487
diff is 0.0011550390849582807
diff is 0.0008092571721034223
frobenius norm is 308.81256983935054


In [72]:
t_similarity_score(t, X, X_gradient_pca)

0.10358