In [1]:
import pandas as pd
import numpy as np
import math
from matplotlib import pyplot as plt

In [2]:
data = pd.read_csv("/Users/lucastucker/REU-2023/archive/mnist_train.csv")
data = np.array(data)
np.random.shuffle(data)

In [3]:
data.shape

(60000, 785)

In [4]:
m = 2000 # number of MNIST images sampled
n = 784 # number of pixels per MNIST image
t = 10 # number of neighbors measured
sample = data[:m]
X = sample[:, 1:].T / 255
X.shape

(784, 2000)

In [5]:
def get_random_projection(k, X):
    R = np.random.normal(size = (k, n)) 
    frob_norm = np.linalg.norm(R.T.dot(R).dot(X) - X)
    print(f"frobenius norm for random pca is {frob_norm}")
    return R.dot(X)

In [6]:
def get_pca(k, X):
    mean_centered_data = X - np.mean(X, axis=1, keepdims=True)
    covariance_matrix = np.cov(mean_centered_data) 
    eigenvalues, eigenvectors = np.linalg.eig(covariance_matrix)
    sorted_indices = np.argsort(eigenvalues)[::-1]
    sorted_eigenvectors = eigenvectors[:, sorted_indices]
    proj = sorted_eigenvectors[:, :k] # top k minimize the frobenius norm
    frob_norm = np.linalg.norm(proj.dot(proj.T).dot(X) - X)
    print(f"frobenius norm for normal pca is {frob_norm}")
    reduced_data = np.dot(proj.T, mean_centered_data)
    return reduced_data

In [52]:
def gradient_pca(k, X, tolerance, step):
    X = X - np.mean(X, axis=1, keepdims=True)
    XXT = X.dot(X.T)
    diff = math.inf
    U_k = np.random.normal(size = (n, k))
    frob_norm = math.inf
    while diff > tolerance:
        DF = -2 * XXT.dot(U_k)
        U_k_new = U_k + step * DF # shouldn't this subtract?
        diff = np.linalg.norm(U_k - U_k_new)
        U_k = U_k_new
        # print(f"diff is {diff}") # how far we have moved 
        step = step * (1 + np.exp(-frob_norm / 500))
    frob_norm = np.linalg.norm(U_k.dot(U_k.T).dot(X) - X)
    print(f"frobenius norm is {frob_norm}") # the objective function
    return U_k.T.dot(X)

In [20]:
def nearest_t_nbrs(t, X):
    t_nearest = np.ones((m, t), dtype=int) * 1
    for id, row in enumerate(X.T):
        dif = X.T - row # get vector representation-wise differences
        norm_indices = np.argsort(np.linalg.norm(dif, axis = 1))
        t_nearest[id] = norm_indices[1: t + 1]
    return t_nearest # returns m x t matrix representing k_nearest

In [21]:
def t_similarity_score(t, X, X_reduced):
    pixelwise_t_nearest = nearest_t_nbrs(t, X)
    reduced_t_nearest = nearest_t_nbrs(t, X_reduced)
    shared_elems_list = []
    for row, row_tilde in zip(pixelwise_t_nearest , reduced_t_nearest):
        set_1 = set(row)
        set_2 = set(row_tilde)
        shared_elem_count = len(set_1.intersection(set_2))
        shared_elems_list.append(shared_elem_count)
    shared_elems = np.array(shared_elems_list) # row-wise intersection counts
    avg_shared = (1/m) * np.sum(shared_elems)
    return (avg_shared / t)

In [22]:
k = 50

In [23]:
X_pca = get_pca(k, X)

frobenius norm for normal pca is 135.4278813950677


In [24]:
X_random = get_random_projection(k, X)

frobenius norm for random pca is 87576.69283273093


In [25]:
t_similarity_score(k, X, X_random)

0.59913

In [49]:
tol = 0.001
step = 0.00001
X_fast_pca = gradient_pca(k, X, tol, step)

diff is 2.717616131130294
diff is 2.34485603040219
diff is 2.0469865490138375
diff is 1.8076621759283864
diff is 1.6140787131300753
diff is 1.456249722667441
diff is 1.3264207765406661
diff is 1.2185932314866181
diff is 1.1281381944886288
diff is 1.0514864747747898
diff is 0.9858827957060795
diff is 0.9291937006206101
diff is 0.8797594042388127
diff is 0.8362807836080063
diff is 0.7977338572944034
diff is 0.7633053713081003
diff is 0.7323443594253406
diff is 0.704325671613937
diff is 0.6788224149562685
diff is 0.6554850159909241
diff is 0.6340252064972661
diff is 0.6142036829504657
diff is 0.5958205223397258
diff is 0.578707680614881
diff is 0.5627230771772678
diff is 0.5477458972878182
diff is 0.533672837471085
diff is 0.5204150868676207
diff is 0.5078958872109465
diff is 0.49604855081734794
diff is 0.48481484332524644
diff is 0.47414365848952805
diff is 0.46398992795551774
diff is 0.4543137209092539
diff is 0.44507949775744754
diff is 0.43625548920463636
diff is 0.42781317775718936
d

In [51]:
t_similarity_score(t, X, X_fast_pca)

0.026500000000000003