In [1]:
from typing import List, Any, Tuple, Callable, Union
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import math
import pandas as pd

In [2]:
def sample_once() -> np.ndarray:
    """
    Mock query function that returns a vector for debugging composed of random integers
    sampled from different ranges.
    """
    a = int(np.random.normal(0, 1) * 10)
    b = int(np.random.normal(0, 1) * 10)
    c = int(np.random.normal(0, 1) * 50)

    # make the ideal basis rotated by 45 degrees

    # mat = np.array([a, b, c])
    #proj = np.array([[3, 1, 1], [-1, 2, 1], [-1/2, -2, 7/2]])

    rot_45_x = np.array([[1, 0, 0], [0, math.cos(math.pi/4), -math.sin(math.pi/4)], [0, math.sin(math.pi/4), math.cos(math.pi/4)]])
    a, b, c = np.dot(rot_45_x, np.array([a, b, c]))
    
    # a, b, c = np.dot(proj, np.array([a, b, c]))
    # a, b, c = np.dot(np.linalg.inv(proj), np.array([a, b, c]))

    return np.array([a, b, c])

sample_once()

array([-13.        ,  -8.48528137,   7.07106781])

In [3]:
results = []

In [4]:
for sample_size in [10, 100, 1000, 10000, 100000, 1000000]:
    for trial in range(50):
        data = np.array([sample_once() for _ in range(sample_size)])

        y_cov: np.ndarray = np.atleast_2d(np.cov(data.T))
        u, eigs, u_t = np.linalg.svd(y_cov)
        
        # project the data onto the basis vectors
        data2 = u @ data.T

        # Measure how good the basis vectors are at capturing the data
        # measure variance of the projected data
        # old_var = np.var(data, axis=0)
        # print(old_var, sum(old_var))
        var = [np.var(data2[i]) for i in range(len(data2))]
        # print(var, sum(var))

        eig_sum = np.sum(eigs)

        eig_sqrt_sum = np.sum([math.sqrt(eig) for eig in eigs])

        results.append([sample_size, var, eig_sum, eig_sqrt_sum])

        # in the projected space, we use the basis vectors as approximations of the data
        # we could measure the error of the approximation by measuring the variance of the residuals
        # residuals are measured against the axes of the projected space, so they are equivalent to magnitude in each direction
        # print(np.mean([a**2 for a in np.var(data, axis=0)]))
        # mse = np.mean([a**2 for a in np.var(data2, axis=0)])
        # print(mse)

results

[[10,
  [101.14787996975527, 2132.829979742635, 1483.6721402876092],
  4130.7222222222235,
  81.07031594556545],
 [10,
  [162.6613061171173, 605.1726733279261, 816.4260205549573],
  1760.2888888888886,
  57.29145244119595],
 [10,
  [2146.6710578760944, 473.81846181716656, 202.89048030673928],
  3137.088888888888,
  70.22599858454831],
 [10,
  [181.03220229737795, 965.3640561240275, 549.0237415785949],
  1883.7999999999993,
  55.559303004269175],
 [10,
  [92.52296623890881, 774.3265265789404, 364.5705071821507],
  1368.2444444444443,
  54.32034555125553],
 [10,
  [1434.5566899486435, 60.93181844837595, 390.16149160298085],
  2095.1666666666665,
  60.26818555823485],
 [10,
  [1332.7264049146008, 382.8982163824759, 324.6353787029225],
  2266.9555555555553,
  67.58967470444921],
 [10,
  [114.8890391092257, 1174.9293181075368, 1054.6016427832367],
  2604.911111111111,
  63.67849599348671],
 [10,
  [119.06930290567111, 1590.8782990302057, 1885.7923980641249],
  3995.266666666665,
  81.761890

In [5]:
df = pd.DataFrame(results, columns=["Sample Size", "Variance", "Sum of Eigenvalues", "Sum of Square Root of Eigenvalues"])
df.to_csv("svd_results.csv")
# write pickle
df.to_pickle("svd_results.pkl")