In [9]:
import os 
from tqdm import tqdm
import sparse
import numpy as np

In [2]:
# Recursively gather the path of every file below data/Task2
# (bottom-up walk, so sub-directories are visited before their parents).
names = [
    os.path.join(folder, filename)
    for folder, _subdirs, filenames in os.walk("data/Task2", topdown=False)
    for filename in filenames
]

In [5]:
# Sanity check: first collected path, then the total number of files found.
print(names[0], len(names), sep="\n")

data/Task2/Quark/13211.npz
36272


Golden (reference) values for the small dataset:
- mean = np.array([0.00782386, 0.00482207, 0.00304506])
- std = np.array([0.63704916, 0.18686025, 0.04598732])


### Calculate mean (small memory footprint)

In [10]:
# First pass: per-channel sums and pixel counts, holding one file in memory at a time.
#
# The accumulators are created as float64 / int64 up front. Starting from the
# Python int 0 and using `+=` would make the running total adopt the dtype of
# the first per-file sum, so float32 inputs (dtype of the stored arrays is not
# checked here) would be accumulated across all files in float32.
counts = np.zeros(3, dtype=np.int64)
means = np.zeros(3, dtype=np.float64)
for f in tqdm(names):
    # One row per pixel, one column per channel (3 channels assumed, as before).
    pixels = sparse.load_npz(f).todense().reshape(-1, 3)

    # Count the rows actually summed instead of assuming a 3-D (H, W, 3) array.
    counts += pixels.shape[0]
    means += pixels.sum(axis=0, dtype=np.float64)

# Fail loudly instead of printing NaNs when nothing was read.
assert counts.all(), "no pixels were read - is `names` empty?"

means = means / counts
print(means)

100%|██████████| 36272/36272 [00:44<00:00, 812.13it/s]

[0.00782386 0.00482207 0.00304506]





### Calculate std (small memory footprint)

$$ \sigma = \sqrt{\frac{ \sum_{i=1}^{N} (x_{i} - \bar{x})^{2} }{N}} $$

In [12]:
# Second pass: accumulate, per channel, the squared deviations from `means`.
# After the loop `stds` holds the raw sum of squares; the population standard
# deviation is obtained by dividing by the pixel counts and taking the root.
stds = 0
for path in tqdm(names):
    pixels = sparse.load_npz(path).todense().reshape(-1, 3)
    deviations = pixels - means
    stds += (deviations ** 2).sum(axis=0)

variance = stds / counts
print(np.sqrt(variance))

100%|██████████| 36272/36272 [00:48<00:00, 742.12it/s]

[0.63704916 0.18686025 0.04598732]





### Final: reusable two-pass function

In [14]:
def calculate_mean_std_2pass_low_memory(directory, channels=3):
    """Compute per-channel mean and population std over all ``.npz`` files in a tree.

    The data is read twice, one file at a time, so memory use stays at a single
    dense array: the first pass sums values and counts pixels to obtain the
    means, the second pass sums squared deviations from those means.

    Args:
        directory: Root directory searched recursively for ``.npz`` files that
            ``sparse.load_npz`` can read.
        channels: Number of channels; every array is reshaped to
            ``(-1, channels)`` before reduction. Defaults to 3.

    Returns:
        ``(means, stds)``: two float64 arrays of length ``channels``. ``stds``
        is the population standard deviation (divisor N, not N - 1).

    Raises:
        ValueError: If no ``.npz`` file is found, if the files contain no
            values, or (from ``reshape``) if an array's size is not a multiple
            of ``channels``.
    """
    # Only .npz files are loadable; sorting makes the summation order (and thus
    # the last floating-point digits) reproducible across runs and machines.
    names = sorted(
        os.path.join(root, name)
        for root, _dirs, files in os.walk(directory, topdown=False)
        for name in files
        if name.endswith(".npz")
    )
    if not names:
        raise ValueError(f"no .npz files found under {directory!r}")

    # Pass 1: per-channel sums, accumulated in float64 regardless of file dtype.
    count = 0
    totals = np.zeros(channels, dtype=np.float64)
    for f in tqdm(names):
        pixels = np.asarray(sparse.load_npz(f).todense()).reshape(-1, channels)
        # Rows of the reshaped array are exactly the samples summed per channel.
        count += pixels.shape[0]
        totals += pixels.sum(axis=0, dtype=np.float64)
    if count == 0:
        raise ValueError(f"the .npz files under {directory!r} contain no values")
    means = totals / count

    # Pass 2: per-channel sum of squared deviations from the means.
    squared_deviations = np.zeros(channels, dtype=np.float64)
    for f in tqdm(names):
        pixels = np.asarray(sparse.load_npz(f).todense()).reshape(-1, channels)
        squared_deviations += np.square(pixels - means).sum(axis=0, dtype=np.float64)

    stds = np.sqrt(squared_deviations / count)

    return means, stds

In [15]:
# Per-channel (means, stds) over every file under data/Task2_small; reads the directory twice.
calculate_mean_std_2pass_low_memory("data/Task2_small")

100%|██████████| 36272/36272 [00:42<00:00, 844.71it/s]
100%|██████████| 36272/36272 [00:51<00:00, 705.82it/s]


(array([0.00782386, 0.00482207, 0.00304506]),
 array([0.63704916, 0.18686025, 0.04598732]))

In [16]:
# Per-channel (means, stds) over every file under data/Task2_large; reads the directory twice.
calculate_mean_std_2pass_low_memory("data/Task2_large")

100%|██████████| 139306/139306 [02:52<00:00, 805.96it/s]
100%|██████████| 139306/139306 [03:11<00:00, 728.10it/s]


(array([0.00793777, 0.00491434, 0.00303773]),
 array([1.89825445, 0.18734633, 0.04594031]))