In [10]:
from multiprocessing import Pool
import os, tarfile, warnings
from tqdm import tqdm

def update_mean_and_sd(mean, M2, n, x):
    """Fold one observation into a running mean and standard deviation.

    This is a single step of Welford's online update: the caller keeps
    ``mean``, ``M2`` and ``n`` between calls and feeds them back in.

    Args:
        mean: Running mean of the observations seen so far.
        M2: Running sum of squared deviations from the mean.
        n: Number of observations seen so far.
        x: The new observation.

    Returns:
        ``(mean, M2, sd, n)`` after including ``x``. ``sd`` uses the ``n - 1``
        denominator and is ``0`` while fewer than two observations are known.
    """
    n += 1

    # Distance of x from the mean *before* it is updated ...
    offset_before = x - mean
    mean += offset_before / n
    # ... and from the mean *after* the update; their product is the M2 increment.
    offset_after = x - mean
    M2 += offset_before * offset_after

    if n > 1:
        sd = (M2 / (n - 1)) ** 0.5
    else:
        sd = 0
    return mean, M2, sd, n

def count_papers_in_tar(tar_path):
    """Count the members of one tar archive whose names end in ``.gz``.

    Args:
        tar_path: Path of the tar archive to inspect.

    Returns:
        A ``(num_papers, tar_path)`` tuple. ``num_papers`` is 0 when the
        archive cannot be read. ``tar_path`` is echoed back unchanged.

    Warns:
        UserWarning: If opening or listing the archive raises. The failure is
            reported instead of being silently discarded, so an unreadable
            archive can be told apart from one that has no ``.gz`` members.
    """
    try:
        with tarfile.open(tar_path, 'r') as tar:
            # getmembers() reads the whole index first, so a failure while
            # listing yields a count of 0 rather than a partial count.
            num_papers = sum(
                member.name.endswith('.gz') for member in tar.getmembers()
            )
    except Exception as e:
        # Best effort: one unreadable archive must not abort the whole scan,
        # but the problem is surfaced rather than hidden.
        warnings.warn(f"Failed to open {tar_path} due to error: {e}. Assuming no papers in this tar.")
        num_papers = 0
    return num_papers, tar_path

def initializer(tqdm):
    """Publish ``tqdm`` as an attribute of the ``builtins`` module.

    Passed to ``Pool`` as its ``initializer`` with ``tqdm`` as the sole
    argument.

    Args:
        tqdm: Object to store as ``builtins.tqdm``.
    """
    import builtins

    setattr(builtins, 'tqdm', tqdm)

def count_papers(root_dir):
    """Count ``.gz`` members across every ``.tar`` file under ``root_dir``.

    The directory tree is walked for files whose names end in ``.tar``; each
    one is handed to ``count_papers_in_tar`` in a worker pool, and the
    per-archive counts are summed while a progress bar tracks completion.

    Args:
        root_dir: Directory to search recursively for ``.tar`` files.

    Returns:
        Sum of the counts reported by ``count_papers_in_tar`` over all
        archives found (0 when none are found).
    """
    tar_paths = [
        os.path.join(dirpath, filename)
        for dirpath, _dirnames, filenames in os.walk(root_dir)
        for filename in filenames
        if filename.endswith('.tar')
    ]

    total_papers = 0
    with Pool(initializer=initializer, initargs=(tqdm,)) as pool:
        # A sum does not depend on order, so results are consumed as soon as
        # any worker finishes; this also keeps the progress bar moving.
        per_tar_counts = pool.imap_unordered(count_papers_in_tar, tar_paths)
        for num_papers, _tar_path in tqdm(per_tar_counts, total=len(tar_paths)):
            total_papers += num_papers

    return total_papers

# Directory tree to scan; a leading `~` is expanded to the current user's home.
# NOTE(review): presumably the mount point of an arXiv source dump, going by
# the path. Confirm, and consider making it configurable.
root_dir = os.path.expanduser('~/mounted_drives/arXiv/root/arXiv/src/')
# Run the scan over root_dir and print the value count_papers returns.
print(f'Total number of papers: {count_papers(root_dir)}')

- method gz: ReadError('not a gzip file')
- method bz2: ReadError('not a bzip2 file')
- method xz: ReadError('not an lzma file')
- method tar: ReadError('invalid header'). Assuming no papers in this tar.
 25%|█████████████████████████████████████████████████▋                                                                                                                                                   | 1620/6427 [00:21<01:02, 76.49it/s]


KeyboardInterrupt: 

In [8]:
import numpy as np

def update_mean_and_sd(mean, M2, n, x):
    """Advance a running mean / standard deviation by one observation.

    Implements one step of Welford's online update. ``mean``, ``M2`` (the
    running sum of squared deviations) and ``n`` (the observation count) are
    the state returned by the previous call, or all zero for the first one.

    Args:
        mean: Running mean before ``x`` is included.
        M2: Running sum of squared deviations before ``x`` is included.
        n: Number of observations before ``x`` is included.
        x: The new observation.

    Returns:
        ``(mean, M2, sd, n)`` with ``x`` included. ``sd`` is computed with the
        ``n - 1`` denominator, and is ``0`` when ``n`` is not greater than 1.
    """
    n += 1

    # The deviation is measured against the old mean and again against the
    # new mean; the product of the two is what M2 grows by.
    deviation_old = x - mean
    mean += deviation_old / n
    deviation_new = x - mean
    M2 += deviation_old * deviation_new

    if n > 1:
        return mean, M2, (M2 / (n - 1)) ** 0.5, n
    return mean, M2, 0, n

# Reproducible sample: 1000 draws from the standard normal distribution
np.random.seed(0)
data = np.random.randn(1000)

# Running state starts empty
mean = M2 = n = 0

# Feed the observations in one at a time and compare the running statistics
# with numpy's results for the same prefix of the data.
for i in range(1, len(data) + 1):
    x = data[i - 1]
    mean, M2, sd, n = update_mean_and_sd(mean, M2, n, x)

    prefix = data[:i]

    # The ddof=1 standard deviation needs at least two observations
    if i > 1:
        np_sd = np.std(prefix, ddof=1)
        assert np.isclose(sd, np_sd), f'SDs do not match at i={i}: {sd} vs {np_sd}'

    np_mean = np.mean(prefix)
    assert np.isclose(mean, np_mean), f'Means do not match at i={i}: {mean} vs {np_mean}'

print('All means and standard deviations match.')


All means and standard deviations match.
