This problem was asked by Facebook.

Given a stream of elements too large to store in memory, pick a random element from the stream with uniform probability.

In [None]:
'''
google or wiki: reservoir sampling

this is a reservoir sampling of size k
'''

In [30]:
import random
from random import randrange



def reservoir_sampling(nums: list[int], sample_count: int) -> list[int]:
    # nums represents the list of data that's too big for the memory

    assert len(nums) > 0
    assert sample_count > 1
    res = [] * sample_count

    for idx, ele in enumerate(nums):
        if idx < sample_count:
            res.append(ele)
        else: 
            rdn = randrange(idx)
            if rdn < sample_count:
                res[rdn] = ele
    
    return res
        


In [21]:
3/8

0.375

In [36]:
'''
tests
'''
random.seed(42)
large_number = 100
reservoir_size = 10
test = [i for i in range(large_number)]

# run this a few number of times to get a distribution of reservoir size 10, so all numbers should show up about 10% of the time
counts = [0] * large_number
nums_sims = 10000
for i in range(nums_sims):
    cur_run = reservoir_sampling(test, reservoir_size)
    assert len(cur_run) == reservoir_size
    for j in cur_run:
        counts[j] += 1


test_tolerance = 0.5
target_dist = reservoir_size / large_number
final_dist = [e / nums_sims for e in counts]
assert all(target_dist * (1 - test_tolerance) < ele <= target_dist * (1 + test_tolerance) for ele in final_dist), f"final_distribution: {final_dist}"
final_dist

[0.0861,
 0.0935,
 0.0928,
 0.0917,
 0.0899,
 0.0936,
 0.0916,
 0.0936,
 0.0962,
 0.0839,
 0.0956,
 0.1043,
 0.1054,
 0.0984,
 0.0963,
 0.102,
 0.0965,
 0.098,
 0.1028,
 0.0976,
 0.0978,
 0.1,
 0.1045,
 0.1043,
 0.1018,
 0.1028,
 0.0964,
 0.1019,
 0.0969,
 0.098,
 0.103,
 0.0959,
 0.1037,
 0.1028,
 0.1061,
 0.0997,
 0.1016,
 0.1014,
 0.1027,
 0.1012,
 0.101,
 0.1033,
 0.1053,
 0.1007,
 0.0971,
 0.1061,
 0.1014,
 0.106,
 0.1029,
 0.1002,
 0.1021,
 0.1028,
 0.098,
 0.0973,
 0.1046,
 0.103,
 0.1024,
 0.0973,
 0.1014,
 0.0971,
 0.1023,
 0.101,
 0.1043,
 0.1033,
 0.1018,
 0.1009,
 0.0939,
 0.0945,
 0.0974,
 0.1059,
 0.097,
 0.0998,
 0.1,
 0.1017,
 0.1034,
 0.0986,
 0.1047,
 0.1006,
 0.1038,
 0.1,
 0.1015,
 0.0977,
 0.1006,
 0.1014,
 0.1022,
 0.0957,
 0.1031,
 0.0976,
 0.103,
 0.1045,
 0.1036,
 0.1014,
 0.1018,
 0.1,
 0.0976,
 0.1039,
 0.0987,
 0.0985,
 0.1052,
 0.1048]