In [1]:
import math
import random
import time
import mmh3
from collections import defaultdict

# Generate stream with variable frequency
def generate_stream(num_words=1000000):
    """Return a list of ``num_words`` words sampled with skewed frequencies.

    Words are drawn independently (with replacement) from a fixed
    eight-word vocabulary using the module-level ``random`` generator, so
    seeding ``random`` makes the stream reproducible.

    Args:
        num_words: Number of words to draw.

    Returns:
        A list of ``num_words`` strings; empty when ``num_words`` is 0.
    """
    words = ["apple", "banana", "cherry", "date", "elderberry", "fig", "grape", "honeydew"]
    # Relative weights, positionally matched to ``words``.
    weights = [50, 30, 10, 5, 3, 1, 1, 1]
    # A single call with k=num_words draws every sample in one pass instead
    # of re-deriving the cumulative weights once per word.
    return random.choices(words, weights=weights, k=num_words)

# Approximately Counting Bloom Filter Implementation
class ApproxCountingBloomFilter:
    """Bloom-filter-shaped table of probabilistic, log-scale counters.

    Every item is mapped to ``num_hashes`` slots using ``mmh3.hash`` with
    seeds ``0 .. num_hashes - 1``.  A slot whose smallest sibling holds
    ``c`` is only bumped with probability ``1 / base ** c``, so the stored
    values grow roughly with the logarithm of how often an item was added.

    Args:
        size: Number of counter slots; must be positive.
        num_hashes: Number of hash seeds used per item; must be positive.
        base: Growth base of the probabilistic counters; must be greater
            than 1.  Defaults to 1.1.

    Raises:
        ValueError: If ``size`` or ``num_hashes`` is not positive, or if
            ``base`` is not greater than 1.
    """

    def __init__(self, size: int, num_hashes: int, *, base: float = 1.1):
        # Fail fast: a zero size would only surface later as a modulo-by-zero
        # in add(), and zero hashes would make every estimate infinite.
        if size <= 0:
            raise ValueError(f"size must be positive, got {size!r}")
        if num_hashes <= 0:
            raise ValueError(f"num_hashes must be positive, got {num_hashes!r}")
        if base <= 1:
            raise ValueError(f"base must be greater than 1, got {base!r}")
        self.size = size
        self.num_hashes = num_hashes
        self.base = base
        # One integer counter per slot (a plain list despite the name, which
        # is kept so existing attribute access keeps working).
        self.byte_array = [0] * size

    def _indices(self, item):
        """Return the slot index for ``item`` under each hash seed."""
        return [mmh3.hash(item, seed) % self.size for seed in range(self.num_hashes)]

    def add(self, item):
        """Record one occurrence of ``item``.

        With probability ``1 / base ** c`` (``c`` being the smallest counter
        among the item's slots), every slot of the item that is below
        ``c + 1`` is raised to ``c + 1``.  Otherwise nothing changes.

        Args:
            item: Key to record; passed unchanged to ``mmh3.hash``.
        """
        counters = self.byte_array
        indices = self._indices(item)
        min_count = min(counters[i] for i in indices)

        # Probabilistically increment the minimum value
        if random.random() <= 1 / (self.base ** min_count):
            bumped = min_count + 1
            for i in indices:
                # Never lower a slot that another item already pushed higher.
                if counters[i] < bumped:
                    counters[i] = bumped

    def estimate_count(self, item):
        """Return the smallest stored counter among the slots of ``item``.

        This is the raw log-scale counter, not a number of occurrences; use
        ``estimate_frequency`` for a value on the scale of actual counts.

        Args:
            item: Key to look up; passed unchanged to ``mmh3.hash``.

        Returns:
            The minimum counter value over the item's slots (0 if any of
            them was never raised).
        """
        counters = self.byte_array
        return min(counters[i] for i in self._indices(item))

    def _to_frequency(self, counter) -> float:
        """Convert a stored counter into an approximate occurrence count."""
        # For a counter bumped with probability 1 / base ** c, the value
        # (base ** c - 1) / (base - 1) estimates the number of additions.
        return (self.base ** counter - 1) / (self.base - 1)

    def estimate_frequency(self, item) -> float:
        """Return an approximate number of times ``item`` was added.

        The result is derived from ``estimate_count`` and is therefore
        randomised; hash collisions with other items can only inflate it.

        Args:
            item: Key to look up; passed unchanged to ``mmh3.hash``.

        Returns:
            The approximate occurrence count as a float (0.0 for a counter
            of 0).
        """
        return self._to_frequency(self.estimate_count(item))

# Parameters for Approx Counting Bloom Filter
n = 1000000  # Number of elements to add
p = 0.01  # Desired false positive rate

# Calculating size of byte array (m) and number of hash functions (k)
# using the usual Bloom-filter sizing formulas:
#   m = -n * ln(p) / ln(2)^2      k = (m / n) * ln(2)
# Both are truncated (not rounded) to integers below.
m = - (n * math.log(p)) / (math.log(2) ** 2)
k = (m / n) * math.log(2)
m = int(m)
k = int(k)

# Generate the data stream
stream = generate_stream(n)

# Create Approximately Counting Bloom Filter
approx_bloom_filter = ApproxCountingBloomFilter(size=m, num_hashes=k)

# Adding elements to the Bloom Filter
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump when the system clock is adjusted.
start_time = time.perf_counter()
for word in stream:
    approx_bloom_filter.add(word)
end_time = time.perf_counter()
print(f"Time taken to add {n} elements to Approx Counting Bloom Filter: {end_time - start_time:.2f} seconds")

# Estimate the count of the next 100,000 elements
next_stream = generate_stream(100000)
# word -> last value returned by estimate_count(); words never looked up
# read back as 0 because of the defaultdict.
counts = defaultdict(int)

start_time = time.perf_counter()
for word in next_stream:
    counts[word] = approx_bloom_filter.estimate_count(word)
end_time = time.perf_counter()

# Print estimated counts for some words
# NOTE: estimate_count() returns the stored counter c, which add() bumps with
# probability 1 / 1.1**c, so these numbers are on a log scale; roughly
# (1.1**c - 1) / 0.1 occurrences correspond to a counter value of c.
print("\nEstimated counts for some words:")
for word in ["apple", "banana", "cherry", "date"]:
    print(f"{word}: {counts[word]}")

print(f"Time taken to estimate counts for 100,000 elements: {end_time - start_time:.2f} seconds")


Time taken to add 1000000 elements to Approx Counting Bloom Filter: 0.87 seconds

Estimated counts for some words:
apple: 114
banana: 108
cherry: 95
date: 90
Time taken to estimate counts for 100,000 elements: 0.07 seconds
