In [3]:
!pip install mmh3
!pip install bitarray

import math
import random
import time
import mmh3
from bitarray import bitarray

# Generate stream with variable frequency
def generate_stream(num_words=1000000):
    words = ["apple", "banana", "cherry", "date", "elderberry", "fig", "grape", "honeydew"]
    stream = []
    for _ in range(num_words):
        word = random.choices(words, weights=[50, 30, 10, 5, 3, 1, 1, 1])[0]
        stream.append(word)
    return stream

# Bloom Filter Implementation
class BloomFilter:
    def __init__(self, size, num_hashes):
        self.size = size
        self.num_hashes = num_hashes
        self.bit_array = bitarray(size)
        self.bit_array.setall(0)

    def add(self, item):
        for i in range(self.num_hashes):
            index = mmh3.hash(item, i) % self.size
            self.bit_array[index] = 1

    def check(self, item):
        for i in range(self.num_hashes):
            index = mmh3.hash(item, i) % self.size
            if not self.bit_array[index]:
                return False
        return True

# Parameters for Bloom Filter
n = 1000000  # Number of elements to add
p = 0.01  # Desired false positive rate

# Calculating size of bit array (m) and number of hash functions (k)
m = - (n * math.log(p)) / (math.log(2) ** 2)
k = (m / n) * math.log(2)
m = int(m)
k = int(k)

# Generate the data stream
stream = generate_stream(n)

# Create Bloom Filter
bloom_filter = BloomFilter(size=m, num_hashes=k)

# Adding elements to the Bloom Filter
start_time = time.time()
for word in stream:
    bloom_filter.add(word)
end_time = time.time()
print(f"Time taken to add {n} elements to Bloom Filter: {end_time - start_time:.2f} seconds")

# Check false positive rate with next 100,000 elements
next_stream = generate_stream(100000)
false_positives = 0
true_negatives = 0

start_time = time.time()
for word in next_stream:
    if bloom_filter.check(word):
        false_positives += 1
    else:
        true_negatives += 1
end_time = time.time()

false_positive_rate = (false_positives / (false_positives + true_negatives)) * 100
print(f"False positive rate after checking 100,000 elements: {false_positive_rate:.2f}%")
print(f"Time taken to check 100,000 elements: {end_time - start_time:.2f} seconds")


Collecting bitarray
  Downloading bitarray-3.0.0-cp312-cp312-win_amd64.whl.metadata (33 kB)
Downloading bitarray-3.0.0-cp312-cp312-win_amd64.whl (121 kB)
Installing collected packages: bitarray
Successfully installed bitarray-3.0.0
Time taken to add 1000000 elements to Bloom Filter: 0.58 seconds
False positive rate after checking 100,000 elements: 100.00%
Time taken to check 100,000 elements: 0.07 seconds
