In [2]:
import hashlib
import time
import random
import math
from sklearn.neural_network import MLPClassifier
import numpy as np
import pandas as pd
from IPython.display import display


In [3]:
class StandardBloomFilter:
    def __init__(self, n, fp_rate):
        self.size = self._get_size(n, fp_rate)
        self.hash_count = self._get_hash_count(self.size, n)
        self.bit_array = [0] * self.size

    def _hashes(self, item):
        return [hashlib.sha256(f"{item}{i}".encode()).hexdigest() for i in range(self.hash_count)]

    def _get_size(self, n, p):
        m = -(n * math.log(p)) / (math.log(2)**2)
        return int(m)

    def _get_hash_count(self, m, n):
        return int((m / n) * math.log(2))

    def add(self, item):
        for i in range(self.hash_count):
            idx = int(hashlib.md5(f"{item}{i}".encode()).hexdigest(), 16) % self.size
            self.bit_array[idx] = 1

    def query(self, item):
        for i in range(self.hash_count):
            idx = int(hashlib.md5(f"{item}{i}".encode()).hexdigest(), 16) % self.size
            if self.bit_array[idx] == 0:
                return False
        return True


In [4]:
class CountingBloomFilter:
    def __init__(self, n, fp_rate):
        self.size = self._get_size(n, fp_rate)
        self.hash_count = self._get_hash_count(self.size, n)
        self.count_array = [0] * self.size

    def _get_size(self, n, p):
        return int(-(n * math.log(p)) / (math.log(2)**2))

    def _get_hash_count(self, m, n):
        return int((m / n) * math.log(2))

    def add(self, item):
        for i in range(self.hash_count):
            idx = int(hashlib.md5(f"{item}{i}".encode()).hexdigest(), 16) % self.size
            self.count_array[idx] += 1

    def remove(self, item):
        for i in range(self.hash_count):
            idx = int(hashlib.md5(f"{item}{i}".encode()).hexdigest(), 16) % self.size
            self.count_array[idx] = max(0, self.count_array[idx] - 1)

    def query(self, item):
        for i in range(self.hash_count):
            idx = int(hashlib.md5(f"{item}{i}".encode()).hexdigest(), 16) % self.size
            if self.count_array[idx] == 0:
                return False
        return True


In [5]:

class NeuralNetworkBloomFilter:
    def __init__(self):
        self.model = MLPClassifier(hidden_layer_sizes=(10,), max_iter=200)
        self.set_members = set()

    def _featurize(self, url):
        return np.array([ord(c) for c in url[:50]] + [0] * (50 - len(url))).reshape(1, -1)

    def train(self, positives, negatives):
        X = [self._featurize(x).flatten() for x in positives + negatives]
        y = [1]*len(positives) + [0]*len(negatives)
        self.model.fit(X, y)
        self.set_members = set(positives)

    def add(self, item): pass  # Not used

    def query(self, item):
        x = self._featurize(item)
        pred = self.model.predict(x)[0]
        if pred == 1:
            return True
        return False


In [16]:
class SandwichBloomFilter:
    def __init__(self, positives, negatives, fp_rate_small=0.20):
        self.nn_filter = NeuralNetworkBloomFilter()
        self.nn_filter.train(positives, negatives)

        self.small_filter = StandardBloomFilter(len(positives), fp_rate_small)
        for item in positives:
            self.small_filter.add(item)

    def add(self, item):  # not used
        pass

    def query(self, item):
        if not self.nn_filter.query(item):
            return False
        return self.small_filter.query(item)


In [6]:
def evaluate(bloom_filter, positives, negatives):
    # Insert positives
    for url in positives:
        bloom_filter.add(url)

    # Evaluate FPR and query time
    start = time.time()
    false_positives = 0
    for url in negatives:
        if bloom_filter.query(url) and url not in positives:
            false_positives += 1
    elapsed = time.time() - start
    fpr = false_positives / len(negatives)
    avg_query_time = elapsed / len(negatives)
    return fpr, avg_query_time

In [17]:
# 🌐 URL Dataset
print("\U0001f310 URL Dataset Bloom Filter Evaluation\n")
with open("../datasets/urls/url_positives.txt") as f:
    positives = f.read().splitlines()
with open("../datasets/urls/url_negatives.txt") as f:
    negatives = f.read().splitlines()

filters = [
    ("Standard Bloom Filter", StandardBloomFilter(len(positives), 0.20)),
    ("Counting Bloom Filter", CountingBloomFilter(len(positives), 0.20)),
    ("Neural Network Bloom Filter", NeuralNetworkBloomFilter()),
    ("Sandwich Bloom Filter", SandwichBloomFilter(positives, negatives, 0.20))
]

filters[2][1].train(positives, negatives)

results_url = []
for name, bf in filters:
    fpr, t = evaluate(bf, positives, negatives)
    results_url.append((name, fpr, t))

display(pd.DataFrame(results_url, columns=["Filter Type", "False Positive Rate", "Average Query Time (s)"]))


🌐 URL Dataset Bloom Filter Evaluation



Unnamed: 0,Filter Type,False Positive Rate,Average Query Time (s)
0,Standard Bloom Filter,0.203775,8.5e-05
1,Counting Bloom Filter,0.203775,8.5e-05
2,Neural Network Bloom Filter,0.3727,0.000234
3,Sandwich Bloom Filter,0.063225,0.000102


In [9]:
# 🔐 Password Dataset
print("\U0001f510 Password Dataset Bloom Filter Evaluation\n")
with open("../datasets/passwords/password_positives.txt") as f:
    positives = f.read().splitlines()
with open("../datasets/passwords/password_negatives.txt") as f:
    negatives = f.read().splitlines()

filters_pw = [
    ("Standard Bloom Filter", StandardBloomFilter(len(positives), 0.0005)),
    ("Counting Bloom Filter", CountingBloomFilter(len(positives), 0.0005)),
    ("Neural Network Bloom Filter", NeuralNetworkBloomFilter()),
    ("Sandwich Bloom Filter", SandwichBloomFilter(positives, negatives, 0.20))
]

filters_pw[2][1].train(positives, negatives)

results_pw = []
for name, bf in filters_pw:
    fpr, t = evaluate(bf, positives, negatives)
    results_pw.append((name, fpr, t))

display(pd.DataFrame(results_pw, columns=["Filter Type", "False Positive Rate", "Average Query Time (s)"]))


🔐 Password Dataset Bloom Filter Evaluation



Unnamed: 0,Filter Type,False Positive Rate,Average Query Time (s)
0,Standard Bloom Filter,0.0002,2e-06
1,Counting Bloom Filter,0.0002,2e-06
2,Neural Network Bloom Filter,0.0004,7.4e-05


In [None]:
# 📏 IP Address Dataset
print("\U0001f4cf IP Address Dataset Bloom Filter Evaluation\n")
with open("../datasets/ip_addresses/ip_address_positives.txt") as f:
    positives = f.read().splitlines()
with open("../datasets/ip_addresses/ip_addresses_negatives.txt") as f:
    negatives = f.read().splitlines()

filters_ip = [
    ("Standard Bloom Filter", StandardBloomFilter(len(positives), 0.01)),
    ("Counting Bloom Filter", CountingBloomFilter(len(positives), 0.01)),
    ("Neural Network Bloom Filter", NeuralNetworkBloomFilter()),
    ("Sandwich Bloom Filter", SandwichBloomFilter(positives, negatives, 0.20))
]

filters_ip[2][1].train(positives, negatives)

results_ip = []
for name, bf in filters_ip:
    fpr, t = evaluate(bf, positives, negatives)
    results_ip.append((name, fpr, t))

display(pd.DataFrame(results_ip, columns=["Filter Type", "False Positive Rate", "Average Query Time (s)"]))


In [None]:
# 📧 Email Dataset
print("\U0001f4e7 Email Dataset Bloom Filter Evaluation\n")
with open("../datasets/emails/spam_email_positives.txt") as f:
    positives = f.read().splitlines()
with open("../datasets/emails/spam_email_negatives.txt") as f:
    negatives = f.read().splitlines()

filters_email = [
    ("Standard Bloom Filter", StandardBloomFilter(len(positives), 0.01)),
    ("Counting Bloom Filter", CountingBloomFilter(len(positives), 0.01)),
    ("Neural Network Bloom Filter", NeuralNetworkBloomFilter()),
    ("Sandwich Bloom Filter", SandwichBloomFilter(positives, negatives, 0.20))
]

filters_email[2][1].train(positives, negatives)

results_email = []
for name, bf in filters_email:
    fpr, t = evaluate(bf, positives, negatives)
    results_email.append((name, fpr, t))

display(pd.DataFrame(results_email, columns=["Filter Type", "False Positive Rate", "Average Query Time (s)"]))


In [None]:
# 📞 Phone Number Dataset
print("\U0001f4de Phone Number Dataset Bloom Filter Evaluation\n")
with open("../datasets/phone_numbers/phone_numbers_positives.txt") as f:
    positives = f.read().splitlines()
with open("../datasets/phone_numbers/phone_numbers_negatives.txt") as f:
    negatives = f.read().splitlines()

filters_phone = [
    ("Standard Bloom Filter", StandardBloomFilter(len(positives), 0.01)),
    ("Counting Bloom Filter", CountingBloomFilter(len(positives), 0.01)),
    ("Neural Network Bloom Filter", NeuralNetworkBloomFilter()),
    ("Sandwich Bloom Filter", SandwichBloomFilter(positives, negatives, 0.20))
]

filters_phone[2][1].train(positives, negatives)

results_phone = []
for name, bf in filters_phone:
    fpr, t = evaluate(bf, positives, negatives)
    results_phone.append((name, fpr, t))

display(pd.DataFrame(results_phone, columns=["Filter Type", "False Positive Rate", "Average Query Time (s)"]))
