In [18]:
import hashlib
import time
import random
import math
from sklearn.neural_network import MLPClassifier
import numpy as np
import pandas as pd
from IPython.display import display
import sys


In [19]:
class StandardBloomFilter:
    def __init__(self, n, fp_rate):
        self.size = self._get_size(n, fp_rate)
        self.hash_count = self._get_hash_count(self.size, n)
        self.bit_array = [0] * self.size

    def _hashes(self, item):
        return [hashlib.sha256(f"{item}{i}".encode()).hexdigest() for i in range(self.hash_count)]

    def _get_size(self, n, p):
        m = -(n * math.log(p)) / (math.log(2)**2)
        return int(m)

    def _get_hash_count(self, m, n):
        return int((m / n) * math.log(2))

    def add(self, item):
        for i in range(self.hash_count):
            idx = int(hashlib.md5(f"{item}{i}".encode()).hexdigest(), 16) % self.size
            self.bit_array[idx] = 1

    def query(self, item):
        for i in range(self.hash_count):
            idx = int(hashlib.md5(f"{item}{i}".encode()).hexdigest(), 16) % self.size
            if self.bit_array[idx] == 0:
                return False
        return True


In [20]:
class CountingBloomFilter:
    def __init__(self, n, fp_rate):
        self.size = self._get_size(n, fp_rate)
        self.hash_count = self._get_hash_count(self.size, n)
        self.count_array = [0] * self.size

    def _get_size(self, n, p):
        return int(-(n * math.log(p)) / (math.log(2)**2))

    def _get_hash_count(self, m, n):
        return int((m / n) * math.log(2))

    def add(self, item):
        for i in range(self.hash_count):
            idx = int(hashlib.md5(f"{item}{i}".encode()).hexdigest(), 16) % self.size
            self.count_array[idx] += 1

    def remove(self, item):
        for i in range(self.hash_count):
            idx = int(hashlib.md5(f"{item}{i}".encode()).hexdigest(), 16) % self.size
            self.count_array[idx] = max(0, self.count_array[idx] - 1)

    def query(self, item):
        for i in range(self.hash_count):
            idx = int(hashlib.md5(f"{item}{i}".encode()).hexdigest(), 16) % self.size
            if self.count_array[idx] == 0:
                return False
        return True


In [21]:

class NeuralNetworkBloomFilter:
    def __init__(self):
        self.model = MLPClassifier(hidden_layer_sizes=(10,), max_iter=200)
        self.set_members = set()

    def _featurize(self, url):
        return np.array([ord(c) for c in url[:50]] + [0] * (50 - len(url))).reshape(1, -1)

    def train(self, positives, negatives):
        X = [self._featurize(x).flatten() for x in positives + negatives]
        y = [1]*len(positives) + [0]*len(negatives)
        self.model.fit(X, y)
        self.set_members = set(positives)

    def add(self, item): pass  # Not used

    def query(self, item):
        x = self._featurize(item)
        pred = self.model.predict(x)[0]
        if pred == 1:
            return True
        return False


In [22]:
class SandwichBloomFilter:
    def __init__(self, positives, negatives, fp_rate_small=0.20):
        self.nn_filter = NeuralNetworkBloomFilter()
        self.nn_filter.train(positives, negatives)

        self.small_filter = StandardBloomFilter(len(positives), fp_rate_small)
        for item in positives:
            self.small_filter.add(item)

    def add(self, item):  # not used
        pass

    def query(self, item):
        if not self.nn_filter.query(item):
            return False
        return self.small_filter.query(item)


In [23]:
def get_memory_usage(bf):
    """Estimate memory used by each filter."""
    size = sys.getsizeof(bf)
    if hasattr(bf, 'bit_array'):
        size += sys.getsizeof(bf.bit_array)
    if hasattr(bf, 'count_array'):
        size += sys.getsizeof(bf.count_array)
    if isinstance(bf, NeuralNetworkBloomFilter):
        # account for learned parameters
        for coef in bf.model.coefs_:
            size += coef.nbytes
        for intercept in bf.model.intercepts_:
            size += intercept.nbytes
    if hasattr(bf, 'nn_filter') and hasattr(bf, 'small_filter'):
        # Sandwich filter: sum its two parts
        size = get_memory_usage(bf.nn_filter) + get_memory_usage(bf.small_filter)
    return size

In [24]:
def evaluate(bf, positives, negatives):
    """Run insertions and then measure FP, FN (for NN), timing, throughput, memory."""
    # — insert positives —
    for url in positives:
        bf.add(url)

    # — measure false positives —
    start = time.time()
    false_positives = sum(1 for url in negatives if bf.query(url) and url not in positives)
    elapsed = time.time() - start

    # — measure false negatives (only makes sense for NN-based filters) —
    false_negatives = 0
    if isinstance(bf, NeuralNetworkBloomFilter):
        for url in positives:
            if not bf.query(url):
                false_negatives += 1
    fnr = false_negatives / len(positives) if positives else 0.0

    # — compute metrics —
    fpr = false_positives / len(negatives)
    avg_query_time = elapsed / len(negatives)
    throughput = len(negatives) / elapsed if elapsed > 0 else float('inf')
    mem_bytes = get_memory_usage(bf)

    # — print a summary —
    print(f"\n=== {bf.__class__.__name__} ===")
    print(f"Memory Usage:             {mem_bytes:,} bytes")
    print(f"False‐Positive Rate:      {fpr:.4%}")
    if isinstance(bf, NeuralNetworkBloomFilter):
        print(f"False‐Negative Rate:      {fnr:.4%}")
    print(f"Avg Query Time:           {avg_query_time:.6f} s")
    print(f"Throughput:               {throughput:,.0f} queries/s")

    return {
        'fpr': fpr,
        'fnr': fnr if isinstance(bf, NeuralNetworkBloomFilter) else None,
        'avg_time': avg_query_time,
        'throughput': throughput,
        'memory_bytes': mem_bytes
    }

In [25]:
# 🌐 URL Dataset
print("\U0001f310 URL Dataset Bloom Filter Evaluation\n")
with open("../datasets/urls/url_positives.txt") as f:
    positives = f.read().splitlines()
with open("../datasets/urls/url_negatives.txt") as f:
    negatives = f.read().splitlines()

filters = [
    ("Standard Bloom Filter", StandardBloomFilter(len(positives), 0.20)),
    ("Counting Bloom Filter", CountingBloomFilter(len(positives), 0.20)),
    ("Neural Network Bloom Filter", NeuralNetworkBloomFilter()),
    ("Sandwich Bloom Filter", SandwichBloomFilter(positives, negatives, 0.20))
]

filters[2][1].train(positives, negatives)

results_url = []
for name, bf in filters:
    m = evaluate(bf, positives, negatives)
    results_url.append({
        "Filter Type":      name,
        "Memory (bytes)":   m["memory_bytes"],
        "False Pos Rate":   m["fpr"],
        "False Neg Rate":   m["fnr"],            
        "Avg Query Time (s)": m["avg_time"],
        "Throughput (q/s)": m["throughput"],
    })

df_url = pd.DataFrame(results_url)
display(df_url)

🌐 URL Dataset Bloom Filter Evaluation


=== StandardBloomFilter ===
Memory Usage:             1,194,416 bytes
False‐Positive Rate:      20.3775%
Avg Query Time:           0.000085 s
Throughput:               11,711 queries/s

=== CountingBloomFilter ===
Memory Usage:             1,194,416 bytes
False‐Positive Rate:      20.3775%
Avg Query Time:           0.000085 s
Throughput:               11,713 queries/s

=== NeuralNetworkBloomFilter ===
Memory Usage:             4,224 bytes
False‐Positive Rate:      22.5250%
False‐Negative Rate:      27.7947%
Avg Query Time:           0.000171 s
Throughput:               5,850 queries/s

=== SandwichBloomFilter ===
Memory Usage:             1,198,640 bytes
False‐Positive Rate:      4.4975%
Avg Query Time:           0.000096 s
Throughput:               10,401 queries/s


Unnamed: 0,Filter Type,Memory (bytes),False Pos Rate,False Neg Rate,Avg Query Time (s),Throughput (q/s)
0,Standard Bloom Filter,1194416,0.203775,,8.5e-05,11710.698311
1,Counting Bloom Filter,1194416,0.203775,,8.5e-05,11712.547609
2,Neural Network Bloom Filter,4224,0.22525,0.277947,0.000171,5850.059708
3,Sandwich Bloom Filter,1198640,0.044975,,9.6e-05,10400.780424


In [27]:
# 🔐 Password Dataset
print("\U0001f510 Password Dataset Bloom Filter Evaluation\n")
with open("../datasets/passwords/password_positives.txt") as f:
    positives = f.read().splitlines()
with open("../datasets/passwords/password_negatives.txt") as f:
    negatives = f.read().splitlines()

filters_pw = [
    ("Standard Bloom Filter", StandardBloomFilter(len(positives), 0.0005)),
    ("Counting Bloom Filter", CountingBloomFilter(len(positives), 0.0005)),
    ("Neural Network Bloom Filter", NeuralNetworkBloomFilter()),
    ("Sandwich Bloom Filter", SandwichBloomFilter(positives, negatives, 0.20))
]

filters_pw[2][1].train(positives, negatives)

results_pw = []
for name, bf in filters_pw:
    m = evaluate(bf, positives, negatives)
    results_pw.append({
        "Filter Type":      name,
        "Memory (bytes)":   m["memory_bytes"],
        "False Pos Rate":   m["fpr"],
        "False Neg Rate":   m["fnr"],
        "Avg Query Time (s)": m["avg_time"],
        "Throughput (q/s)": m["throughput"],
    })

df_pw = pd.DataFrame(results_pw)
display(df_pw)


🔐 Password Dataset Bloom Filter Evaluation


=== StandardBloomFilter ===
Memory Usage:             1,265,728 bytes
False‐Positive Rate:      0.0200%
Avg Query Time:           0.000002 s
Throughput:               458,194 queries/s

=== CountingBloomFilter ===
Memory Usage:             1,265,728 bytes
False‐Positive Rate:      0.0200%
Avg Query Time:           0.000002 s
Throughput:               460,533 queries/s

=== NeuralNetworkBloomFilter ===
Memory Usage:             4,224 bytes
False‐Positive Rate:      0.5400%
False‐Negative Rate:      0.0000%
Avg Query Time:           0.000079 s
Throughput:               12,601 queries/s

=== SandwichBloomFilter ===
Memory Usage:             272,320 bytes
False‐Positive Rate:      0.0200%
Avg Query Time:           0.000077 s
Throughput:               13,038 queries/s


Unnamed: 0,Filter Type,Memory (bytes),False Pos Rate,False Neg Rate,Avg Query Time (s),Throughput (q/s)
0,Standard Bloom Filter,1265728,0.0002,,2e-06,458193.576579
1,Counting Bloom Filter,1265728,0.0002,,2e-06,460532.967335
2,Neural Network Bloom Filter,4224,0.0054,0.0,7.9e-05,12601.126929
3,Sandwich Bloom Filter,272320,0.0002,,7.7e-05,13037.863015


In [28]:
# 📏 IP Address Dataset
print("\U0001f4cf IP Address Dataset Bloom Filter Evaluation\n")
with open("../datasets/ip_addresses/ip_address_positives.txt") as f:
    positives = f.read().splitlines()
with open("../datasets/ip_addresses/ip_addresses_negatives.txt") as f:
    negatives = f.read().splitlines()

filters_ip = [
    ("Standard Bloom Filter", StandardBloomFilter(len(positives), 0.01)),
    ("Counting Bloom Filter", CountingBloomFilter(len(positives), 0.01)),
    ("Neural Network Bloom Filter", NeuralNetworkBloomFilter()),
    ("Sandwich Bloom Filter", SandwichBloomFilter(positives, negatives, 0.20))
]

filters_ip[2][1].train(positives, negatives)

results_ip = []
for name, bf in filters_ip:
    m = evaluate(bf, positives, negatives)
    results_ip.append({
        "Filter Type":      name,
        "Memory (bytes)":   m["memory_bytes"],
        "False Pos Rate":   m["fpr"],
        "False Neg Rate":   m["fnr"],
        "Avg Query Time (s)": m["avg_time"],
        "Throughput (q/s)": m["throughput"],
    })

df_ip = pd.DataFrame(results_ip)
display(df_ip)


📏 IP Address Dataset Bloom Filter Evaluation


=== StandardBloomFilter ===
Memory Usage:             3,067,328 bytes
False‐Positive Rate:      0.9625%
Avg Query Time:           0.000007 s
Throughput:               142,810 queries/s

=== CountingBloomFilter ===
Memory Usage:             3,067,328 bytes
False‐Positive Rate:      0.9625%
Avg Query Time:           0.000007 s
Throughput:               144,682 queries/s

=== NeuralNetworkBloomFilter ===
Memory Usage:             4,224 bytes
False‐Positive Rate:      5.9650%
False‐Negative Rate:      32.4825%
Avg Query Time:           0.000107 s
Throughput:               9,371 queries/s

=== SandwichBloomFilter ===
Memory Usage:             1,076,280 bytes
False‐Positive Rate:      1.9700%
Avg Query Time:           0.000086 s
Throughput:               11,652 queries/s


Unnamed: 0,Filter Type,Memory (bytes),False Pos Rate,False Neg Rate,Avg Query Time (s),Throughput (q/s)
0,Standard Bloom Filter,3067328,0.009625,,7e-06,142810.219009
1,Counting Bloom Filter,3067328,0.009625,,7e-06,144681.686278
2,Neural Network Bloom Filter,4224,0.05965,0.324825,0.000107,9371.197765
3,Sandwich Bloom Filter,1076280,0.0197,,8.6e-05,11651.694856


In [29]:
# 📧 Email Dataset
print("\U0001f4e7 Email Dataset Bloom Filter Evaluation\n")
with open("../datasets/emails/spam_email_positives.txt") as f:
    positives = f.read().splitlines()
with open("../datasets/emails/spam_email_negatives.txt") as f:
    negatives = f.read().splitlines()

filters_email = [
    ("Standard Bloom Filter", StandardBloomFilter(len(positives), 0.01)),
    ("Counting Bloom Filter", CountingBloomFilter(len(positives), 0.01)),
    ("Neural Network Bloom Filter", NeuralNetworkBloomFilter()),
    ("Sandwich Bloom Filter", SandwichBloomFilter(positives, negatives, 0.20))
]

filters_email[2][1].train(positives, negatives)

results_email = []
for name, bf in filters_email:
    m = evaluate(bf, positives, negatives)
    results_email.append({
        "Filter Type":      name,
        "Memory (bytes)":   m["memory_bytes"],
        "False Pos Rate":   m["fpr"],
        "False Neg Rate":   m["fnr"],
        "Avg Query Time (s)": m["avg_time"],
        "Throughput (q/s)": m["throughput"],
    })

df_email = pd.DataFrame(results_email)
display(df_email)


📧 Email Dataset Bloom Filter Evaluation


=== StandardBloomFilter ===
Memory Usage:             52,712 bytes
False‐Positive Rate:      1.6012%
Avg Query Time:           0.000003 s
Throughput:               394,562 queries/s

=== CountingBloomFilter ===
Memory Usage:             52,712 bytes
False‐Positive Rate:      1.6012%
Avg Query Time:           0.000002 s
Throughput:               416,942 queries/s

=== NeuralNetworkBloomFilter ===
Memory Usage:             4,224 bytes
False‐Positive Rate:      4.2213%
False‐Negative Rate:      17.9300%
Avg Query Time:           0.000076 s
Throughput:               13,153 queries/s

=== SandwichBloomFilter ===
Memory Usage:             22,712 bytes
False‐Positive Rate:      1.0189%
Avg Query Time:           0.000075 s
Throughput:               13,258 queries/s




Unnamed: 0,Filter Type,Memory (bytes),False Pos Rate,False Neg Rate,Avg Query Time (s),Throughput (q/s)
0,Standard Bloom Filter,52712,0.016012,,3e-06,394562.076955
1,Counting Bloom Filter,52712,0.016012,,2e-06,416942.099262
2,Neural Network Bloom Filter,4224,0.042213,0.1793,7.6e-05,13153.089829
3,Sandwich Bloom Filter,22712,0.010189,,7.5e-05,13257.968381


In [30]:
# 📞 Phone Number Dataset
print("\U0001f4de Phone Number Dataset Bloom Filter Evaluation\n")
with open("../datasets/phone_numbers/phone_numbers_positives.txt") as f:
    positives = f.read().splitlines()
with open("../datasets/phone_numbers/phone_numbers_negatives.txt") as f:
    negatives = f.read().splitlines()

filters_phone = [
    ("Standard Bloom Filter", StandardBloomFilter(len(positives), 0.01)),
    ("Counting Bloom Filter", CountingBloomFilter(len(positives), 0.01)),
    ("Neural Network Bloom Filter", NeuralNetworkBloomFilter()),
    ("Sandwich Bloom Filter", SandwichBloomFilter(positives, negatives, 0.20))
]

filters_phone[2][1].train(positives, negatives)

results_phone = []
for name, bf in filters_phone:
    m = evaluate(bf, positives, negatives)
    results_phone.append({
        "Filter Type":      name,
        "Memory (bytes)":   m["memory_bytes"],
        "False Pos Rate":   m["fpr"],
        "False Neg Rate":   m["fnr"],
        "Avg Query Time (s)": m["avg_time"],
        "Throughput (q/s)": m["throughput"],
    })

df_phone = pd.DataFrame(results_phone)
display(df_phone)


📞 Phone Number Dataset Bloom Filter Evaluation






=== StandardBloomFilter ===
Memory Usage:             62,528 bytes
False‐Positive Rate:      1.5000%
Avg Query Time:           0.000002 s
Throughput:               453,536 queries/s

=== CountingBloomFilter ===
Memory Usage:             62,528 bytes
False‐Positive Rate:      1.5000%
Avg Query Time:           0.000003 s
Throughput:               399,191 queries/s

=== NeuralNetworkBloomFilter ===
Memory Usage:             4,224 bytes
False‐Positive Rate:      27.8000%
False‐Negative Rate:      34.8894%
Avg Query Time:           0.000080 s
Throughput:               12,529 queries/s

=== SandwichBloomFilter ===
Memory Usage:             26,144 bytes
False‐Positive Rate:      6.4000%
Avg Query Time:           0.000079 s
Throughput:               12,634 queries/s


Unnamed: 0,Filter Type,Memory (bytes),False Pos Rate,False Neg Rate,Avg Query Time (s),Throughput (q/s)
0,Standard Bloom Filter,62528,0.015,,2e-06,453536.33218
1,Counting Bloom Filter,62528,0.015,,3e-06,399191.396212
2,Neural Network Bloom Filter,4224,0.278,0.348894,8e-05,12528.987239
3,Sandwich Bloom Filter,26144,0.064,,7.9e-05,12633.90243
