In [None]:
import hashlib
import time
import random
import math
from sklearn.neural_network import MLPClassifier
import numpy as np
import pandas as pd
from IPython.display import display


In [3]:
class StandardBloomFilter:
    def __init__(self, n, fp_rate):
        self.size = self._get_size(n, fp_rate)
        self.hash_count = self._get_hash_count(self.size, n)
        self.bit_array = [0] * self.size

    def _hashes(self, item):
        return [hashlib.sha256(f"{item}{i}".encode()).hexdigest() for i in range(self.hash_count)]

    def _get_size(self, n, p):
        m = -(n * math.log(p)) / (math.log(2)**2)
        return int(m)

    def _get_hash_count(self, m, n):
        return int((m / n) * math.log(2))

    def add(self, item):
        for i in range(self.hash_count):
            idx = int(hashlib.md5(f"{item}{i}".encode()).hexdigest(), 16) % self.size
            self.bit_array[idx] = 1

    def query(self, item):
        for i in range(self.hash_count):
            idx = int(hashlib.md5(f"{item}{i}".encode()).hexdigest(), 16) % self.size
            if self.bit_array[idx] == 0:
                return False
        return True


In [4]:
class CountingBloomFilter:
    def __init__(self, n, fp_rate):
        self.size = self._get_size(n, fp_rate)
        self.hash_count = self._get_hash_count(self.size, n)
        self.count_array = [0] * self.size

    def _get_size(self, n, p):
        return int(-(n * math.log(p)) / (math.log(2)**2))

    def _get_hash_count(self, m, n):
        return int((m / n) * math.log(2))

    def add(self, item):
        for i in range(self.hash_count):
            idx = int(hashlib.md5(f"{item}{i}".encode()).hexdigest(), 16) % self.size
            self.count_array[idx] += 1

    def remove(self, item):
        for i in range(self.hash_count):
            idx = int(hashlib.md5(f"{item}{i}".encode()).hexdigest(), 16) % self.size
            self.count_array[idx] = max(0, self.count_array[idx] - 1)

    def query(self, item):
        for i in range(self.hash_count):
            idx = int(hashlib.md5(f"{item}{i}".encode()).hexdigest(), 16) % self.size
            if self.count_array[idx] == 0:
                return False
        return True


In [5]:

class NeuralNetworkBloomFilter:
    def __init__(self):
        self.model = MLPClassifier(hidden_layer_sizes=(10,), max_iter=200)
        self.set_members = set()

    def _featurize(self, url):
        return np.array([ord(c) for c in url[:50]] + [0] * (50 - len(url))).reshape(1, -1)

    def train(self, positives, negatives):
        X = [self._featurize(x).flatten() for x in positives + negatives]
        y = [1]*len(positives) + [0]*len(negatives)
        self.model.fit(X, y)
        self.set_members = set(positives)

    def add(self, item): pass  # Not used

    def query(self, item):
        x = self._featurize(item)
        pred = self.model.predict(x)[0]
        if pred == 1:
            return True
        return False


In [18]:
def evaluate(bloom_filter, positives, negatives):
    # Insert positives
    for url in positives:
        bloom_filter.add(url)

    # Evaluate FPR and query time
    start = time.time()
    false_positives = 0
    for url in negatives:
        if bloom_filter.query(url) and url not in positives:
            false_positives += 1
    elapsed = time.time() - start
    fpr = false_positives / len(negatives)
    avg_query_time = elapsed / len(negatives)
    return fpr, avg_query_time

In [19]:
print("🌐 URL Dataset Bloom Filter Evaluation\n")

# Load data
with open("../datasets/urls/url_positives.txt") as f:
    positives = f.read().splitlines()

with open("../datasets/urls/url_negatives.txt") as f:
    negatives = f.read().splitlines()

# Evaluate Standard Bloom Filter
sbf = StandardBloomFilter(n=len(positives), fp_rate=0.20)
fpr_sbf, time_sbf = evaluate(sbf, positives, negatives)

# Evaluate Counting Bloom Filter
cbf = CountingBloomFilter(n=len(positives), fp_rate=0.20)
fpr_cbf, time_cbf = evaluate(cbf, positives, negatives)

# Evaluate Neural Network Bloom Filter
nn_bf = NeuralNetworkBloomFilter()
nn_bf.train(positives, negatives)
fpr_nn, time_nn = evaluate(nn_bf, positives, negatives)

# Format results in a table
results_url = pd.DataFrame({
    "Filter Type": ["Standard Bloom Filter", "Counting Bloom Filter", "Neural Network Bloom Filter"],
    "False Positive Rate": [fpr_sbf, fpr_cbf, fpr_nn],
    "Average Query Time (s)": [time_sbf, time_cbf, time_nn]
})

display(results_url)


🌐 URL Dataset Bloom Filter Evaluation



Unnamed: 0,Filter Type,False Positive Rate,Average Query Time (s)
0,Standard Bloom Filter,0.203775,8.8e-05
1,Counting Bloom Filter,0.203775,8.8e-05
2,Neural Network Bloom Filter,0.334175,0.000223


In [22]:
print("🔐 Password Dataset Bloom Filter Evaluation\n")

# Load password dataset
with open("../datasets/passwords/password_positives.txt") as f:
    pw_positives = f.read().splitlines()

with open("../datasets/passwords/password_negatives.txt") as f:
    pw_negatives = f.read().splitlines()

# Standard Bloom Filter
sbf_pw = StandardBloomFilter(n=len(pw_positives), fp_rate=0.0005)
fpr_sbf_pw, time_sbf_pw = evaluate(sbf_pw, pw_positives, pw_negatives)

# Counting Bloom Filter
cbf_pw = CountingBloomFilter(n=len(pw_positives), fp_rate=0.0005)
fpr_cbf_pw, time_cbf_pw = evaluate(cbf_pw, pw_positives, pw_negatives)

# Neural Network Bloom Filter
nn_bf_pw = NeuralNetworkBloomFilter()
nn_bf_pw.train(pw_positives, pw_negatives)
fpr_nn_pw, time_nn_pw = evaluate(nn_bf_pw, pw_positives, pw_negatives)

# Format results in a table
results_pw = pd.DataFrame({
    "Filter Type": ["Standard Bloom Filter", "Counting Bloom Filter", "Neural Network Bloom Filter"],
    "False Positive Rate": [fpr_sbf_pw, fpr_cbf_pw, fpr_nn_pw],
    "Average Query Time (s)": [time_sbf_pw, time_cbf_pw, time_nn_pw]
})

display(results_pw)


🔐 Password Dataset Bloom Filter Evaluation



Unnamed: 0,Filter Type,False Positive Rate,Average Query Time (s)
0,Standard Bloom Filter,0.0002,2e-06
1,Counting Bloom Filter,0.0002,2e-06
2,Neural Network Bloom Filter,0.001,8.1e-05
