In [13]:
import hashlib
import time
import random
import math
from sklearn.neural_network import MLPClassifier
import numpy as np
import pandas as pd


In [None]:
# Build the positive / negative URL sets and a mixed query workload.
#
# Reproducibility: all sampling goes through a private, seeded RNG and every
# set is sorted before use, so the files written below are identical across
# kernel restarts (plain set iteration order varies with string-hash
# randomisation between Python processes).
QUERY_SEED = 42             # seed for the query sample / shuffle
N_MAJESTIC_ROWS = 40000     # leading rows of the Majestic list used as negative candidates
N_QUERY_PER_CLASS = 10000   # requested positives and negatives in the query file

query_rng = random.Random(QUERY_SEED)  # leaves the global `random` state untouched

# Load positives from file (deduplicated; blank lines are not URLs and are dropped)
with open("datasets/url_positives.txt", "r") as f:
    positives = sorted({line for line in f.read().splitlines() if line.strip()})

# Load Majestic negatives
df = pd.read_csv("datasets/majestic_million.csv")
# dropna/astype(str): a missing domain is read as NaN (a float) and would
# break the string concatenation when the file is written below.
majestic_domains = df["Domain"].head(N_MAJESTIC_ROWS).dropna().astype(str)

# Remove overlaps
negatives = sorted(set(majestic_domains) - set(positives))

# Save negatives
with open("datasets/url_negatives.txt", "w") as f:
    f.writelines(url + "\n" for url in negatives)

# Create queries: 10k positives + 10k negatives.
# The sample size is capped at the population size because random.sample
# raises ValueError when asked for more items than are available.
query_positives = query_rng.sample(positives, min(N_QUERY_PER_CLASS, len(positives)))
query_negatives = query_rng.sample(negatives, min(N_QUERY_PER_CLASS, len(negatives)))
queries = query_positives + query_negatives
query_rng.shuffle(queries)

with open("datasets/url_queries.txt", "w") as f:
    f.writelines(url + "\n" for url in queries)


In [5]:


class StandardBloomFilter:
    """Bloom filter backed by a list of 0/1 flags.

    The number of bits ``m`` and the number of hash functions ``k`` are derived
    from the expected item count ``n`` and the target false-positive rate ``p``:

        m = -n * ln(p) / (ln 2)^2        k = (m / n) * ln 2

    Both values are truncated to integers and clamped to at least 1. Without
    the clamp a loose target such as ``fp_rate=0.5`` truncates ``k`` to 0, and
    a filter with zero hash functions answers True for every query.

    Args:
        n: Expected number of items to be inserted; must be > 0.
        fp_rate: Target false-positive rate; must satisfy 0 < fp_rate < 1.

    Raises:
        ValueError: If ``n`` or ``fp_rate`` is outside the ranges above.
    """

    def __init__(self, n, fp_rate):
        if n <= 0:
            raise ValueError("n must be a positive number of expected items")
        if not 0 < fp_rate < 1:
            raise ValueError("fp_rate must be strictly between 0 and 1")
        self.size = self._get_size(n, fp_rate)
        self.hash_count = self._get_hash_count(self.size, n)
        self.bit_array = [0] * self.size

    def _hashes(self, item):
        """Lazily yield the ``hash_count`` bit positions for ``item``.

        Position ``i`` is the MD5 digest of ``f"{item}{i}"`` read as an integer,
        modulo ``size``. This is a generator so that ``query`` can stop hashing
        as soon as it meets an unset bit.
        """
        for i in range(self.hash_count):
            digest = hashlib.md5(f"{item}{i}".encode()).hexdigest()
            yield int(digest, 16) % self.size

    def _get_size(self, n, p):
        """Return the bit-array length for ``n`` items at false-positive rate ``p`` (>= 1)."""
        m = -(n * math.log(p)) / (math.log(2) ** 2)
        return max(1, int(m))

    def _get_hash_count(self, m, n):
        """Return the number of hash functions for ``m`` bits and ``n`` items (>= 1)."""
        return max(1, int((m / n) * math.log(2)))

    def add(self, item):
        """Set every bit position associated with ``item``."""
        for idx in self._hashes(item):
            self.bit_array[idx] = 1

    def query(self, item):
        """Return False if ``item`` was definitely not added, True if it may have been."""
        return all(self.bit_array[idx] for idx in self._hashes(item))


In [2]:
class CountingBloomFilter:
    """Bloom filter variant that keeps a counter per slot so items can be removed.

    Sizing follows the same formulas as a plain Bloom filter:

        m = -n * ln(p) / (ln 2)^2        k = (m / n) * ln 2

    Both are truncated to integers and clamped to at least 1; with ``k == 0``
    every query would return True.

    Args:
        n: Expected number of items to be inserted; must be > 0.
        fp_rate: Target false-positive rate; must satisfy 0 < fp_rate < 1.

    Raises:
        ValueError: If ``n`` or ``fp_rate`` is outside the ranges above.
    """

    def __init__(self, n, fp_rate):
        if n <= 0:
            raise ValueError("n must be a positive number of expected items")
        if not 0 < fp_rate < 1:
            raise ValueError("fp_rate must be strictly between 0 and 1")
        self.size = self._get_size(n, fp_rate)
        self.hash_count = self._get_hash_count(self.size, n)
        self.count_array = [0] * self.size

    def _get_size(self, n, p):
        """Return the counter-array length for ``n`` items at false-positive rate ``p`` (>= 1)."""
        return max(1, int(-(n * math.log(p)) / (math.log(2) ** 2)))

    def _get_hash_count(self, m, n):
        """Return the number of hash functions for ``m`` slots and ``n`` items (>= 1)."""
        return max(1, int((m / n) * math.log(2)))

    def _indices(self, item):
        """Lazily yield the ``hash_count`` slot positions for ``item``.

        Position ``i`` is the MD5 digest of ``f"{item}{i}"`` read as an integer,
        modulo ``size``.
        """
        for i in range(self.hash_count):
            digest = hashlib.md5(f"{item}{i}".encode()).hexdigest()
            yield int(digest, 16) % self.size

    def add(self, item):
        """Increment every counter associated with ``item``."""
        for idx in self._indices(item):
            self.count_array[idx] += 1

    def remove(self, item):
        """Decrement the counters of ``item`` if the filter reports it as present.

        When any of the item's counters is already zero the item cannot have
        been added, so nothing is changed. Decrementing in that case would
        lower counters that belong to other items and make them disappear
        from the filter. An item that is a false positive of ``query`` is
        indistinguishable from a stored one and will still be decremented.
        """
        indices = list(self._indices(item))
        if any(self.count_array[idx] == 0 for idx in indices):
            return
        for idx in indices:
            # Clamp at zero: repeated positions for one item could otherwise
            # push a counter negative when the item is a false positive.
            self.count_array[idx] = max(0, self.count_array[idx] - 1)

    def query(self, item):
        """Return False if ``item`` is definitely absent, True if it may be present."""
        return all(self.count_array[idx] != 0 for idx in self._indices(item))


In [7]:

class NeuralNetworkBloomFilter:
    """Learned membership filter: an MLP classifier backed by an exact member set.

    A Bloom-filter-style structure must never report a stored item as absent.
    The classifier alone cannot guarantee that, so ``query`` first consults
    ``set_members`` (filled by ``train`` and ``add``) and only falls back to
    the model for items that are not in it.

    NOTE: keeping the exact set gives up the memory advantage of a Bloom
    filter; it is used here as the simplest way to rule out false negatives.

    Args:
        random_state: Forwarded to ``MLPClassifier`` so training can be made
            reproducible. Defaults to None (unseeded).
    """

    FEATURE_LEN = 50  # URLs are truncated / zero-padded to this many characters

    def __init__(self, random_state=None):
        self.model = MLPClassifier(
            hidden_layer_sizes=(10,), max_iter=200, random_state=random_state
        )
        self.set_members = set()

    def _featurize(self, url):
        """Encode ``url`` as a (1, FEATURE_LEN) array of character code points.

        Only the first FEATURE_LEN characters are used; shorter URLs are
        right-padded with zeros.
        """
        codes = [ord(c) for c in url[: self.FEATURE_LEN]]
        codes += [0] * (self.FEATURE_LEN - len(codes))
        return np.array(codes).reshape(1, -1)

    def train(self, positives, negatives):
        """Fit the classifier on labelled URLs and record the positives as members.

        Args:
            positives: URLs labelled 1 (members).
            negatives: URLs labelled 0 (non-members).
        """
        positives = list(positives)
        negatives = list(negatives)
        X = np.vstack([self._featurize(url) for url in positives + negatives])
        y = [1] * len(positives) + [0] * len(negatives)
        self.model.fit(X, y)
        self.set_members = set(positives)

    def add(self, item):
        """Record ``item`` as a member so that ``query(item)`` is always True."""
        self.set_members.add(item)

    def query(self, item):
        """Return True if ``item`` is a recorded member or the model predicts class 1."""
        if item in self.set_members:
            return True
        prediction = self.model.predict(self._featurize(item))[0]
        return bool(prediction == 1)


In [8]:
def evaluate(bloom_filter, positives, negatives):
    """Insert ``positives`` into a filter and measure it on ``negatives``.

    Args:
        bloom_filter: Object exposing ``add(item)`` and ``query(item)``.
        positives: Items inserted into the filter before measuring.
        negatives: Sized collection of items expected to be absent; every
            ``query`` that returns a truthy value counts as a false positive.

    Returns:
        ``(fpr, avg_query_time)``: the fraction of ``negatives`` reported as
        present, and the mean wall-clock seconds per query. Returns
        ``(0.0, 0.0)`` when ``negatives`` is empty, since neither ratio is
        defined without queries.
    """
    # Insert positives
    for url in positives:
        bloom_filter.add(url)

    total = len(negatives)
    if total == 0:
        return 0.0, 0.0

    # Evaluate FPR and query time.
    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring short durations.
    start = time.perf_counter()
    false_positives = 0
    for url in negatives:
        if bloom_filter.query(url):
            false_positives += 1
    elapsed = time.perf_counter() - start

    return false_positives / total, elapsed / total

In [None]:
# Load data
with open("datasets/url_positives.txt") as f:
    positives = f.read().splitlines()

with open("datasets/url_negatives.txt") as f:
    negatives = f.read().splitlines()

# Hold out half of the negatives for measurement. The neural filter is fitted
# on `negatives_train` only; scoring it on the negatives it was trained on
# would report its training error rather than its false-positive rate on
# unseen URLs. All three filters are scored on the same `negatives_test`
# so their numbers are comparable. The split is seeded for reproducibility.
split_rng = random.Random(42)
negatives_shuffled = list(negatives)
split_rng.shuffle(negatives_shuffled)
split_at = len(negatives_shuffled) // 2
negatives_train = negatives_shuffled[:split_at]
negatives_test = negatives_shuffled[split_at:]

# Evaluate Standard Bloom Filter
sbf = StandardBloomFilter(n=len(positives), fp_rate=0.01)
fpr, time_sbf = evaluate(sbf, positives, negatives_test)
print("Standard Bloom Filter: FPR =", fpr, "Avg Query Time =", time_sbf)

# Evaluate Counting Bloom Filter
cbf = CountingBloomFilter(n=len(positives), fp_rate=0.01)
fpr, time_cbf = evaluate(cbf, positives, negatives_test)
print("Counting Bloom Filter: FPR =", fpr, "Avg Query Time =", time_cbf)

# Evaluate Neural Network Bloom Filter
nn_bf = NeuralNetworkBloomFilter()
nn_bf.train(positives, negatives_train)
fpr, time_nn = evaluate(nn_bf, positives, negatives_test)
print("Neural Network BF: FPR =", fpr, "Avg Query Time =", time_nn)
