In [1]:
# Install third-party dependencies. `%pip` (rather than `!pip`) installs into
# the environment of the running kernel, so the packages are importable below.
%pip install -q pyspark bitarray kaggle

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m288.4/288.4 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [2]:
# Download the dataset archive with the Kaggle CLI; the captured log shows it
# is saved as celebrity-face-image-dataset.zip.
# NOTE(review): presumably requires Kaggle API credentials to be configured — confirm.
!kaggle datasets download vishesh1412/celebrity-face-image-dataset

Dataset URL: https://www.kaggle.com/datasets/vishesh1412/celebrity-face-image-dataset
License(s): CC0-1.0
Downloading celebrity-face-image-dataset.zip to /content
 78% 41.0M/52.9M [00:00<00:00, 116MB/s] 
100% 52.9M/52.9M [00:00<00:00, 109MB/s]


In [3]:
# Extract the archive quietly. `-n` never overwrites existing files, so
# re-running this cell does not stop at unzip's interactive "replace?" prompt.
!unzip -q -n celebrity-face-image-dataset.zip

In [4]:
import math
import hashlib
from bitarray import bitarray

class BloomFilter:
    '''
    Bloom filter keyed on the contents of files.

    add() and check() take a file path, read the file in binary mode and
    derive `hash_count` bit positions from an MD5 digest of the content
    salted with the hash-function index. check() may report false positives,
    but never a false negative for content that was added.
    '''

    def __init__(self, n_items, fp_prob):
        '''
        n_items : int
            Number of items expected to be stored in bloom filter (must be > 0)
        fp_prob : float
            False Positive probability in decimal, 0 < fp_prob <= 1

        Raises
        ------
        ValueError
            If n_items or fp_prob is outside the ranges above.
        '''
        # Size of bit array to use. get_size() validates both arguments, so
        # bad input fails here, before anything is printed or allocated.
        self.size = self.get_size(n_items, fp_prob)

        # False positive probability in decimal
        self.fp_prob = fp_prob

        # number of hash functions to use
        self.hash_count = self.get_hash_count(self.size, n_items)

        print(f"Using {self.hash_count} hash functions")

        # Bit array of given size
        self.bit_array = bitarray(self.size)

        # initialize all bits as 0
        self.bit_array.setall(0)

    def _bit_positions(self, item):
        '''
        Yield the `hash_count` bit indices for the file at path `item`.

        Position i is MD5(str(content) + str(i)) mod size. The textual form
        of the bytes is hashed (not the raw bytes) so the positions stay
        identical to the ones this class has always produced.
        '''
        with open(item, "rb") as f:
            content = f.read()
        # Hash the (potentially large) content once; each hash function then
        # only appends its own index to a copy of that intermediate state.
        base = hashlib.md5(str(content).encode())
        for i in range(self.hash_count):
            digest = base.copy()
            digest.update(str(i).encode())
            yield int.from_bytes(digest.digest(), "big") % self.size

    def add(self, item):
        '''
        Add an item in the filter

        item : path of the file whose content should be recorded
        '''
        for bit in self._bit_positions(item):
            self.bit_array[bit] = True

    def check(self, item):
        '''
        Check for existence of an item in the filter

        item : path of the file whose content should be looked up

        Returns False if the content was definitely never added, True if it
        was probably added (false positives are possible).
        '''
        for bit in self._bit_positions(item):
            if not self.bit_array[bit]:
                # A single unset bit proves the content was never added
                return False
        return True

    @classmethod
    def get_size(cls, n, p):
        '''
        Return the size of bit array(m) to be used:

            m = -(n * ln(p)) / (ln(2)^2)

        truncated to an int and never smaller than 1, so the modulo in the
        hashing step is always defined.

        n : int
            number of items expected to be stored in filter (must be > 0)
        p : float
            False Positive probability in decimal, 0 < p <= 1

        Raises ValueError if n or p is out of range.
        '''
        if n <= 0:
            raise ValueError(f"n must be a positive number of items, got {n!r}")
        if not 0 < p <= 1:
            raise ValueError(f"p must satisfy 0 < p <= 1, got {p!r}")
        m = -(n * math.log(p)) / (math.log(2) ** 2)
        return max(1, int(m))

    @classmethod
    def get_hash_count(cls, m, n):
        '''
        Return the hash function(k) to be used:

            k = (m / n) * ln(2)

        truncated to an int and never smaller than 1; with zero hash
        functions check() would report every item as present.

        m : int
            size of bit array
        n : int
            number of items expected to be stored in filter (must be > 0)

        Raises ValueError if n is not positive.
        '''
        if n <= 0:
            raise ValueError(f"n must be a positive number of items, got {n!r}")
        k = (m / n) * math.log(2)
        return max(1, int(k))

In [5]:
from random import shuffle, seed
from glob import glob
from sklearn.model_selection import train_test_split

In [6]:
# Gather every file in the Brad Pitt folder, then order the paths
# lexicographically so the list does not depend on directory listing order.
all_images = glob("Celebrity Faces Dataset/Brad Pitt/*")
all_images.sort()

In [7]:
# Hold out 20% of the image paths as "unseen"; the remaining paths are the
# ones that will be added to the filter. The fixed random_state pins the split.
seen_images, unseen_images = train_test_split(
    all_images,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Display the sizes of the two partitions as (seen, unseen)
tuple(len(part) for part in (seen_images, unseen_images))

(80, 20)

In [9]:
# Seed the global RNG so the shuffled probe order below is identical on
# every run of the notebook.
seed(42)
n = len(seen_images)  # no of items to add
p = 0.05  # false positive probability

bloomf = BloomFilter(n,p)
print("Size of bit array: {}".format(bloomf.size))
print("False positive Probability: {}".format(bloomf.fp_prob))
print("Number of hash functions: {}".format(bloomf.hash_count))

# Record the content of every "seen" image in the filter
for item in seen_images:
    bloomf.add(item)

# Probe with the last 10 seen images plus all held-out images
test_images = seen_images[-10:] + unseen_images
print("No. of test images:", len(test_images))
shuffle(test_images)

# Set lookup replaces a linear scan of the unseen list for every probe
unseen_lookup = set(unseen_images)

for img in test_images:
    if bloomf.check(img):
        # The filter says "present": only a mistake if the image was held out
        if img in unseen_lookup:
            print("'{}' is a false positive!".format(img))
        else:
            print("'{}' is probably present!".format(img))
    else:
        print("'{}' is definitely not present!".format(img))

Using 4 hash functions
Size of bit array: 498
False positive Probability: 0.05
Number of hash functions: 4
No. of test images: 30
'Celebrity Faces Dataset/Brad Pitt/091_8561b34e.jpg' is definitely not present!
'Celebrity Faces Dataset/Brad Pitt/072_da45cf8f.jpg' is probably present!
'Celebrity Faces Dataset/Brad Pitt/005_02ab3a1b.jpg' is definitely not present!
'Celebrity Faces Dataset/Brad Pitt/046_8bf34269.jpg' is definitely not present!
'Celebrity Faces Dataset/Brad Pitt/019_ddcd5687.jpg' is definitely not present!
'Celebrity Faces Dataset/Brad Pitt/087_155f1f74.jpg' is probably present!
'Celebrity Faces Dataset/Brad Pitt/021_143b276f.jpg' is probably present!
'Celebrity Faces Dataset/Brad Pitt/071_2d51687a.jpg' is definitely not present!
'Celebrity Faces Dataset/Brad Pitt/078_b546dff5.jpg' is definitely not present!
'Celebrity Faces Dataset/Brad Pitt/001_c04300ef.jpg' is definitely not present!
'Celebrity Faces Dataset/Brad Pitt/084_4876da64.jpg' is definitely not present!
'Celebri