In [34]:
import mmh3

class BloomFilter:
  def __init__(self, m, k, q):
      self.m = m  # Size of the filter
      self.k = k  # Number of hash functions
      self.q = q  # q-grams

      # Create a bit array of size m, initialized with zeros
      # This are going to be the bloom filter
      self.filter = [0] * m

  def add(self, name: str):
    for i in range(len(name) - self.q + 1):
      self._addQgram(name[i:i+self.q])

  def _addQgram(self, qgram):
    # Double hash q-gram and set corresponding bits to 1
    for i in range(1, self.k + 1):
        hash1 = mmh3.hash(qgram)
        hash2 = mmh3.hash(qgram, 1)
        index = (hash1 + i * hash2) % self.m
        self.filter[index] = 1

  def check(self, name: str):
      if len(name) % 2 != 0:
        name += "_"
      for i in range(len(name) - self.q + 1):
        if not self._checkQgram(name[i:i+self.q]):
          return False
      return True

  def _checkQgram(self, qgram):
      # Check if all bits are set to 1 for the given q-gram
      for i in range(1, self.k + 1):
          hash1 = mmh3.hash(qgram)
          hash2 = mmh3.hash(qgram, 1)
          index = (hash1 + i * hash2) % self.m
          if self.filter[index] == 0:
              return False
      return True


# Define parameters
m = 10000  # Size of the filter
k = 5     # Number of hash functions
q = 2     # Number of q-grams

# Create and initialize the Bloom Filter
bloom_filter = BloomFilter(m, k, q)
bloom_filter.add("Alex")
bloom_filter.add("Pedro")

# Check if a q-gram is in the filter
print("Alex:",bloom_filter.check("Alex"))
print("Anthea:",bloom_filter.check("Anthea"))

Alex: True
Anthea: False


In [3]:
import pandas as pd
import numpy as np

# - Dataset example names
en = pd.read_csv('pprl-attack-data/example-names.csv')
print(en)

# - German names
gn = pd.read_csv('pprl-attack-data/german-names.csv')
print(gn)

     name  frequency
0  Daniel        242
1  Carlos        130
2  Danilo        115
3   Carla         48
4   David         95
5   Carol         20
           name  frequency
0         Peter    1293922
1       Michael    1088118
2      Wolfgang    1075005
3        Thomas    1011277
4         Klaus     992990
..          ...        ...
995        Ulli       4095
996  Malgorzata       4091
997        Insa       4059
998    Mohammed       4041
999       Alexa       4030

[1000 rows x 2 columns]


In [4]:
import random

def genDataFrame(df, s):
  # - Generate data frame of length l using the names
  #   in df with its corresponding f.

  samples = []

  for _, row in df.iterrows():
      name = row['name']
      f = row['frequency']

      samples.extend([name] * f)

  data = np.random.choice(samples, size=s, replace=True)

  df_generated = pd.DataFrame({'name': data})

  return df_generated

def selectNnames(df,n):
  # - n random rows of a dataframe.
  sampled_df = df.sample(n=n, random_state=random.seed())

  return sampled_df


df = selectNnames(gn,5)
print(list(df['name'].unique()))
df = genDataFrame(df,1000)
print(list(df['name'].unique()))

['Anni', 'Joseph', 'Nora', 'Luzie', 'René']
['Anni', 'Nora', 'Luzie', 'Joseph', 'René']


In [35]:
import math

def computeQgrams(name: str, q: int) -> list:
  q_grams = []
  for i in range(len(name) - q + 1):
        q_grams.append(name[i:i+q])

  return q_grams

# - Parameters selected by the user.
n = 10
# - Optimal parameters.
epsilon = 5e-8
m = round(-n*math.log(epsilon)/(math.log(2) ** 2))
k = round(m/n*math.log(2))

print('Epsilon =',epsilon)
print('n =',n)
print('m =',m)
print('k =',k)
print()

df = selectNnames(gn,n)
print(list(df['name'].unique()))
df = genDataFrame(df,l=1000)
print(df.sample(5))

q = 2

filters = []
freqs = []
names = []
qgrams = []

for name in df['name'].unique():
  bf = BloomFilter(m, k, q)
  bf.add(name)
  filters.append(bf.filter)
  freqs.append((df['name']==name).sum())
  names.append(name)
  qgrams.append(computeQgrams(name, q))

df4attack = pd.DataFrame(data={'name': names, 'frequency': freqs, 'q-grams': qgrams, 'filter': filters})
df4attack = df4attack.sort_values(by='frequency', ascending=False)
df4attack.head(10)  

Epsilon = 5e-08
n = 10
m = 350
k = 24

['William', 'Enrico', 'Auguste', 'Theo', 'Jens', 'Wally', 'Hans-Georg', 'Nils', 'Claus-Dieter', 'Simon']
           name
54         Jens
738        Jens
922  Hans-Georg
482        Jens
463        Theo


Unnamed: 0,name,frequency,q-grams,filter
0,Jens,523,"[Je, en, ns]","[0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, ..."
4,Theo,136,"[Th, he, eo]","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
3,Hans-Georg,113,"[Ha, an, ns, s-, -G, Ge, eo, or, rg]","[0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, ..."
2,Simon,86,"[Si, im, mo, on]","[0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7,Nils,48,"[Ni, il, ls]","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, ..."
6,Enrico,34,"[En, nr, ri, ic, co]","[0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, ..."
5,Auguste,18,"[Au, ug, gu, us, st, te]","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, ..."
1,Claus-Dieter,15,"[Cl, la, au, us, s-, -D, Di, ie, et, te, er]","[0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, ..."
8,Wally,14,"[Wa, al, ll, ly]","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
9,William,13,"[Wi, il, ll, li, ia, am]","[0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, ..."


In [80]:
# Candidates q-grams
cand_qgs = []

# Computing candidates for each position in filter's length
for i in range(0,m):
    pos_cands = []
    neg_cands = []
    for _, row in df4attack.iterrows():
        if row['filter'][i] == 1:
            pos_cands = list(set(pos_cands + row['q-grams']))
        else:
            neg_cands = list(set(neg_cands + row['q-grams']))
    cand_qgs.append(list(set(pos_cands)-set(neg_cands)))
    print(list(set(pos_cands)-set(neg_cands)))

[]
['or', 'Ha', 'an', 'Ge', '-G', 'rg']
[]
['et', 'st', 'au', 'te', 'im', 'us', 'mo', 'la', 'Si', 'gu', 'ie', 'Au', 'Cl', 'on', 'ug', '-D', 'Di', 'er']
['Je', 'mo', 'or', 'Ha', 'ns', 'an', 'Si', 'en', 'Ni', 'Ge', '-G', 'on', 'rg', 'im', 'ls']
['ly', 'eo', 'an', 'Au', 'Th', 'en', 'am', 'Je', 'he', 'Ha', 'or', 'al', 'Ge', '-G', 'im', 'mo', 'Si', 'ia', 'on', 'rg', 'ug', 'Wi', 'Wa', 'll', 'gu', 'ns', 'li', 'st']
['En', 'ic', 'Je', 'or', 'Ha', 'ns', 'an', 'nr', 'co', 'en', 'Ge', '-G', 'ri', 'rg']
['et', 'Wi', 'au', 'am', 'la', 'ia', 'ie', 'Cl', 'li', '-D', 'Di', 'er']
['En', 'ic', 'or', 'Ha', 'nr', 'an', 'co', 'Ge', '-G', 'ri', 'rg']
['et', 's-', 'au', 'la', 'or', 'Ha', 'ie', 'an', 'Cl', 'Ge', '-G', 'rg', '-D', 'Di', 'er']
['et', 's-', 'us', 'an', 'Au', 'Cl', 'ls', 'te', 'am', 'la', 'or', 'Ha', 'ie', 'il', 'Ge', '-G', 'Ni', '-D', 'au', 'ia', 'rg', 'ug', 'Di', 'Wi', 'gu', 'li', 'st', 'er']
['Th', 'he']
['et', 'us', 'Au', 'Cl', 'ls', 'te', 'am', 'la', 'ie', 'il', 'Ni', '-D', 'au', 'ia', 'ug',

In [91]:
# Re-identification

def getPos1(filter: list) -> list:
    pos1 = []
    for pos,i in enumerate(filter):
        if i == 1:
            pos1.append(pos)
    return pos1

cand_names = []
for filter in df4attack['filter']:
    cn = list(df4attack['name'])
    for pos in getPos1(filter):
        for _, row in df4attack[['name', 'q-grams']].iterrows():
            if not any(element in cand_qgs[pos] for element in row['q-grams']):
                try:
                    cn.remove(row['name'])
                except:
                    continue
    cand_names.append(cn)


# Candidate Names:
for pos,i in enumerate(df4attack['name']):
    print(pos,"-",cand_names[pos],i)

0 - ['Jens'] Jens
1 - ['Theo'] Theo
2 - ['Hans-Georg'] Hans-Georg
3 - ['Simon'] Simon
4 - ['Nils'] Nils
5 - ['Enrico'] Enrico
6 - ['Auguste'] Auguste
7 - ['Claus-Dieter'] Claus-Dieter
8 - ['Wally'] Wally
9 - ['William'] William


In [69]:
df4attack[['name', 'q-grams']]

Unnamed: 0,name,q-grams
0,Jens,"[Je, en, ns]"
4,Theo,"[Th, he, eo]"
3,Hans-Georg,"[Ha, an, ns, s-, -G, Ge, eo, or, rg]"
2,Simon,"[Si, im, mo, on]"
7,Nils,"[Ni, il, ls]"
6,Enrico,"[En, nr, ri, ic, co]"
5,Auguste,"[Au, ug, gu, us, st, te]"
1,Claus-Dieter,"[Cl, la, au, us, s-, -D, Di, ie, et, te, er]"
8,Wally,"[Wa, al, ll, ly]"
9,William,"[Wi, il, ll, li, ia, am]"


In [71]:
for _,row in df4attack[['name', 'q-grams']].iterrows():
    print(row['name'], type(row['q-grams']))

Jens <class 'list'>
Theo <class 'list'>
Hans-Georg <class 'list'>
Simon <class 'list'>
Nils <class 'list'>
Enrico <class 'list'>
Auguste <class 'list'>
Claus-Dieter <class 'list'>
Wally <class 'list'>
William <class 'list'>


In [65]:
list(df4attack['name'])


['Jens',
 'Theo',
 'Hans-Georg',
 'Simon',
 'Nils',
 'Enrico',
 'Auguste',
 'Claus-Dieter',
 'Wally',
 'William']