In [8]:
import mmh3

class BloomFilter:
    """Bloom filter that stores a name as the set of its overlapping q-grams.

    Every q-gram is mapped to ``k`` bit positions by double hashing
    (``(h1 + i * h2) % m`` for ``i = 1..k``) and those positions are set
    to 1 in a list of ``m`` bits.
    """

    def __init__(self, m, k, q):
        """Create an empty filter.

        Args:
            m: Number of bit positions in the filter.
            k: Number of bit positions derived from each q-gram.
            q: Length of the q-grams (substrings) taken from each name.
        """
        self.m = m  # Size of the filter
        self.k = k  # Number of hash functions
        self.q = q  # Length of each q-gram

        # Bit array of size m, initialised with zeros; this is the filter itself.
        self.filter = [0] * m

    def _qgrams(self, name):
        """Return the overlapping q-grams of ``name`` in order of appearance.

        Shared by ``add`` and ``check`` so both always split a name the same way.
        """
        return [name[i:i + self.q] for i in range(len(name) - self.q + 1)]

    def _indexes(self, qgram):
        """Return the ``k`` bit positions that double hashing assigns to ``qgram``."""
        # Both hashes depend only on the q-gram, so they are computed once
        # instead of once per position.
        hash1 = mmh3.hash(qgram)
        hash2 = mmh3.hash(qgram, 1)
        return [(hash1 + i * hash2) % self.m for i in range(1, self.k + 1)]

    def add(self, name: str):
        """Insert every q-gram of ``name`` into the filter."""
        for qgram in self._qgrams(name):
            self._addQgram(qgram)

    def _addQgram(self, qgram):
        """Set to 1 the bit positions that belong to ``qgram``."""
        for index in self._indexes(qgram):
            self.filter[index] = 1

    def check(self, name: str):
        """Return True when every q-gram of ``name`` is present in the filter.

        The name is split exactly as in ``add`` (no padding), so a name that
        was added is always reported as present. A name shorter than ``q``
        has no q-grams and is reported as absent.
        """
        qgrams = self._qgrams(name)
        if not qgrams:
            return False
        return all(self._checkQgram(qgram) for qgram in qgrams)

    def _checkQgram(self, qgram):
        """Return True when all bit positions of ``qgram`` are set to 1."""
        return all(self.filter[index] == 1 for index in self._indexes(qgram))


# Demo parameters: filter size in bits, hash functions per q-gram, q-gram length.
m, k, q = 10000, 5, 2

# Build a filter and store two names in it.
bloom_filter = BloomFilter(m=m, k=k, q=q)
bloom_filter.add("Alex")
bloom_filter.add("Pedro")

# Query the filter: one stored name and one name that was never added.
print("Alex:", bloom_filter.check("Alex"))
print("Anthea:", bloom_filter.check("Anthea"))

Alex: True
Anthea: False


In [9]:
import pandas as pd
import numpy as np

# - Example names dataset: small table with `name` and `frequency` columns
#   (see the printed output below).
en = pd.read_csv('pprl-attack-data/example-names.csv')
print(en)

# - German names dataset: same two columns; the printed output shows 1000 rows.
#   This frame (`gn`) is the source of names for the later cells.
gn = pd.read_csv('pprl-attack-data/german-names.csv')
print(gn)

     name  frequency
0  Daniel        242
1  Carlos        130
2  Danilo        115
3   Carla         48
4   David         95
5   Carol         20
           name  frequency
0         Peter    1293922
1       Michael    1088118
2      Wolfgang    1075005
3        Thomas    1011277
4         Klaus     992990
..          ...        ...
995        Ulli       4095
996  Malgorzata       4091
997        Insa       4059
998    Mohammed       4041
999       Alexa       4030

[1000 rows x 2 columns]


In [14]:
import random

def genDataFrame(df, s):
    """Draw ``s`` names at random, each name weighted by its frequency.

    Args:
        df: DataFrame with a 'name' column and a numeric 'frequency' column.
        s: Number of names to draw (with replacement).

    Returns:
        A DataFrame with a single 'name' column holding the ``s`` drawn names.

    Raises:
        ValueError: If no name has a positive frequency, so there is nothing
            to draw from.
    """
    names = df['name'].to_numpy()
    # Non-positive frequencies contribute no samples.
    weights = np.clip(df['frequency'].to_numpy(dtype=float), 0, None)
    total = weights.sum()
    if not total > 0:
        raise ValueError("genDataFrame needs at least one name with a positive frequency")

    # Weighted sampling follows the same distribution as drawing uniformly from
    # a list that repeats each name `frequency` times, without materialising
    # that (potentially enormous) list.
    data = np.random.choice(names, size=s, replace=True, p=weights / total)

    return pd.DataFrame({'name': data})

def selectNnames(df, n, random_state=None):
    """Return ``n`` rows of ``df`` chosen at random without replacement.

    Args:
        df: DataFrame to sample rows from.
        n: Number of rows to return.
        random_state: Optional seed or random generator forwarded to
            ``DataFrame.sample``. The default ``None`` gives a fresh draw on
            every call; pass an int to make the selection reproducible.

    Returns:
        A DataFrame with the ``n`` sampled rows (original index labels kept).
    """
    # Forward the seed as-is; the global `random` module is left untouched.
    return df.sample(n=n, random_state=random_state)


# Pick 5 random names from the German list and show them ...
df = selectNnames(gn, n=5)
print(list(df['name'].unique()))

# ... then draw 1000 frequency-weighted samples from those names and show
# which names actually occur.
df = genDataFrame(df, s=1000)
print(list(df['name'].unique()))

['Heinz-Dieter', 'Agnes', 'Mirjam', 'Mark', 'Sonja']
['Sonja', 'Mirjam', 'Agnes', 'Mark', 'Heinz-Dieter']


In [15]:
import math

def computeQgrams(name: str, q: int) -> list:
    """Return every length-``q`` substring of ``name``, from left to right.

    A name shorter than ``q`` yields an empty list.
    """
    window_starts = range(len(name) - q + 1)
    return [name[start:start + q] for start in window_starts]

# - Parameters selected by the user.
n = 500          # number of names drawn from the German list
epsilon = 5e-8   # epsilon used in the sizing formulas below
q = 2            # q-gram length

# - Filter size m and number of hash functions k derived from n and epsilon.
m = round(-n*math.log(epsilon)/(math.log(2) ** 2))
k = round(m/n*math.log(2))

print(f'Epsilon = {epsilon}')
print(f'n = {n}')
print(f'm = {m}')
print(f'k = {k}')
print()

# - Select n names, then draw 1000 frequency-weighted samples from them.
df = selectNnames(gn,n)
print(list(df['name'].unique()))
df = genDataFrame(df,s=1000)
print(df.sample(5))

# - One Bloom filter per distinct sampled name, together with the name's
#   observed frequency and its q-grams (all four lists share the same order).
names = list(df['name'].unique())
filters = []
freqs = []
qgrams = []

for name in names:
  bf = BloomFilter(m, k, q)
  bf.add(name)
  filters.append(bf.filter)
  freqs.append((df['name'] == name).sum())
  qgrams.append(computeQgrams(name, q))

# - Attack table, most frequent names first.
df4attack = pd.DataFrame(
    data={'name': names, 'frequency': freqs, 'q-grams': qgrams, 'filter': filters}
).sort_values(by='frequency', ascending=False)
df4attack.head(10)

Epsilon = 5e-08
n = 500
m = 17495
k = 24

['Sönke', 'Karl-Heinrich', 'Karlheinz', 'Eugen', 'Daniel', 'Evi', 'Bernadette', 'Kerstin', 'Madeleine', 'Bernd', 'Nathalie', 'Juri', 'Gretel', 'Walter', 'Otto', 'Antonie', 'Viktor', 'Carlo', 'Thomas', 'Mona', 'Dana', 'Eveline', 'Rose-Marie', 'Wiltrud', 'Antonius', 'Maik', 'Aloisia', 'Marianne', 'Tino', 'Marietta', 'Iris', 'Detlef', 'Franziska', 'Theresia', 'Ludwig', 'Tina', 'Eckard', 'Margarita', 'Rainer', 'Fridolin', 'Claus-Peter', 'Heinz-Werner', 'Sabrina', 'Hans-Georg', 'Benedikt', 'Else', 'Björn', 'Steffen', 'Kati', 'Marga', 'Lilly', 'Cindy', 'Natalja', 'Hans-Walter', 'Brigitte', 'Marija', 'Jens', 'Auguste', 'Angelina', 'Viktoria', 'Dietlinde', 'Horst-Dieter', 'Rose', 'Dietrich', 'Anastasia', 'Jakob', 'Valerie', 'Imke', 'Andre', 'Irmhild', 'Michaela', 'Karl-Josef', 'Emilie', 'Tatjana', 'Olaf', 'Jeanette', 'Ernestine', 'Adele', 'Nina', 'Evelin', 'Karina', 'Heinz-Georg', 'Traudel', 'Christian', 'Edmund', 'Hans-Hermann', 'Annett', 'Carina', 'K

Unnamed: 0,name,frequency,q-grams,filter
17,Peter,28,"[Pe, et, te, er]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
97,Michael,28,"[Mi, ic, ch, ha, ae, el]","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
68,Thomas,21,"[Th, ho, om, ma, as]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
9,Manfred,20,"[Ma, an, nf, fr, re, ed]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
40,Wolfgang,20,"[Wo, ol, lf, fg, ga, an, ng]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
71,Klaus,17,"[Kl, la, au, us]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
0,Elisabeth,17,"[El, li, is, sa, ab, be, et, th]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Andrea,17,"[An, nd, dr, re, ea]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
139,Hildegard,15,"[Hi, il, ld, de, eg, ga, ar, rd]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
13,Ingrid,15,"[In, ng, gr, ri, id]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [17]:
# Candidate q-grams per bit position.
# A q-gram is a candidate for position i when it occurs in at least one name
# whose filter has bit i set and in no name whose filter has bit i clear.

# Extract (filter, q-grams) pairs once instead of calling iterrows() for each
# of the m positions.
filter_rows = list(zip(df4attack['filter'], df4attack['q-grams']))

cand_qgs = []
for i in range(m):
    pos_cands = set()
    neg_cands = set()
    for bits, row_qgrams in filter_rows:
        # Grow the sets in place rather than rebuilding a list per row.
        if bits[i] == 1:
            pos_cands.update(row_qgrams)
        else:
            neg_cands.update(row_qgrams)
    cand_qgs.append(list(pos_cands - neg_cands))

In [18]:
# Re-identification

def getPos1(filter: list) -> list:
    """Return the positions of ``filter`` whose value equals 1, in ascending order."""
    return [position for position, bit in enumerate(filter) if bit == 1]

# For every filter, start from all names and drop each name that shares no
# q-gram with the candidate q-grams of some bit position set in that filter.

# Extract the rows and turn the candidate lists into sets once, so the inner
# loop is plain set arithmetic rather than iterrows() plus list scans.
name_rows = [
    (row_name, set(row_qgrams))
    for row_name, row_qgrams in zip(df4attack['name'], df4attack['q-grams'])
]
cand_sets = [set(position_cands) for position_cands in cand_qgs]

cand_names = []
# `bits` rather than `filter`, so the builtin filter() is not shadowed for the
# rest of the notebook.
for bits in df4attack['filter']:
    survivors = name_rows
    for pos in getPos1(bits):
        allowed = cand_sets[pos]
        # Keep a name only if at least one of its q-grams is a candidate here;
        # filtering replaces list.remove() wrapped in a bare except.
        survivors = [
            (row_name, row_qgrams)
            for row_name, row_qgrams in survivors
            if not allowed.isdisjoint(row_qgrams)
        ]
    cand_names.append([row_name for row_name, _ in survivors])


# Candidate Names: position, remaining candidates, and the true name.
for idx, true_name in enumerate(df4attack['name']):
    print(idx,"-",cand_names[idx],true_name)

0 - ['Peter', 'Hans-Peter', 'Klaus-Peter'] Peter
1 - ['Michael', 'Michaela'] Michael
2 - ['Thomas'] Thomas
3 - ['Manfred'] Manfred
4 - ['Wolfgang'] Wolfgang
5 - ['Klaus', 'Klaus-Peter', 'Klaus-Dieter'] Klaus
6 - ['Elisabeth'] Elisabeth
7 - ['Andrea'] Andrea
8 - ['Hildegard'] Hildegard
9 - ['Ingrid'] Ingrid
10 - ['Barbara'] Barbara
11 - ['Horst'] Horst
12 - ['Helmut'] Helmut
13 - ['Heike'] Heike
14 - ['Christian'] Christian
15 - ['Bernd'] Bernd
16 - ['Herbert'] Herbert
17 - ['Waltraud'] Waltraud
18 - ['Hermann', 'Hermann-Josef', 'Hans-Hermann', 'Heinz-Hermann'] Hermann
19 - ['Brigitte'] Brigitte
20 - ['Rainer'] Rainer
21 - ['Katharina'] Katharina
22 - ['Jutta'] Jutta
23 - ['Siegfried'] Siegfried
24 - ['Anton'] Anton
25 - ['Helga'] Helga
26 - ['Birgit'] Birgit
27 - ['Hans', 'Hans-Peter', 'Hans-Günter', 'Hans-Dieter', 'Hans-Georg', 'Hans-Hermann'] Hans
28 - ['Friedrich'] Friedrich
29 - ['Hannelore'] Hannelore
30 - ['Kerstin'] Kerstin
31 - ['Frank'] Frank
32 - ['Johanna'] Johanna
33 - ['Vo

In [None]:
# Inspect the name and q-gram columns of the attack table.
# NOTE(review): this cell has no execution count and its saved output lists
# only 10 names, which does not match the 500-name run above — re-run after
# Restart & Run All or remove this scratch cell.
df4attack[['name', 'q-grams']]

Unnamed: 0,name,q-grams
0,Jens,"[Je, en, ns]"
4,Theo,"[Th, he, eo]"
3,Hans-Georg,"[Ha, an, ns, s-, -G, Ge, eo, or, rg]"
2,Simon,"[Si, im, mo, on]"
7,Nils,"[Ni, il, ls]"
6,Enrico,"[En, nr, ri, ic, co]"
5,Auguste,"[Au, ug, gu, us, st, te]"
1,Claus-Dieter,"[Cl, la, au, us, s-, -D, Di, ie, et, te, er]"
8,Wally,"[Wa, al, ll, ly]"
9,William,"[Wi, il, ll, li, ia, am]"


In [None]:
# Debug aid: print each name with the Python type of its q-grams entry.
for person, grams in zip(df4attack['name'], df4attack['q-grams']):
    print(person, type(grams))

Jens <class 'list'>
Theo <class 'list'>
Hans-Georg <class 'list'>
Simon <class 'list'>
Nils <class 'list'>
Enrico <class 'list'>
Auguste <class 'list'>
Claus-Dieter <class 'list'>
Wally <class 'list'>
William <class 'list'>


In [None]:
# List the names of the attack table in its current row order.
list(df4attack['name'])


['Jens',
 'Theo',
 'Hans-Georg',
 'Simon',
 'Nils',
 'Enrico',
 'Auguste',
 'Claus-Dieter',
 'Wally',
 'William']