# RAPPOR original repo (Python & R)

In [6]:
class Params(object):
  """RAPPOR encoding parameters.

  These affect privacy/anonymity.  See the paper for details.

  Every parameter can be overridden by keyword; calling ``Params()`` with no
  arguments yields the defaults listed below.

  Attributes:
    num_bloombits: Number of bloom filter bits (k).
    num_hashes: Number of bloom filter hashes (h).
    num_cohorts: Number of cohorts (m).
    prob_p: Probability p.
    prob_q: Probability q.
    prob_f: Probability f.
  """
  def __init__(self, num_bloombits=16, num_hashes=2, num_cohorts=2,
               prob_p=0.50, prob_q=0.75, prob_f=0.50):
    self.num_bloombits = num_bloombits  # Number of bloom filter bits (k)
    self.num_hashes = num_hashes        # Number of bloom filter hashes (h)
    self.num_cohorts = num_cohorts      # Number of cohorts (m)
    self.prob_p = prob_p                # Probability p
    self.prob_q = prob_q                # Probability q
    self.prob_f = prob_f                # Probability f

# Shared parameter instance; passed to SumBits and HashCandidates below.
params = Params()

## Sum bits

In [14]:
import csv
import sys
from io import StringIO

def SumBits(params, stdin, stdout):
  """Sum the IRR bits of every report, per cohort, and write them as CSV.

  Args:
    params: Object with ``num_cohorts`` and ``num_bloombits`` attributes.
    stdin: Iterable of CSV lines.  The first row is a header and is skipped;
      every row (header included) must have exactly five fields:
      user_id, cohort, bloom, prr, irr.  irr is the instantaneous randomized
      response; prr is the permanent randomized response.
    stdout: Writable file-like object.  It receives one CSV row per cohort:
      the number of reports in the cohort followed by ``num_bloombits``
      per-bit counts, bit 0 first.

  Raises:
    RuntimeError: If a row does not have five fields, the cohort is outside
      ``range(num_cohorts)``, or the IRR has the wrong length or contains a
      character other than '0' or '1'.
    ValueError: If the cohort field is not an integer.
  """
  csv_in = csv.reader(stdin)
  csv_out = csv.writer(stdout)

  num_cohorts = params.num_cohorts
  num_bloombits = params.num_bloombits

  sums = [[0] * num_bloombits for _ in range(num_cohorts)]
  num_reports = [0] * num_cohorts

  for row_num, row in enumerate(csv_in):
    try:
      (unused_user_id, cohort, unused_bloom, unused_prr, irr) = row
    except ValueError:
      raise RuntimeError('Error parsing row %r' % row)

    if row_num == 0:
      continue  # skip header

    cohort = int(cohort)
    # Reject out-of-range cohorts explicitly: a negative value would otherwise
    # be counted into the wrong cohort through negative list indexing.
    if not 0 <= cohort < num_cohorts:
      raise RuntimeError(
          'Cohort %d out of range (expected 0 <= cohort < %d)'
          % (cohort, num_cohorts))
    num_reports[cohort] += 1

    if len(irr) != num_bloombits:
      raise RuntimeError(
          "Expected %d bits, got %r" % (params.num_bloombits, len(irr)))

    # The IRR string is written highest bit first, so sum the bits in reverse:
    # e.g. with 16 bits, char 0 = bit 15 and char 15 = bit 0.
    cohort_sums = sums[cohort]
    for char_index, c in enumerate(irr):
      if c == '1':
        cohort_sums[num_bloombits - char_index - 1] += 1
      elif c != '0':
        raise RuntimeError('Invalid IRR -- digits should be 0 or 1')

  for cohort in range(num_cohorts):
    # First column is the total number of reports in the cohort.
    csv_out.writerow([num_reports[cohort]] + sums[cohort])

# Sample input: a header row followed by three reports (two in cohort 1 and
# one in cohort 0).
CSV_IN = """\
user_id,cohort,bloom,prr,rappor
5,1,dummy,dummy,0000111100001111
5,1,dummy,dummy,0000000000111100
3,0,dummy,dummy,0001010000111100
"""

# One row per cohort, cohort 0 first: the report count followed by the 16
# per-bit sums, bit 0 first.
EXPECTED_CSV_OUT = """\
1,0,0,1,1,1,1,0,0,0,0,1,0,1,0,0,0\r
2,1,1,2,2,1,1,0,0,1,1,1,1,0,0,0,0\r
"""

stdin = StringIO(CSV_IN)
stdout = StringIO()

SumBits(params, stdin, stdout)
# Compare against the fixture so a regression is caught instead of eyeballed.
assert stdout.getvalue() == EXPECTED_CSV_OUT, stdout.getvalue()
#print(CSV_IN)
#print(stdout.getvalue())

> Notes
- The output CSV has one row per cohort, so the number of rows equals the number of cohorts.

> There are 17 columns. The left-most column is the total number of reports in the cohort. The remaining 16 columns correspond to the k = 16 bits in the Bloom filter. Each column contains the number of reports with that bit set to 1.

In [67]:
import struct, hashlib

def get_bloom_bits(word, cohort, num_hashes, num_bloombits):
  """Return an array of bits to set in the bloom filter.

  In the real report, we bitwise-OR them together.  In hash candidates, we put
  them in separate entries in the "map" matrix.

  Args:
    word: String to hash.
    cohort: Cohort number.  It is packed as a 4-byte big-endian prefix of the
      hash input, so the same word hashes differently in each cohort.
    num_hashes: Number of hash functions; at most the MD5 digest length.
    num_bloombits: Number of bloom filter bits; each returned index is taken
      modulo this value.

  Returns:
    A list of ``num_hashes`` bit indices.

  Raises:
    RuntimeError: If ``num_hashes`` exceeds the number of bytes in the digest.
  """
  # Encode word for each cohort.  Cohort is 4 byte prefix.
  value = struct.pack('>L', cohort) + word.encode()
  digest = hashlib.md5(value).digest()

  # Each hash is a byte, which means we could have up to 256 bit Bloom filters.
  # There are 16 bytes in an MD5, in which case we can have up to 16 hash
  # functions per Bloom filter.
  if num_hashes > len(digest):
    raise RuntimeError("Can't have more than %d hashes" % len(digest))

  # For each of the hashes, take one byte out of the MD5 digest.
  return [digest[i] % num_bloombits for i in range(num_hashes)]

def HashCandidates(params, stdin, stdout):
  """Write one "map" CSV row per candidate word read from stdin.

  Each row holds the stripped word followed, for every cohort, by the 1-based
  bit numbers returned by get_bloom_bits, offset by cohort * num_bloombits.

  As a side effect, the bloom filter of each (word, cohort) pair is printed
  as a string of '0'/'1' characters, highest bit first.

  Args:
    params: Object with num_bloombits, num_cohorts and num_hashes attributes.
    stdin: Iterable of lines, one candidate word per line.
    stdout: Writable file-like object that receives the CSV rows.
  """
  width = params.num_bloombits
  writer = csv.writer(stdout)

  for raw_line in stdin:
    candidate = raw_line.strip()
    columns = [candidate]

    # get_bloom_bits is called once per (word, cohort) pair.
    for cohort_id in range(params.num_cohorts):
      hashed = get_bloom_bits(candidate, cohort_id, params.num_hashes, width)

      # bits are indexed from 1.  Add a fixed offset for each cohort.
      # NOTE: This detail could be omitted from the map file format, and done
      # in R.
      columns.extend(cohort_id * width + (position + 1) for position in hashed)

      # OR the individual bits together into this cohort's bloom filter.
      mask = 0
      for position in hashed:
        mask |= (1 << position)

      # Render the filter with the highest bit as the left-most character.
      print(''.join(
          '1' if mask & (1 << bit_num) else '0'
          for bit_num in reversed(range(width))))

    writer.writerow(columns)


# Candidate words, one per line.
STDIN = """\
apple
banana
carrot
"""

# Expected map rows for the parameters used here (2 cohorts x 2 hashes): the
# word followed by four 1-based bit numbers, the second cohort offset by 16.
EXPECTED_CSV_OUT = """\
apple,5,1,26,26\r
banana,12,14,28,24\r
carrot,4,12,25,21\r
"""


stdin = StringIO(STDIN)
stdout = StringIO()

HashCandidates(params, stdin, stdout)
print(stdout.getvalue())
# Compare against the fixture so a regression is caught instead of eyeballed.
assert stdout.getvalue() == EXPECTED_CSV_OUT, stdout.getvalue()



0000000000010001
0000001000000000
0010100000000000
0000100010000000
0000100000001000
0000000100010000
apple,5,1,26,26
banana,12,14,28,24
carrot,4,12,25,21



The map file has one row per candidate. In this case, there are 60 rows: 50 for the true values and 10 for "fake" values, which make the candidates a superset of the true input.

The left-most column is the raw candidate string. Then there are 128 more columns: m = 64 cohorts times h = 2 hash functions in the Bloom filter.

In [40]:
# Notebook introspection: show every name currently bound in the global namespace.
globals().keys()

dict_keys(['__name__', '__doc__', '__package__', '__loader__', '__spec__', '__builtin__', '__builtins__', '_ih', '_oh', '_dh', 'In', 'Out', 'get_ipython', 'exit', 'quit', 'open', '_', '__', '___', '__vsc_ipynb_file__', '_i', '_ii', '_iii', '_i1', 'Params', 'params', '_i2', 'csv', 'sys', 'SumBits', '_2', '_i3', 'StringIO', 'CSV_IN', 'EXPECTED_CSV_OUT', '_i4', 'stdin', 'stdout', '_4', '_i5', '_i6', '_i7', '_i8', '_i9', '_i10', '_i11', '_i12', '_i13', '_i14', '_i15', 'struct', 'hashlib', 'get_bloom_bits', 'HashCandidates', 'STDIN', '_i16', '_i17', '_i18', '_i19', '_i20', '_i21', '_i22', '_i23', '_i24', '_i25', '_i26', '_i27', '_i28', '_i29', '_i30', '_i31', '_i32', '_i33', '_i34', '_i35', '_i36', '_i37', '_i38', '_i39', '_i40'])

# Pure LDP
This is an attempt to replicate the RAPPOR implementation from https://github.com/Samuel-Maddock/pure-LDP


## Notes
### Client
```
client = create_fo_client_instance(self.name, self.client_params)

if not self.memory_safe:
    for i in range(0, len(data)):
        ldp_data.append(client.privatise(data[i]))
```

### Server
```
for index, item in enumerate(ldp_data):
    server.aggregate(item)

ldp_freq = server.estimate_all(domain, normalization=self.normalization)
```
...
```
def estimate_all:
    estimates = np.array([self.estimate(x, suppress_warnings=suppress_warnings) for x in data_list])
```

## Params

In [43]:
# Settings for the pure-LDP style replication below.
# NOTE(review): in the code visible below only num_of_cohorts is read; the
# Bloom filter size and hash count actually used come from rappor_params
# ("m" and "k") -- confirm whether num_bloombits and num_hashes are still needed.
num_bloombits = 128  # m - Max size is 256 bits
num_hashes = 2  # k - Recommended to use 2 hashes
num_of_cohorts = 8  # Max cohorts is 64



## Privatize data

In [55]:
import random
import xxhash
# RAPPOR settings: "m" is the Bloom filter size in bits, "k" the number of hash
# functions per cohort, and "f" sets the bit-flip probabilities in privatise.
rappor_params = {"server_params": {"f": 0.64, "m": 10, "k": 2}, "client_params": {"f": 0.64, "m": 10}}


def _make_hash(seed):
    """Return a hash function that maps data to a Bloom filter bit index.

    The seed is fixed when the function is created, so hashing the same value
    twice yields the same index.  Drawing the seed inside the returned function
    would give a different, unrepeatable index on every call.
    """
    return lambda data: xxhash.xxh64(str(data), seed=seed).intdigest() % rappor_params['server_params']['m']


# One list of k independently seeded hash functions per cohort.
hash_family = [
    [
        _make_hash(random.randint(0, sys.maxsize))
        for _ in range(rappor_params['server_params']['k'])
    ]
    for _ in range(num_of_cohorts)
]

def index_mapper(x):
    """Return x shifted down by one.

    Presumably converts 1-based item values to 0-based indices; verify against
    the data actually passed to privatise.
    """
    shifted = x - 1
    return shifted

def privatise(data, debug=False):
    """Encode data into a Bloom filter and perturb it with randomized response.

    Args:
        data: Value to report.  It is passed through index_mapper and the
            result is hashed as a string.
        debug: When true, print the Bloom filter before and after perturbation.

    Returns:
        A tuple (bits, cohort_num): the perturbed list of m bits, and the
        randomly chosen cohort whose hash functions were used.
    """
    f = rappor_params['client_params']['f']

    index = index_mapper(data)
    cohort_num = random.randint(0, num_of_cohorts - 1)

    # Start from an empty bloom and set one bit per hash function (k of them)
    # of the chosen cohort.
    b = [0] * rappor_params['server_params']['m']
    for func in hash_family[cohort_num]:
        b[func(str(index))] = 1
    if debug: print(b)

    # Randomized response: a set bit is reported as 1 with probability 1 - f/2
    # and a clear bit with probability f/2.  Every bit is rewritten, so a set
    # bit can also be cleared; only ever writing 1 would leak the true bits.
    for i, bit in enumerate(b):
        prob_one = 1 - 0.5 * f if bit == 1 else 0.5 * f
        b[i] = 1 if random.random() < prob_one else 0
    if debug: print(b)

    return b, cohort_num

# Demo call: with debug=True the Bloom filter is printed before and after perturbation.
privatise(100, debug=True)

[1, 0, 0, 0, 0, 0, 0, 1, 0, 0]
[1, 0, 0, 1, 0, 0, 0, 1, 1, 0]


([1, 0, 0, 1, 0, 0, 0, 1, 1, 0], 7)

([0, 1, 0, 0, 1, 0, 1, 0, 0, 1], 5)
