In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
import io
import requests
import math

# load data
response = requests.get(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    timeout=60,
)
# Fail loudly on an HTTP error instead of parsing an error page as CSV.
response.raise_for_status()
req = response.content
# The two-character separator ", " is treated as a regular expression by pandas,
# which only the python engine supports; naming the engine explicitly avoids the
# ParserWarning emitted when pandas falls back to it on its own.
adult = pd.read_csv(
    io.StringIO(req.decode('utf-8')),
    header=None,
    na_values='?',
    delimiter=r", ",
    engine='python',
)
# NOTE: a bare `adult.dropna()` used to sit here. DataFrame.dropna returns a new
# frame, so the discarded call never removed any rows. It is left out so the cell
# no longer suggests that `adult` was filtered; missing values are dropped per
# column at the point where a column is used (see `adult[8].dropna()`).
adult.head()

# the domains: sorted distinct non-missing values of column 8
domain = np.sort(adult[8].dropna().unique())
domain

  # This is added back by InteractiveShellApp.init_path()


array(['Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other',
       'White'], dtype=object)

In [None]:
# the number of people of each race
# Column 8 with missing entries removed; reused by the later cells.
adult_race = adult.loc[:, 8].dropna()
# Tally each distinct value, then order the tally by label.
race_counts = adult_race.value_counts()
race_counts.sort_index()

Amer-Indian-Eskimo      311
Asian-Pac-Islander     1039
Black                  3124
Other                   271
White                 27816
Name: 8, dtype: int64

In [None]:
# unary encoding - encoding
def encoding(answer, categories=None):
    """Unary (one-hot) encode ``answer`` against an ordered set of categories.

    Parameters
    ----------
    answer : object
        The value to encode.
    categories : iterable, optional
        Ordered category values to encode against. When omitted, the
        notebook-level ``domain`` is used, so existing one-argument calls
        behave exactly as before.

    Returns
    -------
    list of int
        One entry per category, in category order: 1 where the category
        equals ``answer`` and 0 elsewhere. A value that matches no category
        yields a list of zeros.
    """
    if categories is None:
        # `is None` rather than truthiness: `domain` is a NumPy array, whose
        # truth value is ambiguous.
        categories = domain
    return [1 if category == answer else 0 for category in categories]

# test the encoding
print(encoding('Amer-Indian-Eskimo'))
print(encoding('Asian-Pac-Islander'))
print(encoding('Black'))
print(encoding('Other'))
print(encoding('White'))

[1, 0, 0, 0, 0]
[0, 1, 0, 0, 0]
[0, 0, 1, 0, 0]
[0, 0, 0, 1, 0]
[0, 0, 0, 0, 1]


In [None]:
# data encoding
# Stack every encoded answer as a row, then add the rows up column by column.
answers = np.array([encoding(race) for race in adult_race]).sum(axis=0)
[(race, count) for race, count in zip(domain, answers)]

[('Amer-Indian-Eskimo', 311),
 ('Asian-Pac-Islander', 1039),
 ('Black', 3124),
 ('Other', 271),
 ('White', 27816)]

In [None]:
# symmetric unary encoding - perturbation
def sym_perturbation(encoded_ans, epsilon = 5.0):
    """Perturb every bit of an encoded answer with the symmetric scheme.

    Returns a new list holding ``sym_perturb_bit(bit, epsilon)`` for each
    bit of ``encoded_ans``, in the original order.
    """
    noisy_bits = []
    for bit in encoded_ans:
        noisy_bits.append(sym_perturb_bit(bit, epsilon))
    return noisy_bits

def sym_perturb_bit(bit, epsilon = 5.0):
    """Randomly report one bit of a unary-encoded answer (symmetric scheme).

    A 1-bit is reported as 1 with probability
    ``p = e^(epsilon/2) / (1 + e^(epsilon/2))``; a 0-bit is reported as 1
    with probability ``q = 1 - p``.

    Parameters
    ----------
    bit : int
        The true bit; must compare equal to 0 or 1.
    epsilon : float, optional
        Privacy parameter used to derive ``p`` and ``q``.

    Returns
    -------
    int
        The reported bit, 0 or 1.

    Raises
    ------
    ValueError
        If ``bit`` is neither 0 nor 1. The previous implementation fell
        through and returned ``None`` in that case.
    """
    # e^x / (1 + e^x) == (1 + tanh(x / 2)) / 2. With x = epsilon / 2 this is
    # the same p, but tanh saturates instead of raising OverflowError the way
    # exponentiating a large epsilon does.
    p = 0.5 * (1.0 + math.tanh(epsilon / 4))
    q = 1 - p

    if bit == 1:
        report_one_probability = p
    elif bit == 0:
        report_one_probability = q
    else:
        raise ValueError("bit must be 0 or 1, got {!r}".format(bit))

    # One uniform draw per bit, exactly as before.
    s = np.random.random()
    return 1 if s <= report_one_probability else 0

# test the perturbation, epsilon = 5.0
print('\n'.join(
    str(sym_perturbation(encoding(race)))
    for race in ('Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other', 'White')
))
print()

# test the perturbation, epsilon = .1
print('\n'.join(
    str(sym_perturbation(encoding(race), epsilon = .1))
    for race in ('Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other', 'White')
))

[1, 0, 0, 0, 0]
[0, 1, 0, 0, 0]
[0, 0, 1, 0, 0]
[0, 0, 0, 1, 0]
[0, 0, 0, 0, 1]

[1, 1, 0, 0, 1]
[0, 1, 1, 0, 1]
[1, 0, 1, 0, 0]
[1, 0, 0, 0, 1]
[1, 0, 0, 0, 1]


In [None]:
# data perturbation
# Perturb each encoded answer, stack the reports as rows, and total each column.
sym_perturbed_answers = np.array(
    [sym_perturbation(encoding(race)) for race in adult_race]
).sum(axis=0)
[(race, count) for race, count in zip(domain, sym_perturbed_answers)]

[('Amer-Indian-Eskimo', 2851),
 ('Asian-Pac-Islander', 3269),
 ('Black', 5129),
 ('Other', 2590),
 ('White', 26063)]

In [None]:
# the actual number of people of each race
# Pair every domain label with its unperturbed count.
[(race, count) for race, count in zip(domain, answers)]

[('Amer-Indian-Eskimo', 311),
 ('Asian-Pac-Islander', 1039),
 ('Black', 3124),
 ('Other', 271),
 ('White', 27816)]

In [None]:
# symmetric unary encoding - aggregation and estimation
def sym_aggregation_and_estimation(answers, epsilon = 5.0):
    """Estimate per-category counts from symmetrically perturbed reports.

    Each column of ``answers`` is summed and the sum is de-biased with
    ``(sum - n * q) / (p - q)``, where ``n`` is the number of reports,
    ``p = e^(epsilon/2) / (1 + e^(epsilon/2))`` and ``q = 1 - p``.

    Parameters
    ----------
    answers : sequence of sequences of int
        Perturbed reports, one row per report; rows must have equal length
        so they can be summed column-wise.
    epsilon : float, optional
        Privacy parameter; must match the value used for perturbation.
        With ``epsilon == 0`` the denominator ``p - q`` is zero and no
        estimate can be formed.

    Returns
    -------
    list of int
        One estimate per column, rounded to the nearest integer. An empty
        ``answers`` yields an empty list.
    """
    n = len(answers)
    if n == 0:
        # No reports: nothing to sum, and the column count is unknown.
        return []

    # e^x / (1 + e^x) == (1 + tanh(x / 2)) / 2. With x = epsilon / 2 this is
    # the same p without the OverflowError a large epsilon used to cause.
    p = 0.5 * (1.0 + math.tanh(epsilon / 4))
    q = 1 - p

    sums = np.sum(answers, axis=0)

    # Round to nearest instead of truncating: int() alone cuts toward zero
    # and systematically shrinks every estimate.
    return [int(round(float((i - n * q) / (p - q)))) for i in sums]

In [None]:
# data aggregation and estimation
# Keep the individual perturbed reports (not their sum) for the estimator.
sym_perturbed_answers = list(map(lambda race: sym_perturbation(encoding(race)), adult_race))
estimated_answers = sym_aggregation_and_estimation(sym_perturbed_answers)
[(race, estimate) for race, estimate in zip(domain, estimated_answers)]

[('Amer-Indian-Eskimo', 215),
 ('Asian-Pac-Islander', 1082),
 ('Black', 3180),
 ('Other', 196),
 ('White', 27791)]

In [None]:
# the actual number of people of each race
# Pair every domain label with its unperturbed count.
[(race, count) for race, count in zip(domain, answers)]

[('Amer-Indian-Eskimo', 311),
 ('Asian-Pac-Islander', 1039),
 ('Black', 3124),
 ('Other', 271),
 ('White', 27816)]

In [None]:
################################################################################

In [None]:
# optimized unary encoding - perturbation
def opt_perturbation(encoded_ans, epsilon = 5.0):
    """Perturb every bit of an encoded answer with the optimized scheme.

    Returns a new list holding ``opt_perturb_bit(bit, epsilon)`` for each
    bit of ``encoded_ans``, in the original order.
    """
    noisy_bits = []
    for bit in encoded_ans:
        noisy_bits.append(opt_perturb_bit(bit, epsilon))
    return noisy_bits

def opt_perturb_bit(bit, epsilon = 5.0):
    """Randomly report one bit of a unary-encoded answer (optimized scheme).

    A 1-bit is reported as 1 with probability ``p = 1/2``; a 0-bit is
    reported as 1 with probability ``q = 1 / (1 + e^epsilon)``.

    Parameters
    ----------
    bit : int
        The true bit; must compare equal to 0 or 1.
    epsilon : float, optional
        Privacy parameter used to derive ``q``.

    Returns
    -------
    int
        The reported bit, 0 or 1.

    Raises
    ------
    ValueError
        If ``bit`` is neither 0 nor 1. The previous implementation fell
        through and returned ``None`` in that case.
    """
    p = 1 / 2
    # 1 / (1 + e^x) == (1 - tanh(x / 2)) / 2. Same q, but tanh saturates
    # instead of raising OverflowError the way e^epsilon does for a large
    # epsilon.
    q = 0.5 * (1.0 - math.tanh(epsilon / 2))

    if bit == 1:
        report_one_probability = p
    elif bit == 0:
        report_one_probability = q
    else:
        raise ValueError("bit must be 0 or 1, got {!r}".format(bit))

    # One uniform draw per bit, exactly as before.
    s = np.random.random()
    return 1 if s <= report_one_probability else 0

# test the perturbation, epsilon = 5.0
print('\n'.join(
    str(opt_perturbation(encoding(race)))
    for race in ('Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other', 'White')
))
print()

# test the perturbation, epsilon = .1
print('\n'.join(
    str(opt_perturbation(encoding(race), epsilon = .1))
    for race in ('Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other', 'White')
))

[0, 0, 0, 0, 0]
[0, 1, 0, 0, 0]
[0, 0, 1, 0, 0]
[0, 0, 0, 0, 1]
[0, 0, 0, 0, 0]

[1, 0, 0, 0, 1]
[0, 0, 1, 0, 1]
[1, 1, 0, 0, 1]
[0, 1, 1, 1, 1]
[1, 0, 1, 0, 0]


In [None]:
# data perturbation
# Perturb each encoded answer, stack the reports as rows, and total each column.
opt_perturbed_answers = np.array(
    [opt_perturbation(encoding(race)) for race in adult_race]
).sum(axis=0)
[(race, count) for race, count in zip(domain, opt_perturbed_answers)]

[('Amer-Indian-Eskimo', 397),
 ('Asian-Pac-Islander', 696),
 ('Black', 1796),
 ('Other', 386),
 ('White', 13976)]

In [None]:
# the actual number of people of each race
# Pair every domain label with its unperturbed count.
[(race, count) for race, count in zip(domain, answers)]

[('Amer-Indian-Eskimo', 311),
 ('Asian-Pac-Islander', 1039),
 ('Black', 3124),
 ('Other', 271),
 ('White', 27816)]

In [None]:
# optimized unary encoding - aggregation and estimation
def opt_aggregation_and_estimation(answers, epsilon = 5.0):
    """Estimate per-category counts from reports perturbed by the optimized scheme.

    Each column of ``answers`` is summed and the sum is de-biased with
    ``(sum - n * q) / (p - q)``, where ``n`` is the number of reports,
    ``p = 1/2`` and ``q = 1 / (1 + e^epsilon)``.

    Parameters
    ----------
    answers : sequence of sequences of int
        Perturbed reports, one row per report; rows must have equal length
        so they can be summed column-wise.
    epsilon : float, optional
        Privacy parameter; must match the value used for perturbation.
        With ``epsilon == 0`` the denominator ``p - q`` is zero and no
        estimate can be formed.

    Returns
    -------
    list of int
        One estimate per column, rounded to the nearest integer. An empty
        ``answers`` yields an empty list.
    """
    n = len(answers)
    if n == 0:
        # No reports: nothing to sum, and the column count is unknown.
        return []

    p = 1 / 2
    # 1 / (1 + e^x) == (1 - tanh(x / 2)) / 2. Same q without the
    # OverflowError a large epsilon used to cause.
    q = 0.5 * (1.0 - math.tanh(epsilon / 2))

    sums = np.sum(answers, axis=0)

    # Round to nearest instead of truncating: int() alone cuts toward zero
    # and systematically shrinks every estimate.
    return [int(round(float((i - n * q) / (p - q)))) for i in sums]

In [None]:
# data aggregation and estimation
# Keep the individual perturbed reports (not their sum) for the estimator.
opt_perturbed_answers = list(map(lambda race: opt_perturbation(encoding(race)), adult_race))
estimated_answers = opt_aggregation_and_estimation(opt_perturbed_answers)
[(race, estimate) for race, estimate in zip(domain, estimated_answers)]

[('Amer-Indian-Eskimo', 298),
 ('Asian-Pac-Islander', 1070),
 ('Black', 3107),
 ('Other', 283),
 ('White', 28215)]

In [None]:
# the actual number of people of each race
# Pair every domain label with its unperturbed count.
[(race, count) for race, count in zip(domain, answers)]

[('Amer-Indian-Eskimo', 311),
 ('Asian-Pac-Islander', 1039),
 ('Black', 3124),
 ('Other', 271),
 ('White', 27816)]