In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
import io
import requests
import math

# load data
ADULT_DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
response = requests.get(ADULT_DATA_URL, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as CSV.
response.raise_for_status()
req = response.content
adult = pd.read_csv(
    io.StringIO(req.decode('utf-8')),
    header=None,
    na_values='?',
    delimiter=r", ",
    # A delimiter longer than one character is only supported by the Python
    # parser; naming the engine avoids the implicit fallback and its warning.
    engine="python",
)
# NOTE: this cell used to call `adult.dropna()` without keeping the result, so
# no rows were ever removed from `adult`. That behaviour is kept on purpose:
# the occupation cells drop missing values for column 6 themselves.

# the domains: sorted array of the distinct, non-missing values of column 6
domain = adult[6].dropna().unique()
domain.sort()
domain

  # This is added back by InteractiveShellApp.init_path()


array(['Adm-clerical', 'Armed-Forces', 'Craft-repair', 'Exec-managerial',
       'Farming-fishing', 'Handlers-cleaners', 'Machine-op-inspct',
       'Other-service', 'Priv-house-serv', 'Prof-specialty',
       'Protective-serv', 'Sales', 'Tech-support', 'Transport-moving'],
      dtype=object)

In [None]:
# True head-count per occupation: take column 6, discard missing entries,
# then tally each value and list the tallies in alphabetical order.
adult_occupation = adult.loc[:, 6].dropna()
adult_occupation.value_counts().sort_index()

Adm-clerical         3770
Armed-Forces            9
Craft-repair         4099
Exec-managerial      4066
Farming-fishing       994
Handlers-cleaners    1370
Machine-op-inspct    2002
Other-service        3295
Priv-house-serv       149
Prof-specialty       4140
Protective-serv       649
Sales                3650
Tech-support          928
Transport-moving     1597
Name: 6, dtype: int64

In [None]:
# direct encoding - encoding
def encoding(answer, values = None):
    """Return the position of ``answer`` within the domain.

    Parameters
    ----------
    answer : object
        The value to encode (an occupation string in this notebook).
    values : array-like, optional
        Domain to search. Defaults to the notebook-level ``domain`` array.

    Returns
    -------
    int
        Index of the first element of ``values`` equal to ``answer``.

    Raises
    ------
    ValueError
        If ``answer`` does not occur in ``values``.
    """
    if values is None:
        values = domain
    matches = np.where(np.asarray(values) == answer)[0]
    if matches.size == 0:
        raise ValueError("{!r} is not in the domain".format(answer))
    # Index the match explicitly: calling int() on a whole array is deprecated
    # in recent NumPy releases and fails outright when the array is empty.
    return int(matches[0])

# Smoke-test the encoder on a handful of occupations; each line printed is
# the index of that occupation inside `domain`.
for occupation in ('Armed-Forces', 'Craft-repair', 'Sales', 'Transport-moving'):
    print(encoding(occupation))

1
2
11
13


In [None]:
# direct encoding - perturbation
def perturbation(encoded_ans, epsilon = 5.0, values = None):
    """Randomise one encoded answer using the direct-encoding rule.

    With probability ``p = e^epsilon / (d - 1 + e^epsilon)`` the true value is
    reported. Otherwise one of the *other* ``d - 1`` domain values is reported,
    each equally likely (i.e. with overall probability ``q = (1 - p) / (d - 1)``).

    Parameters
    ----------
    encoded_ans : int
        Index of the true answer within the domain.
    epsilon : float
        Privacy parameter; a larger value keeps the true answer more often.
    values : sequence, optional
        Domain to report from. Defaults to the notebook-level ``domain`` array.

    Returns
    -------
    object
        The reported element of ``values``.
    """
    if values is None:
        values = domain
    # size of the domain set
    d = len(values)
    p = math.exp(epsilon) / (d - 1 + math.exp(epsilon))
    if np.random.random() <= p:
        # return itself with probability p
        return values[encoded_ans]
    # Pick a cyclic offset in 1..d-1 so that every value except the true one
    # is reachable and equally likely. randint's upper bound is exclusive, so
    # the previous randint(0, d - 1) produced offsets 0..d-2: offset 0 gave
    # back the true answer and the value at offset d-1 could never be reported.
    shift = np.random.randint(1, d)
    return values[(encoded_ans + shift) % d]

d = len(domain)
# Try the perturbation at epsilon = 5.0 and then at epsilon = .1, printing the
# four perturbed sample answers followed by the p and q that setting implies.
for budget, label in ((5.0, "epsilon = 5.0 "), (.1, "epsilon = 0.1 ")):
    for occupation in ('Armed-Forces', 'Craft-repair', 'Sales', 'Transport-moving'):
        print(perturbation(encoding(occupation), epsilon = budget))
    # p: probability of reporting the truth; q: probability of each other value
    p = pow(math.e, budget) / (d - 1 + pow(math.e, budget))
    q = (1.0 - p) / (d - 1.0)
    print(label+"p: "+str(p)+" q: "+str(q))
    print()

Armed-Forces
Craft-repair
Sales
Transport-moving
epsilon = 5.0 p: 0.9194613371531957 q: 0.006195281757446487

Craft-repair
Exec-managerial
Armed-Forces
Transport-moving
epsilon = 0.1 p: 0.07835218194055213 q: 0.0708959860045729



In [None]:
# Perturb every respondent's occupation (default epsilon) and collect the
# reports in a one-column frame, then tally the reported values.
perturbed_answers = pd.DataFrame(
    list(map(lambda occupation: perturbation(encoding(occupation)), adult_occupation))
)
perturbed_answers.value_counts().sort_index()

Adm-clerical         3639
Armed-Forces          183
Craft-repair         3934
Exec-managerial      3932
Farming-fishing      1110
Handlers-cleaners    1444
Machine-op-inspct    1998
Other-service        3217
Priv-house-serv       297
Prof-specialty       4017
Protective-serv       765
Sales                3540
Tech-support         1022
Transport-moving     1620
dtype: int64

In [None]:
# True head-count per occupation again (presumably repeated so it can be read
# next to the perturbed tallies): column 6 without missing entries, tallied
# and listed alphabetically.
adult_occupation = adult.loc[:, 6].dropna()
adult_occupation.value_counts().sort_index()

Adm-clerical         3770
Armed-Forces            9
Craft-repair         4099
Exec-managerial      4066
Farming-fishing       994
Handlers-cleaners    1370
Machine-op-inspct    2002
Other-service        3295
Priv-house-serv       149
Prof-specialty       4140
Protective-serv       649
Sales                3650
Tech-support          928
Transport-moving     1597
Name: 6, dtype: int64

In [None]:
# direct encoding - aggregation and estimation
def aggregation_and_estimation(answers, epsilon = 5.0, values = None):
    """Estimate the true count of every domain value from perturbed reports.

    For each domain value with observed count ``c`` the estimate is
    ``(c - n*q) / (p - q)``, truncated to an integer, where ``n`` is the number
    of reports and ``p``/``q`` are the keep/switch probabilities implied by
    ``epsilon`` and the domain size.

    Parameters
    ----------
    answers : pandas.Series, single-column pandas.DataFrame or sequence
        The perturbed reports.
    epsilon : float
        Privacy parameter used when the reports were perturbed. Must be
        non-zero, otherwise ``p == q`` and the estimate is undefined.
    values : sequence, optional
        Domain of possible reports. Defaults to the notebook-level ``domain``.

    Returns
    -------
    list of int
        One estimate per element of ``values``, in the same order. Estimates
        can be negative for rare values.
    """
    if values is None:
        values = domain
    # Flatten so a Series, a one-column DataFrame and a plain list all behave alike.
    reports = pd.Series(np.asarray(answers, dtype=object).ravel())
    n = len(reports)
    d = len(values)
    p = math.exp(epsilon) / (d - 1 + math.exp(epsilon))
    # With a single-value domain there is nothing to switch to.
    q = (1.0 - p) / (d - 1.0) if d > 1 else 0.0

    # Reindex on the domain: a value that was never reported must still get a
    # slot (with count 0), otherwise the result is shorter than the domain and
    # pairing it with the domain labels assigns estimates to the wrong values.
    aggregator = reports.value_counts().reindex(values, fill_value=0)

    return [int((i - n*q) / (p-q)) for i in aggregator]

In [None]:
# Turn the perturbed reports into per-occupation estimates and show each
# estimate next to the occupation it belongs to.
estimated_answers = aggregation_and_estimation(perturbed_answers)
[(occupation, estimate) for occupation, estimate in zip(domain, estimated_answers)]

[('Adm-clerical', 3776),
 ('Armed-Forces', -8),
 ('Craft-repair', 4099),
 ('Exec-managerial', 4097),
 ('Farming-fishing', 1007),
 ('Handlers-cleaners', 1372),
 ('Machine-op-inspct', 1979),
 ('Other-service', 3314),
 ('Priv-house-serv', 116),
 ('Prof-specialty', 4190),
 ('Protective-serv', 629),
 ('Sales', 3667),
 ('Tech-support', 910),
 ('Transport-moving', 1565)]

In [None]:
# True head-count per occupation once more (presumably repeated so it can be
# compared with the estimates): column 6 without missing entries, tallied and
# listed alphabetically.
adult_occupation = adult.loc[:, 6].dropna()
adult_occupation.value_counts().sort_index()

Adm-clerical         3770
Armed-Forces            9
Craft-repair         4099
Exec-managerial      4066
Farming-fishing       994
Handlers-cleaners    1370
Machine-op-inspct    2002
Other-service        3295
Priv-house-serv       149
Prof-specialty       4140
Protective-serv       649
Sales                3650
Tech-support          928
Transport-moving     1597
Name: 6, dtype: int64