In [26]:
import random
import math
import statistics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [27]:
# Load the census file and record its size before any cleaning.
dataset = pd.read_csv("adult.csv")
initial_no_rows = len(dataset)

# Treat '?' as a missing value, then discard every row that contains one.
dataset = dataset.replace({'?':np.nan})
dataset = dataset.dropna()
clean_no_rows = len(dataset)

# Report how many rows the cleaning step removed.
print(
    "Number of rows in the original dataset: {}".format(initial_no_rows),
    "Number of rows deleted: {}".format(initial_no_rows - clean_no_rows),
    "Number of rows in the clean dataset: {}".format(clean_no_rows),
    sep="\n",
)

Number of rows in the original dataset: 48842
Number of rows deleted: 3620
Number of rows in the clean dataset: 45222


In [28]:
# Keep only the rows whose income label contains ">50K".
dataset = dataset.loc[lambda frame: frame['income'].str.contains(">50K")]
# Row count per education level among the remaining rows (shown as the cell output).
dataset.education.value_counts()

Bachelors       3178
HS-grad         2416
Some-college    1990
Masters         1393
Prof-school      592
Assoc-voc        504
Doctorate        399
Assoc-acdm       398
11th              89
10th              82
7th-8th           55
12th              43
9th               38
5th-6th           22
1st-4th            8
Preschool          1
Name: education, dtype: int64

In [29]:
class ExponentialMechanism:
    """Exponential mechanism over the education levels found in ``dataset``.

    Every distinct value of the 'education' column is a candidate output.
    Its utility score is its row count divided by 20 (see ``query``), and a
    candidate is sampled with probability proportional to
    ``exp(epsilon * score / (2 * sensitivity))``.

    Attributes:
        dataset: The DataFrame passed to the constructor; it must contain an
            'education' column.
        sensitivity: Sensitivity used in the exponent, fixed at 1.
        epsilon: Privacy parameter used in the exponent.
        distribution: Dict mapping each education level to its row count, in
            ``value_counts()`` order.
    """

    def __init__(self, dataset, epsilon=1):
        self.dataset = dataset
        self.sensitivity = 1 # Step 2
        self.epsilon = epsilon
        # Count the column once and reuse the result for both keys and values.
        counts = dataset['education'].value_counts()
        self.distribution = dict(zip(counts.index.tolist(), list(counts)))

    # Step 1
    def query(self, education_level):
        """Return the utility score of ``education_level``: its count / 20.

        Raises:
            KeyError: If ``education_level`` is not present in the data.
        """
        return self.distribution[education_level] / 20

    # Step 3
    def compute_all(self):
        """Return the utility score of every level, in ``distribution`` order."""
        return [self.query(level) for level in self.distribution]

    def _exponent(self, education_level):
        """Return ``epsilon * score / (2 * sensitivity)`` for one level."""
        return (self.epsilon * self.query(education_level)) / (2 * self.sensitivity)

    def get_denominator(self):
        """Return the unnormalised weight total ``sum(exp(exponent))``.

        This is the raw (unshifted) sum, so ``math.exp`` can raise
        ``OverflowError`` for large exponents. ``get_probabilities`` does not
        rely on this method for that reason.
        """
        return sum(math.exp(self._exponent(level)) for level in self.distribution)

    def get_probabilities(self):
        """Return the sampling probability of every level, in ``distribution`` order.

        The largest exponent is subtracted from all exponents before calling
        ``math.exp``. The ratio is mathematically unchanged, but the largest
        term becomes ``exp(0) == 1`` so the computation cannot overflow, and
        the normaliser is computed once instead of once per level.
        """
        exponents = [self._exponent(level) for level in self.distribution]
        if not exponents:
            return []

        shift = max(exponents)
        weights = [math.exp(exponent - shift) for exponent in exponents]
        total = sum(weights)
        return [weight / total for weight in weights]

    # Step 4
    def pick_r(self, return_label=False):
        """Sample one candidate according to ``get_probabilities``.

        Args:
            return_label: When False (the default), return the sampled
                level's row count. When True, return the education level
                itself, which stays unambiguous even if two levels share
                the same count.

        Returns:
            The row count (default) or the name of the sampled level.
        """
        probabilities = self.get_probabilities()

        if return_label:
            population = list(self.distribution.keys())
        else:
            population = list(self.distribution.values())

        return random.choices(population, weights=probabilities)[0]
    
class LaplaceMechanism:
    """Laplace mechanism for education-level counts in ``dataset``.

    A count query is answered by adding a single draw of Laplace noise with
    location ``mu`` and scale ``sensitivity / epsilon`` to the true count.

    Attributes:
        dataset: The DataFrame passed to the constructor; it must contain an
            'education' column.
        sensitivity: Sensitivity used for the noise scale, fixed at 1.
        mu: Location parameter of the Laplace noise.
        epsilon: Privacy parameter the scale was derived from.
        scale: Scale parameter of the Laplace noise (``sensitivity / epsilon``).
        distribution: Dict mapping each education level to its row count, in
            ``value_counts()`` order.
    """

    def __init__(self, dataset, mu=0.0, epsilon=1):
        self.dataset = dataset
        self.sensitivity = 1 # step 2
        self.mu = mu
        self.epsilon = epsilon
        # Division raises ZeroDivisionError when epsilon is 0.
        self.scale = self.sensitivity / epsilon
        # Count the column once and reuse the result for both keys and values.
        counts = dataset['education'].value_counts()
        self.distribution = dict(zip(counts.index.tolist(), list(counts)))

    def query(self, education_level='Bachelors'):
        """Return the true row count of ``education_level``.

        Defaults to 'Bachelors' so existing zero-argument calls keep working.

        Raises:
            KeyError: If ``education_level`` is not present in the data.
        """
        return self.distribution[education_level]

    def draw_samples(self):
        """Return one draw from Laplace(``mu``, ``scale``) via ``np.random.laplace``."""
        return np.random.laplace(self.mu, self.scale)

    def get_noisy_data(self, education_level='Bachelors'):
        """Return the count of ``education_level`` plus one Laplace noise draw."""
        return self.query(education_level) + self.draw_samples()

In [30]:
# Draw one output from the exponential mechanism with epsilon=1 and print it.
# pick_r() returns the sampled education level's row count, not its name.
exponential_mechanism = ExponentialMechanism(dataset, epsilon=1)
print(exponential_mechanism.pick_r())

3178


In [34]:
# For each epsilon, sample the exponential mechanism 100 times and report how
# often it returns the count of the most frequent education level.
epsilon_values = [0.0001, 0.001, 0.01, 0.1]

for epsilon in epsilon_values:
    # The mechanism depends only on the data and epsilon, so build it once
    # per epsilon rather than once per trial.
    exponential_mechanism = ExponentialMechanism(dataset, epsilon=epsilon)
    # The "correct" answer is the largest count, derived from the data
    # instead of being hard-coded.
    true_count = max(exponential_mechanism.distribution.values())
    returned_values = [exponential_mechanism.pick_r() for _ in range(100)]

    print(f'Epsilon: {epsilon}, Accuracy: {returned_values.count(true_count) / len(returned_values)}')


Epsilon: 0.0001, Accuracy: 0.02
Epsilon: 0.001, Accuracy: 0.07
Epsilon: 0.01, Accuracy: 0.15
Epsilon: 0.1, Accuracy: 0.84


In [33]:
# For each epsilon, query the Laplace mechanism 100 times and report the mean
# of the noisy counts it returned.
epsilon_values = [0.0001, 0.001, 0.01, 0.1]

for epsilon in epsilon_values:
    # Named for what it holds, so the exponential mechanism object from the
    # earlier cells is not overwritten. Built once per epsilon because it
    # does not change between trials.
    laplace_mechanism = LaplaceMechanism(dataset, epsilon=epsilon)
    returned_values = [laplace_mechanism.get_noisy_data() for _ in range(100)]

    print(f'Epsilon: {epsilon}, Outcome: {sum(returned_values) / len(returned_values)}')

Epsilon: 0.0001, Outcome: 2846.7295687090455
Epsilon: 0.001, Outcome: 2951.067144789
Epsilon: 0.01, Outcome: 3162.6499558074925
Epsilon: 0.1, Outcome: 3179.91315869006
