Data Privacy Final Project 

In [9]:
# Load the data and libraries
import pandas as pd
import numpy as np
import random
from scipy import stats
import matplotlib.pyplot as plt

def laplace_mech(v, sensitivity, epsilon):
    return v + np.random.laplace(loc=0, scale=sensitivity / epsilon)

def laplace_mech_vec(vec, sensitivity, epsilon):
    return [v + np.random.laplace(loc=0, scale=sensitivity / epsilon) for v in vec]

def gaussian_mech(v, sensitivity, epsilon, delta):
    return v + np.random.normal(loc=0, scale=sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon)

def gaussian_mech_vec(vec, sensitivity, epsilon, delta):
    return [v + np.random.normal(loc=0, scale=sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon)
            for v in vec]

def pct_error(orig, priv):
    return np.abs(orig - priv)/orig * 100.0

# preserves epsilon-differential privacy
def above_threshold(query_results, T, epsilon):
    T_hat = T + np.random.laplace(loc=0, scale = 2/epsilon)
    
    for idx, q in enumerate(query_results):
        nu_i = np.random.laplace(loc=0, scale = 4/epsilon)
        if q + nu_i >= T_hat:
            return idx
    return None

math = pd.read_csv("student_math_clean.csv")
math.head(5)

Unnamed: 0,student_id,school,sex,age,address_type,family_size,parent_status,mother_education,father_education,mother_job,...,family_relationship,free_time,social,weekday_alcohol,weekend_alcohol,health,absences,grade_1,grade_2,final_grade
0,1,GP,F,18,Urban,Greater than 3,Apart,higher education,higher education,at_home,...,4,3,4,1,1,3,6,5,6,6
1,2,GP,F,17,Urban,Greater than 3,Living together,primary education (4th grade),primary education (4th grade),at_home,...,5,3,3,1,1,3,4,5,5,6
2,3,GP,F,15,Urban,Less than or equal to 3,Living together,primary education (4th grade),primary education (4th grade),at_home,...,4,3,2,2,3,3,10,7,8,10
3,4,GP,F,15,Urban,Greater than 3,Living together,higher education,5th to 9th grade,health,...,3,2,2,1,1,5,2,15,14,15
4,5,GP,F,16,Urban,Greater than 3,Living together,secondary education,secondary education,other,...,4,3,2,1,2,5,4,6,10,10


In [10]:
## Cache the sorted ages, because we will use them a lot.
absences_lower = 0
absences_upper = 20
sorted_absences = math['absences'].clip(lower=absences_lower, upper=absences_upper).sort_values()

#what is the upper bound of the data - b

# we wrote a function that found b - to know where to clip 
# privataize that clipped idea data under that 
# by apply gauss and laplace

def min_absences():
    clipped_absences = math['absences'].clip(lower=0, upper=100)
    return clipped_absences.min()

def max_absences():
    clipped_absences = math['absences'].clip(lower=0, upper=100)
    return clipped_absences.max()

def ls_min():
    return max(sorted_absences.iloc[0] - absences_lower, sorted_absences.iloc[1] - sorted_absences.iloc[0])

print('Actual minimum absences:', min_absences())
print('Local sensitivity of the minimum:', ls_min())
print('max', max_absences())


Actual minimum absences: 0
Local sensitivity of the minimum: 0
max 75


In [15]:
bs = list(range(0, 75, 5)) #(min, max, step)
df = math['absences']

#find upper cliping param for mean_abscences 
def upper(b_lower, epsilon):
        query_results = [df.clip(lower=b_lower, upper=b).sum() - \
                         df.clip(lower=b_lower, upper=b+1).sum() for b in bs]
        idx = above_threshold(query_results, 0, epsilon)
        
        if idx == None:
            return bs[-1]
        else:
            return bs[idx]

In [14]:
# finds upper and lower, and adds laplace differenial noise
def mean_abscence(epsilon):
    b_lower = 0
    b_upper = upper(b_lower, epsilon / 3) #finds upper clipping param
    
    noisy_sum = laplace_mech(df.clip(lower=b_lower, upper=b_upper).sum(), np.abs(b_upper - b_lower), epsilon / 3)
    noisy_count = laplace_mech(len(df), 1, epsilon / 3)
    
    return noisy_sum / noisy_count

    
for epsilon in [0.001, 0.01, 0.1, 0.5, 1, 10]:
    print(f"epsilon: {epsilon}, mean abscence: {mean_abscence(epsilon)}")

epsilon: 0.001, mean abscence: 0.0
epsilon: 0.01, mean abscence: -0.0
epsilon: 0.1, mean abscence: 2.664035040274383
epsilon: 0.5, mean abscence: 3.2791534273755074
epsilon: 1, mean abscence: 5.385933283743881
epsilon: 10, mean abscence: 5.656371253991569


In [None]:
#run gaussian by setting an arbitray delta value .0001 could be fine 


In [None]:
#graph and compare the two

In [6]:
# not here!
# preserves epsilon-differential privacy
def above_threshold(query_results, T, epsilon):
    T_hat = T + np.random.laplace(loc=0, scale = 2/epsilon)
    
    for idx, q in enumerate(query_results):
        nu_i = np.random.laplace(loc=0, scale = 4/epsilon)
        if q + nu_i >= T_hat:
            return idx
    return None
def above_15(query_results, epsilon):
    ### BEGIN SOLUTION
    T = 15
    idx = above_threshold(query_results, T, epsilon/2)
    val = laplace_mech(query_results[idx], 1, epsilon/2)
    return val
    ### END SOLUTION

queries = math['absences'].value_counts()
print(f"above_15 #1: {above_15(queries, 100)}")
print(f"above_15 #2: {above_15(queries, 1)}")
print(f"above_15 #3: {above_15(queries, .01)}")

above_15 #1: 114.99887956057044
above_15 #2: 111.90112217102819
above_15 #3: 30.240166167322588


In [12]:
def bounded_all_above_10000(query_results, c, epsilon):
    ### BEGIN SOLUTION
    answers = []
    pos = 0
    T = 15
    epsilon_i = epsilon / (c*2)
    
    while len(answers) < c:
        answer_idx = above_threshold(query_results[pos:], T, epsilon_i)
        if answer_idx == None:
            break
        pos = pos + answer_idx
        answers.append(laplace_mech(query_results[pos], 1, epsilon_i))
        pos = pos + 1
        
    return answers
    ### END SOLUTION

queries = list(math['absences'].value_counts())
print(f"bounded_all_above_10000 #1: {bounded_all_above_15(queries, 3, 100)}")
print(f"bounded_all_above_10000 #2: {bounded_all_above_15(queries, 3, 1)}")
print(f"bounded_all_above_10000 #3: {bounded_all_above_10000(queries, 3, .01)}")

NameError: name 'bounded_all_above_15' is not defined