# CS 3110/5110: Data Privacy
## Homework 10

In [1]:
# Load the data and libraries
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

def laplace_mech(v, sensitivity, epsilon):
    """Return `v` plus one draw of zero-centred Laplace noise.

    The noise scale is `sensitivity / epsilon`; the draw comes from NumPy's
    global random state.
    """
    noise_scale = sensitivity / epsilon
    noise = np.random.laplace(loc=0, scale=noise_scale)
    return v + noise

def gaussian_mech(v, sensitivity, epsilon, delta):
    """Return `v` plus one draw of zero-mean Gaussian noise.

    The standard deviation is
    `sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon`; the draw comes from
    NumPy's global random state.
    """
    sigma = sensitivity * np.sqrt(2 * np.log(1.25 / delta)) / epsilon
    noise = np.random.normal(loc=0, scale=sigma)
    return v + noise

def gaussian_mech_vec(vec, sensitivity, epsilon, delta):
    """Add an independent Gaussian noise draw to every element of `vec`.

    Each element gets its own draw with standard deviation
    `sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon`. The result is a
    plain Python list in the same order as `vec`.
    """
    def add_noise(entry):
        # One fresh draw per element, in iteration order.
        sigma = sensitivity * np.sqrt(2 * np.log(1.25 / delta)) / epsilon
        return entry + np.random.normal(loc=0, scale=sigma)

    return list(map(add_noise, vec))

def pct_error(orig, priv):
    """Return the percent error of `priv` relative to the true value `orig`.

    Computed as `|orig - priv| / |orig| * 100`. The denominator uses the
    magnitude of `orig` so the result is non-negative even when the true value
    is negative. For positive `orig` the result is the same as dividing by
    `orig` directly. `orig == 0` is not handled specially.
    """
    return np.abs(orig - priv) / np.abs(orig) * 100.0

# Load the Adult dataset (the "with PII" CSV) straight from the course's GitHub
# repository. Reading from the URL needs network access each time this cell runs.
adult = pd.read_csv('https://github.com/jnear/cs3110-data-privacy/raw/main/homework/adult_with_pii.csv')

## Question 1 (10 points)

Implement a function `dp_marginal` that calculates a differentially private one-way marginal for a given column of the adult dataset.

In [5]:
def dp_marginal(col, epsilon):
    """Return a differentially private one-way marginal for `adult[col]`.

    The count of every distinct value is perturbed independently with
    `laplace_mech` (sensitivity 1), negative noisy counts are clipped to 0,
    and the clipped counts are normalized to sum to 1.

    Parameters
    ----------
    col : str
        Name of a column in the module-level `adult` DataFrame.
    epsilon : float
        Privacy parameter forwarded to `laplace_mech` for each count.

    Returns
    -------
    pandas.Series
        Probabilities indexed by the values reported by `value_counts()`
        (which leaves out missing values by default). If every noisy count
        is clipped to 0, a uniform distribution over those values is returned
        instead of dividing by zero.
    """
    # Get count of each value
    counts = adult[col].value_counts()

    # Apply the Laplace mechanism to each count separately (one draw per value)
    noisy_counts = counts.apply(laplace_mech, args=(1, epsilon))

    # Noise can push small counts below zero; probabilities must be non-negative
    non_negative = np.clip(noisy_counts, 0, None)
    total = np.sum(non_negative)

    if total > 0:
        # Normalize to probabilities out of 1
        return non_negative / total

    # Degenerate case (e.g. very small epsilon): all mass was clipped away.
    # Dividing by 0 would give NaN probabilities that np.random.choice rejects,
    # so fall back to a uniform distribution over the known values.
    return pd.Series(1.0 / max(len(non_negative), 1),
                     index=non_negative.index,
                     name=non_negative.name,
                     dtype=float)

# Example: noisy marginal over the Occupation column with epsilon = 1.0
dp_marginal('Occupation', 1.0)

Occupation
Prof-specialty       0.134782
Craft-repair         0.133452
Exec-managerial      0.132374
Adm-clerical         0.122793
Sales                0.118881
Other-service        0.107302
Machine-op-inspct    0.065194
Transport-moving     0.051805
Handlers-cleaners    0.044609
Farming-fishing      0.032362
Tech-support         0.030211
Protective-serv      0.021057
Priv-house-serv      0.004856
Armed-Forces         0.000322
Name: count, dtype: float64

In [6]:
# TEST CASE
# Numeric column: check one value with a larger share and one with a tiny share.
marginal = dp_marginal('Age', 1.0)
assert 0.02 < marginal[36] < 0.03
assert 0.00005 < marginal[85] < 0.0005

# Categorical column: same idea with string labels.
marginal = dp_marginal('Occupation', 1.0)
assert 0.13 < marginal['Prof-specialty'] < 0.14
assert 0.02 < marginal['Protective-serv'] < 0.03

## Question 2 (10 points)

Implement a function `dp_synthetic_data` that generates `n` samples of synthetic data for the given columns, by creating one-way marginals for *each column* and then sampling synthetic data elements for each column separately.

In [13]:
def dp_synthetic_data(cols, n, epsilon):
    """Sample `n` synthetic rows with one independently drawn column per name in `cols`.

    For every column a noisy one-way marginal is built with `dp_marginal` and
    `n` values are drawn from it. Columns are sampled separately, so
    relationships between columns are not carried over. Each `dp_marginal`
    call receives the full `epsilon`.

    Returns a DataFrame with the columns in the order given by `cols`.
    """
    synthetic = pd.DataFrame()
    for name in cols:
        distribution = dp_marginal(name, epsilon)
        # Labels come from the marginal's index, weights from its values.
        synthetic[name] = np.random.choice(distribution.index, size=n, p=distribution)

    return synthetic

# Example: 100 synthetic rows over five columns, each column sampled from its own noisy marginal
dp_synthetic_data(['Age', 'Occupation', 'Marital Status', 'Education', 'Relationship'], 100, 1.0)

Unnamed: 0,Age,Occupation,Marital Status,Education,Relationship
0,34,Prof-specialty,Divorced,HS-grad,Unmarried
1,52,Machine-op-inspct,Never-married,HS-grad,Unmarried
2,27,Handlers-cleaners,Divorced,1st-4th,Unmarried
3,24,Craft-repair,Married-civ-spouse,Some-college,Husband
4,50,Adm-clerical,Never-married,HS-grad,Husband
...,...,...,...,...,...
95,33,Prof-specialty,Never-married,Some-college,Husband
96,58,Adm-clerical,Never-married,HS-grad,Not-in-family
97,38,Tech-support,Married-civ-spouse,Some-college,Own-child
98,24,Exec-managerial,Never-married,Some-college,Own-child


In [14]:
# TEST CASE
# For each numeric column, a full-size synthetic sample must stay within the
# given Wasserstein distance of the real column.
for column, bound in (('Age', 0.2), ('Education-Num', 0.03)):
    synthetic_column = dp_synthetic_data([column], len(adult), 1.0)[column]
    assert stats.wasserstein_distance(synthetic_column, adult[column]) < bound

## Question 3 (10 points)

Implement a function `dp_two_marginal` that builds a 2-way marginal with differential privacy.

In [None]:
def dp_two_marginal(col1, col2, epsilon):
    """Return a differentially private two-way marginal for `adult[col1]` x `adult[col2]`.

    A contingency table of the two columns is built with `pd.crosstab`, so
    every combination of an observed `col1` value with an observed `col2`
    value gets a cell, including combinations that never occur together.
    Each cell count is perturbed independently with `laplace_mech`
    (sensitivity 1), clipped at 0, and the cells are normalized to sum to 1.

    Parameters
    ----------
    col1, col2 : str
        Column names in the module-level `adult` DataFrame.
    epsilon : float
        Privacy parameter forwarded to `laplace_mech` for each cell.

    Returns
    -------
    pandas.DataFrame
        Long-format table with one row per cell and three columns: `col1`,
        `col2` and `'probability'`. If every noisy count is clipped to 0 the
        probabilities are uniform over the cells.

    Notes
    -----
    Rows of `adult` with a missing value in either column are expected to be
    left out by `pd.crosstab`'s default handling of NaN.
    """
    table = pd.crosstab(adult[col1], adult[col2])

    # Reshape wide table -> one row per (col1 value, col2 value) cell.
    cells = table.reset_index().melt(id_vars=[col1],
                                     var_name=col2,
                                     value_name='probability')

    # One independent Laplace draw per cell, then drop negative mass.
    noisy = cells['probability'].apply(laplace_mech, args=(1, epsilon))
    non_negative = np.clip(noisy, 0, None)
    total = non_negative.sum()

    if total > 0:
        cells['probability'] = non_negative / total
    else:
        # All mass clipped away: avoid NaN from 0/0 by using a uniform table.
        cells['probability'] = 1.0 / max(len(cells), 1)

    return cells

# Example: two-way marginal over Education and Workclass with epsilon = 1.0
dp_two_marginal('Education', 'Workclass', 1.0)

In [None]:
# TEST CASE
marginal = dp_two_marginal('Education', 'Workclass', 1.0)

# Row selectors for the two cells being checked.
hs_grad_private = (marginal['Education'] == 'HS-grad') & (marginal['Workclass'] == 'Private')
bachelors_federal = (marginal['Education'] == 'Bachelors') & (marginal['Workclass'] == 'Federal-gov')

m1 = marginal.loc[hs_grad_private, 'probability'].values[0]
m2 = marginal.loc[bachelors_federal, 'probability'].values[0]
print(m1, m2)
assert 0.24 < m1 < 0.26
assert 0.005 < m2 < 0.007

## Question 4 (30 points)

Implement a function `dp_synthetic_data_two_marginal` that generates synthetic data for the `Age`, `Workclass`, `Occupation`, and `Education` columns *while preserving correlations between them* by using overlapping 2-way marginals.

In [None]:
def dp_synthetic_data_two_marginal(n, epsilon):
    """Generate `n` synthetic rows for Age, Workclass, Occupation and Education.

    Correlations are carried along a chain of three overlapping two-way
    marginals: (Age, Workclass), (Workclass, Occupation) and
    (Occupation, Education). The first pair is sampled jointly; each later
    column is then sampled conditioned on the previously generated column.

    Every table is a `pd.crosstab` of the module-level `adult` DataFrame with
    independent Laplace noise of scale `1 / epsilon` added to each cell and
    negative results clipped to 0. Each of the three tables uses the full
    `epsilon` (the same per-marginal convention `dp_synthetic_data` follows),
    so under sequential composition the total cost is `3 * epsilon`.

    Parameters
    ----------
    n : int
        Number of synthetic rows to generate.
    epsilon : float
        Privacy parameter used for each two-way marginal.

    Returns
    -------
    pandas.DataFrame
        `n` rows with columns `Age`, `Workclass`, `Occupation`, `Education`.
    """
    chain = ['Age', 'Workclass', 'Occupation', 'Education']

    def noisy_table(col1, col2):
        # Noisy, non-negative contingency table (rows: col1 values, columns: col2 values).
        table = pd.crosstab(adult[col1], adult[col2]).astype(float)
        noise = np.random.laplace(loc=0, scale=1 / epsilon, size=table.shape)
        return (table + noise).clip(lower=0)

    def draw(weights, size):
        # Sample `size` labels from a Series of non-negative weights (uniform if all weight is 0).
        total = weights.sum()
        probs = (weights / total).to_numpy() if total > 0 else None
        return np.random.choice(weights.index.to_numpy(), size=size, p=probs)

    # --- First pair: sample (Age, Workclass) cells jointly -------------------
    first = noisy_table(chain[0], chain[1])
    cell_weights = first.to_numpy().ravel()
    cell_total = cell_weights.sum()
    cell_probs = cell_weights / cell_total if cell_total > 0 else None
    flat_cells = np.random.choice(cell_weights.size, size=n, p=cell_probs)
    row_pos, col_pos = np.unravel_index(flat_cells, first.shape)

    columns = {
        chain[0]: first.index.to_numpy()[row_pos],
        chain[1]: first.columns.to_numpy()[col_pos],
    }

    # --- Remaining columns: sample each one conditioned on its predecessor ---
    for given, target in zip(chain[1:], chain[2:]):
        table = noisy_table(given, target)
        # Distribution of `target` pooled over all `given` values; used as a fallback.
        overall = table.sum(axis=0)
        given_values = columns[given]
        sampled = np.empty(n, dtype=object)

        for value in pd.unique(given_values):
            mask = given_values == value
            # Each crosstab drops its own missing-value rows, so a value drawn
            # from the previous table may have no row here, or a row whose
            # noisy counts were all clipped to 0.
            weights = table.loc[value] if value in table.index else overall
            if weights.sum() <= 0:
                weights = overall
            sampled[mask] = draw(weights, int(mask.sum()))

        columns[target] = sampled

    return pd.DataFrame(columns, columns=chain)

# Example: 100 synthetic rows with epsilon = 1.0
dp_synthetic_data_two_marginal(100, 1.0)

In [None]:
# TEST CASE
synthetic_data = dp_synthetic_data_two_marginal(100, 1.0)

# Summary statistics of the sample: mean age and the number of rows carrying
# one specific label in each categorical column.
s1 = synthetic_data['Age'].mean()
s2 = int((synthetic_data['Workclass'] == 'Private').sum())
s3 = int((synthetic_data['Occupation'] == 'Adm-clerical').sum())
s4 = int((synthetic_data['Education'] == 'Bachelors').sum())

print(s1, s2, s3, s4)

assert 35 < s1 < 45
assert 65 < s2 < 90
assert 5 < s3 < 25
assert 5 < s4 < 35