# CS 3110/5110: Data Privacy
## Homework 10

In [1]:
# Load the data and libraries
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

def laplace_mech(v, sensitivity, epsilon):
    """Add one draw of Laplace noise to `v`.

    The noise is centred at 0 with scale `sensitivity / epsilon`.

    Args:
        v: Value to perturb.
        sensitivity: Sensitivity used as the numerator of the noise scale.
        epsilon: Privacy parameter used as the denominator of the noise scale.

    Returns:
        `v` plus a single sample from `np.random.laplace`.
    """
    noise_scale = sensitivity / epsilon
    noise = np.random.laplace(loc=0, scale=noise_scale)
    return v + noise

def gaussian_mech(v, sensitivity, epsilon, delta):
    """Add one draw of zero-mean Gaussian noise to `v`.

    The standard deviation is
    `sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon`.

    Args:
        v: Value to perturb.
        sensitivity: Sensitivity factor in the noise scale.
        epsilon: Privacy parameter dividing the noise scale.
        delta: Privacy parameter appearing inside the logarithm.

    Returns:
        `v` plus a single sample from `np.random.normal`.
    """
    sigma = sensitivity * np.sqrt(2 * np.log(1.25 / delta)) / epsilon
    noise = np.random.normal(loc=0, scale=sigma)
    return v + noise

def gaussian_mech_vec(vec, sensitivity, epsilon, delta):
    """Add independent zero-mean Gaussian noise to every element of `vec`.

    Each element gets its own draw with standard deviation
    `sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon`.

    Args:
        vec: Iterable of values to perturb.
        sensitivity: Sensitivity factor in the noise scale.
        epsilon: Privacy parameter dividing the noise scale.
        delta: Privacy parameter appearing inside the logarithm.

    Returns:
        A list with one noisy value per element of `vec`, in order.
    """
    def perturb(element):
        # The scale is evaluated per element, so nothing is computed for an empty `vec`.
        sigma = sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon
        return element + np.random.normal(loc=0, scale=sigma)

    return list(map(perturb, vec))

def pct_error(orig, priv):
    """Return the absolute percent error of `priv` relative to `orig`.

    The error is `|orig - priv| / |orig| * 100`. Dividing by the magnitude of
    `orig` keeps the result non-negative when the true value is negative.

    Args:
        orig: True (non-private) value, scalar or array-like.
        priv: Noisy (private) value, same shape as `orig`.

    Returns:
        Percent error as a non-negative number (or array of numbers). When
        `orig` is 0 the division is undefined and the result follows numpy's
        divide-by-zero behaviour.
    """
    return np.abs(orig - priv) / np.abs(orig) * 100.0

# Read the adult-with-PII CSV straight from GitHub (needs network access).
# `adult` is the module-level DataFrame that the dp_* functions below read from.
adult = pd.read_csv('https://github.com/jnear/cs3110-data-privacy/raw/main/homework/adult_with_pii.csv')

## Question 1 (10 points)

Implement a function `dp_marginal` that calculates a differentially private one-way marginal for a given column of the adult dataset.

In [5]:
def dp_marginal(col, epsilon):
    """Build a noisy one-way marginal (probability table) for one column of `adult`.

    Each value count gets independent Laplace noise (sensitivity 1), negative
    noisy counts are clipped to 0, and the result is normalized to sum to 1.

    Args:
        col: Name of the column in the module-level `adult` DataFrame.
        epsilon: Privacy parameter passed to `laplace_mech` for every count.

    Returns:
        A pandas Series indexed by the column's observed values, holding
        probabilities that sum to 1. If every noisy count is clipped to 0,
        a uniform distribution over the observed values is returned instead
        of NaN. An empty column yields an empty Series.
    """
    # Count of each observed value; value_counts ignores missing entries.
    # NOTE(review): the set of categories is taken from the data itself -- confirm
    # this is acceptable for the intended privacy analysis.
    counts = adult[col].value_counts()

    # Independent Laplace noise per count, then clip so probabilities stay non-negative.
    noisy_counts = counts.apply(laplace_mech, args=(1, epsilon))
    nonneg_counts = noisy_counts.clip(lower=0)

    total = nonneg_counts.sum()
    if not total > 0:
        # All mass was clipped away (or there are no categories): dividing by the
        # total would produce NaN, which np.random.choice rejects as `p`.
        n_values = len(nonneg_counts)
        uniform = np.full(n_values, 1.0 / max(n_values, 1))
        return pd.Series(uniform, index=nonneg_counts.index, name=nonneg_counts.name)

    return nonneg_counts / total

# Example: noisy marginal of Occupation with epsilon = 1.0 (values change on every run).
dp_marginal('Occupation', 1.0)

Occupation
Prof-specialty       0.134782
Craft-repair         0.133452
Exec-managerial      0.132374
Adm-clerical         0.122793
Sales                0.118881
Other-service        0.107302
Machine-op-inspct    0.065194
Transport-moving     0.051805
Handlers-cleaners    0.044609
Farming-fishing      0.032362
Tech-support         0.030211
Protective-serv      0.021057
Priv-house-serv      0.004856
Armed-Forces         0.000322
Name: count, dtype: float64

In [6]:
# TEST CASE
# The marginal is indexed by the column's values, so marginal[36] is the
# probability of Age == 36. The output is randomized, so these are range checks
# rather than exact comparisons and can in principle fail on an unlucky draw.
marginal = dp_marginal('Age', 1.0)
assert marginal[36] > 0.02 and marginal[36] < 0.03
assert marginal[85] > 0.00005 and marginal[85] < 0.0005

# Same range checks for a categorical column.
marginal = dp_marginal('Occupation', 1.0)
assert marginal['Prof-specialty'] > 0.13 and marginal['Prof-specialty'] < 0.14
assert marginal['Protective-serv'] > 0.02 and marginal['Protective-serv'] < 0.03

## Question 2 (10 points)

Implement a function `dp_synthetic_data` that generates `n` samples of synthetic data for the given columns, by creating one-way marginals for *each column* and then sampling synthetic data elements for each column separately.

In [15]:
def dp_synthetic_data(cols, n, epsilon):
    """Sample `n` synthetic rows, drawing every column independently.

    For each requested column a noisy one-way marginal is built with
    `dp_marginal`, and `n` values are sampled from it. Columns are sampled
    separately, so correlations between columns are not preserved.

    Args:
        cols: Iterable of column names of the `adult` dataset.
        n: Number of synthetic rows to generate.
        epsilon: Privacy parameter passed unchanged to `dp_marginal` for
            every column, so by sequential composition the total cost is
            `len(cols) * epsilon`.

    Returns:
        A DataFrame with one column per entry of `cols` and `n` rows.
    """
    sampled_columns = {}
    for name in cols:
        # Noisy probability table for this column: index = values, data = probabilities.
        marginal = dp_marginal(name, epsilon)

        # Draw n values according to those probabilities.
        sampled_columns[name] = np.random.choice(marginal.keys(), n, p=marginal)

    return pd.DataFrame(sampled_columns)

# Example: 100 synthetic rows over five columns; each column's marginal is built with epsilon = 1.0.
dp_synthetic_data(['Age', 'Occupation', 'Marital Status', 'Education', 'Relationship'], 100, 1.0)

Unnamed: 0,Age,Occupation,Marital Status,Education,Relationship
0,34,Transport-moving,Married-civ-spouse,7th-8th,Not-in-family
1,39,Tech-support,Married-spouse-absent,5th-6th,Not-in-family
2,36,Other-service,Married-civ-spouse,Some-college,Not-in-family
3,50,Adm-clerical,Married-civ-spouse,HS-grad,Not-in-family
4,25,Machine-op-inspct,Never-married,Prof-school,Husband
...,...,...,...,...,...
95,51,Prof-specialty,Married-civ-spouse,HS-grad,Husband
96,35,Adm-clerical,Never-married,10th,Not-in-family
97,22,Protective-serv,Married-civ-spouse,11th,Husband
98,21,Craft-repair,Married-civ-spouse,HS-grad,Husband


In [16]:
# TEST CASE
# Generate as many synthetic rows as the real dataset has, then require the
# Wasserstein distance between the synthetic and real column to be small.
# The synthetic data is randomized, so these are threshold checks.
assert stats.wasserstein_distance(dp_synthetic_data(['Age'], len(adult), 1.0)['Age'], adult['Age']) < 0.2
assert stats.wasserstein_distance(dp_synthetic_data(['Education-Num'], len(adult), 1.0)['Education-Num'], 
                                  adult['Education-Num']) < 0.03

## Question 3 (10 points)

Implement a function `dp_two_marginal` that builds a 2-way marginal with differential privacy.

In [32]:
def dp_two_marginal(col1, col2, epsilon):
    """Build a noisy two-way marginal for a pair of columns of `adult`.

    The cross-tabulation of `col1` and `col2` is computed, every cell count
    gets independent Laplace noise (sensitivity 1), negative noisy counts are
    clipped to 0, and the cells are normalized to sum to 1.

    Args:
        col1: Name of the first column in the module-level `adult` DataFrame.
        col2: Name of the second column in `adult`.
        epsilon: Privacy parameter passed to `laplace_mech` for every cell.

    Returns:
        A DataFrame with one row per (col1 value, col2 value) pair, in
        row-major crosstab order, with columns `col1`, `col2` and
        'probability'. If every noisy cell is clipped to 0, a uniform
        distribution over the cells is returned instead of NaN.
    """
    # Contingency table: rows are col1 values, columns are col2 values.
    ct = pd.crosstab(adult[col1], adult[col2])
    n_rows, n_cols = ct.shape

    # Flatten row-major and add independent noise per cell. laplace_mech draws a
    # single noise sample per call, so it must be called once per cell rather
    # than once on the whole array.
    true_counts = ct.to_numpy().ravel()
    noisy_counts = np.array([laplace_mech(c, 1, epsilon) for c in true_counts], dtype=float)
    noisy_counts = np.clip(noisy_counts, 0, None)

    total = noisy_counts.sum()
    if total > 0:
        probs = noisy_counts / total
    else:
        # All mass clipped away (or no cells): avoid NaN probabilities.
        probs = np.full(len(noisy_counts), 1.0 / max(len(noisy_counts), 1))

    # Label each flattened cell: repeat each row label across its columns and
    # tile the column labels once per row, matching the row-major flattening.
    df = pd.DataFrame()
    df[col1] = np.repeat(ct.index.to_numpy(), n_cols)
    df[col2] = np.tile(ct.columns.to_numpy(), n_rows)
    df['probability'] = probs

    return df

# Example: noisy joint distribution of Education and Workclass with epsilon = 1.0.
dp_two_marginal('Education', 'Workclass', 1.0)

Unnamed: 0,Education,Workclass,probability
0,10th,Federal-gov,0.000234
1,10th,Local-gov,0.001028
2,10th,Never-worked,0.000085
3,10th,Private,0.022614
4,10th,Self-emp-inc,0.000629
...,...,...,...
123,Some-college,Private,0.165746
124,Some-college,Self-emp-inc,0.007347
125,Some-college,Self-emp-not-inc,0.015834
126,Some-college,State-gov,0.010577


In [33]:
# TEST CASE
# Look up the probability of two specific (Education, Workclass) cells and check
# that each falls in a fixed range. The marginal is randomized, so these are
# range checks rather than exact comparisons.
marginal = dp_two_marginal('Education', 'Workclass', 1.0)
m1 = marginal[(marginal['Education'] == 'HS-grad') & (marginal['Workclass'] == 'Private')]['probability'].values[0]
m2 = marginal[(marginal['Education'] == 'Bachelors') & (marginal['Workclass'] == 'Federal-gov')]['probability'].values[0]
print(m1, m2)
assert m1 > 0.24 and m1 < 0.26
assert m2 > 0.005 and m2 < 0.007

0.2530754267939819 0.006868817972162109


## Question 4 (30 points)

Implement a function `dp_synthetic_data_two_marginal` that generates synthetic data for the `Age`, `Workclass`, `Occupation`, and `Education` columns *while preserving correlations between them* by using overlapping 2-way marginals.

In [None]:
def _sample_conditional(given_values, marginal, given_col, target_col):
    """Draw one `target_col` value per entry of `given_values` from a 2-way marginal.

    For each distinct given value, the rows of `marginal` matching it are
    renormalized into a conditional distribution over `target_col`. If a value
    has no matching rows or zero total probability, the target column's overall
    distribution (the marginal summed over `given_col`) is used instead.
    """
    given = pd.Series(given_values).reset_index(drop=True)
    drawn = np.empty(len(given), dtype=object)

    # Fallback distribution over target_col, used when conditioning is impossible.
    fallback = marginal.groupby(target_col)['probability'].sum()

    # Sample once per distinct given value rather than once per row.
    for value in given.unique():
        mask = (given == value).to_numpy()
        rows = marginal[marginal[given_col] == value]
        mass = rows['probability'].sum()
        if mass > 0:
            choices = rows[target_col].to_numpy()
            weights = rows['probability'].to_numpy() / mass
        else:
            choices = fallback.index.to_numpy()
            weights = fallback.to_numpy() / fallback.sum()
        drawn[mask] = np.random.choice(choices, size=int(mask.sum()), p=weights)

    return drawn


def dp_synthetic_data_two_marginal(n, epsilon):
    """Generate `n` synthetic rows for Age, Workclass, Occupation and Education.

    Correlations are carried along a chain of overlapping noisy 2-way
    marginals: (Age, Workclass), (Workclass, Occupation) and
    (Occupation, Education). An (Age, Workclass) pair is sampled from the first
    marginal; Occupation is then sampled conditioned on Workclass, and
    Education conditioned on Occupation.

    Args:
        n: Number of synthetic rows to generate.
        epsilon: Total privacy parameter. It is split evenly across the three
            marginals, so their sequential composition costs `epsilon` overall.

    Returns:
        A DataFrame with `n` rows and the columns 'Age', 'Workclass',
        'Occupation' and 'Education'.
    """
    # Three marginals are computed from the same data; give each an equal share.
    eps_each = epsilon / 3.0

    age_work = dp_two_marginal('Age', 'Workclass', eps_each)
    work_occ = dp_two_marginal('Workclass', 'Occupation', eps_each)
    occ_edu = dp_two_marginal('Occupation', 'Education', eps_each)

    # Sample (Age, Workclass) pairs jointly by picking rows of the first marginal.
    picks = np.random.choice(len(age_work), size=n, p=age_work['probability'].to_numpy())
    synthetic = pd.DataFrame({
        'Age': age_work['Age'].to_numpy()[picks],
        'Workclass': age_work['Workclass'].to_numpy()[picks],
    })

    # Extend each row along the chain using the overlapping column as the condition.
    synthetic['Occupation'] = _sample_conditional(
        synthetic['Workclass'], work_occ, 'Workclass', 'Occupation')
    synthetic['Education'] = _sample_conditional(
        synthetic['Occupation'], occ_edu, 'Occupation', 'Education')

    return synthetic

# Example: request 100 synthetic rows with epsilon = 1.0.
dp_synthetic_data_two_marginal(100, 1.0)

In [None]:
# TEST CASE
# Summary statistics over 100 synthetic rows, each checked against a fixed range.
synthetic_data = dp_synthetic_data_two_marginal(100, 1.0)

s1 = synthetic_data['Age'].mean()  # mean synthetic age
s2 = len(synthetic_data[synthetic_data['Workclass'] == 'Private'])  # rows with Workclass == 'Private'
s3 = len(synthetic_data[synthetic_data['Occupation'] == 'Adm-clerical'])  # rows with Occupation == 'Adm-clerical'
s4 = len(synthetic_data[synthetic_data['Education'] == 'Bachelors'])  # rows with Education == 'Bachelors'

print(s1, s2, s3, s4)

assert s1 > 35 and s1 < 45
assert s2 > 65 and s2 < 90
assert s3 > 5 and s3 < 25
assert s4 > 5 and s4 < 35