# CS 3110/5110: Data Privacy
## Homework 10

In [277]:
# Load the data and libraries
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

def laplace_mech(v, sensitivity, epsilon):
    return v + np.random.laplace(loc=0, scale=sensitivity / epsilon)

def gaussian_mech(v, sensitivity, epsilon, delta):
    return v + np.random.normal(loc=0, scale=sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon)

def gaussian_mech_vec(vec, sensitivity, epsilon, delta):
    return [v + np.random.normal(loc=0, scale=sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon)
            for v in vec]

def pct_error(orig, priv):
    return np.abs(orig - priv)/orig * 100.0

adult = pd.read_csv('https://github.com/jnear/cs3110-data-privacy/raw/main/homework/adult_with_pii.csv')

## Question 1 (10 points)

Implement a function `dp_marginal` that calculates a differentially private one-way marginal for a given column of the adult dataset.

In [278]:
def dp_marginal(col, epsilon):
    
    # Get count of each value
    counts = adult[col].value_counts()
    
    # Apply laplace mechanism to all counts
    dp_syn_rep = counts.apply(laplace_mech, args=(1, epsilon))
    
    # Normalize to probabilities out of 1
    dp_syn_rep_nn = np.clip(dp_syn_rep, 0, None)
    syn_normalized = dp_syn_rep_nn / np.sum(dp_syn_rep_nn)
    
    return syn_normalized

dp_marginal('Occupation', 1.0)

Occupation
Prof-specialty       0.134803
Craft-repair         0.133452
Exec-managerial      0.132407
Adm-clerical         0.122736
Sales                0.118858
Other-service        0.107234
Machine-op-inspct    0.065102
Transport-moving     0.051951
Handlers-cleaners    0.044682
Farming-fishing      0.032339
Tech-support         0.030170
Protective-serv      0.021129
Priv-house-serv      0.004840
Armed-Forces         0.000297
Name: count, dtype: float64

In [279]:
# TEST CASE
marginal = dp_marginal('Age', 1.0)
assert marginal[36] > 0.02 and marginal[36] < 0.03
assert marginal[85] > 0.00005 and marginal[85] < 0.0005

marginal = dp_marginal('Occupation', 1.0)
assert marginal['Prof-specialty'] > 0.13 and marginal['Prof-specialty'] < 0.14
assert marginal['Protective-serv'] > 0.02 and marginal['Protective-serv'] < 0.03

## Question 2 (10 points)

Implement a function `dp_synthetic_data` that generates `n` samples of synthetic data for the given columns, by creating one-way marginals for *each column* and then sampling synthetic data elements for each column separately.

In [280]:
def dp_synthetic_data(cols, n, epsilon):
    # Empty DataFrame
    df = pd.DataFrame()
    
    for col in cols:
        # Get probabilities
        marg = dp_marginal(col, epsilon)
        
        # Generate synthetic data
        syn = np.random.choice(marg.keys(), n, p=marg)
        
        # Add to end of dataframe
        df[col] = syn
    
    return df

dp_synthetic_data(['Age', 'Occupation', 'Marital Status', 'Education', 'Relationship'], 100, 1.0)

Unnamed: 0,Age,Occupation,Marital Status,Education,Relationship
0,30,Exec-managerial,Married-civ-spouse,Some-college,Not-in-family
1,30,Exec-managerial,Divorced,HS-grad,Husband
2,54,Exec-managerial,Never-married,Some-college,Not-in-family
3,61,Craft-repair,Divorced,HS-grad,Husband
4,60,Craft-repair,Married-civ-spouse,HS-grad,Not-in-family
...,...,...,...,...,...
95,37,Protective-serv,Never-married,Bachelors,Husband
96,32,Machine-op-inspct,Married-civ-spouse,Some-college,Husband
97,74,Tech-support,Married-civ-spouse,7th-8th,Own-child
98,23,Sales,Never-married,HS-grad,Husband


In [281]:
# TEST CASE
assert stats.wasserstein_distance(dp_synthetic_data(['Age'], len(adult), 1.0)['Age'], adult['Age']) < 0.2
assert stats.wasserstein_distance(dp_synthetic_data(['Education-Num'], len(adult), 1.0)['Education-Num'], 
                                  adult['Education-Num']) < 0.03

## Question 3 (10 points)

Implement a function `dp_two_marginal` that builds a 2-way marginal with differential privacy.

In [282]:
def dp_two_marginal(col1, col2, epsilon):
    # Empty dataframe to hold everything
    df = pd.DataFrame()
    
    # Get crosstab
    ct = pd.crosstab(adult[col1], adult[col2])
    
    # Apply laplace mechanism to data in crosstab
    dp_ct = ct.applymap(lambda x: max(laplace_mech(x, 1, epsilon), 0))
    
    # Convert to list of lists
    dp_vals = dp_ct.stack().reset_index().values.tolist()
    
    # Add columns 1 and 2 to dataframe
    col1_data = [a for a,_,_ in dp_vals]
    col2_data = [b for _,b,_ in dp_vals]
    df[col1] = col1_data
    df[col2] = col2_data
    
    # Calculate probabilities and add to dataframe
    probs = [p for _,_,p in dp_vals]
    probs_norm = probs / np.sum(probs)
    df['probability'] = probs_norm
    
    return df

dp_two_marginal('Education', 'Workclass', 1.0)

Unnamed: 0,Education,Workclass,probability
0,10th,Federal-gov,0.000256
1,10th,Local-gov,0.001035
2,10th,Never-worked,0.000055
3,10th,Private,0.022595
4,10th,Self-emp-inc,0.000632
...,...,...,...
123,Some-college,Private,0.165765
124,Some-college,Self-emp-inc,0.007337
125,Some-college,Self-emp-not-inc,0.015699
126,Some-college,State-gov,0.010528


In [283]:
# TEST CASE
marginal = dp_two_marginal('Education', 'Workclass', 1.0)
m1 = marginal[(marginal['Education'] == 'HS-grad') & (marginal['Workclass'] == 'Private')]['probability'].values[0]
m2 = marginal[(marginal['Education'] == 'Bachelors') & (marginal['Workclass'] == 'Federal-gov')]['probability'].values[0]
print(m1, m2)
assert m1 > 0.24 and m1 < 0.26
assert m2 > 0.005 and m2 < 0.007

0.25326993130953535 0.006868410824122901


## Question 4 (30 points)

Implement a function `dp_synthetic_data_two_marginal` that generates synthetic data for the `Age`, `Workclass`, `Occupation`, and `Education` columns *while preserving correlations between them* by using overlapping 2-way marginals.

In [284]:
adult = adult.replace(np.nan, 'None')

def dp_synthetic_data_two_marginal(n, epsilon):
    
    cols = ['Age', 'Workclass', 'Occupation', 'Education']
    
    # Get marginals
    marg0 = dp_two_marginal(cols[0], cols[1], epsilon)
    marg1 = dp_two_marginal(cols[1], cols[2], epsilon)
    marg2 = dp_two_marginal(cols[2], cols[3], epsilon)
    
    # Sample initial data from marg0
    syn_df = marg0.sample(n, replace=True, weights=marg0['probability'])[[cols[0], cols[1]]]
    
    # Add Occupation data
    occupations = []
    
    for val in syn_df[cols[1]]:
        
        # Get data for relevant Workclass
        marg1_filter = marg1.loc[marg1[cols[1]] == val].copy()
        
        # Normalize filtered data
        marg1_filter['probability'] = marg1_filter['probability'] / marg1_filter['probability'].sum()
        
        dp_vals = list(marg1_filter.itertuples(index=False, name=None))
        
        probs = [p for _,_,p in dp_vals]
        vals = [b for _,b,_ in dp_vals]
        
        indices = range(0, len(vals))
        i = np.random.choice(indices, p=probs)
        occupations.append(vals[i])
        
    syn_df[cols[2]] = occupations
    
    # Add Education data
    educations = []
    
    for val in syn_df[cols[2]]:
        # Get data for relevant Occupation
        marg2_filter = marg2.loc[marg2[cols[2]] == val].copy()
        
        # Normalize filtered data
        marg2_filter['probability'] = marg2_filter['probability'] / marg2_filter['probability'].sum()
        
        dp_vals = list(marg2_filter.itertuples(index=False, name=None))
        
        probs = [p for _,_,p in dp_vals]
        vals = [b for _,b,_ in dp_vals]
        
        indices = range(0, len(vals))
        
        i = np.random.choice(indices, p=probs)
        educations.append(vals[i])
        
    syn_df[cols[3]] = educations
    
    return syn_df

dp_synthetic_data_two_marginal(100, 1.0)

Unnamed: 0,Age,Workclass,Occupation,Education
58,23,Private,Adm-clerical,HS-grad
211,40,Private,Transport-moving,HS-grad
166,35,Private,Adm-clerical,HS-grad
319,52,Private,Prof-specialty,Masters
148,33,Private,Handlers-cleaners,11th
...,...,...,...,...
211,40,Private,Other-service,HS-grad
40,21,Private,Adm-clerical,Some-college
202,39,Private,Prof-specialty,Prof-school
175,36,Private,Handlers-cleaners,11th


In [285]:
# TEST CASE
synthetic_data = dp_synthetic_data_two_marginal(100, 1.0)

s1 = synthetic_data['Age'].mean()
s2 = len(synthetic_data[synthetic_data['Workclass'] == 'Private'])
s3 = len(synthetic_data[synthetic_data['Occupation'] == 'Adm-clerical'])
s4 = len(synthetic_data[synthetic_data['Education'] == 'Bachelors'])

print(s1, s2, s3, s4)

assert s1 > 35 and s1 < 45
assert s2 > 65 and s2 < 90
assert s3 > 5 and s3 < 25
assert s4 > 5 and s4 < 35

38.31 68 13 13
