# CS 3110 Final Project
## Noah Burnham
### A notebook that creates differentially-private synthetic data for a dataset about Portuguese wines. 

In [35]:
# Load the data and libraries
import pandas as pd
import numpy as np
from scipy import stats

def laplace_mech(v, sensitivity, epsilon):
    return v + np.random.laplace(loc=0, scale=sensitivity / epsilon)

def gaussian_mech(v, sensitivity, epsilon, delta):
    return v + np.random.normal(loc=0, scale=sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon)

def gaussian_mech_vec(vec, sensitivity, epsilon, delta):
    return [v + np.random.normal(loc=0, scale=sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon)
            for v in vec]

def pct_error(orig, priv):
    return np.abs(orig - priv)/orig * 100.0
data = pd.read_csv('https://github.com/nburnham23/DP-Synthetic-Data/raw/main/winequality-white.csv')
data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


In [36]:
# Some important functions for generating synthetic data
def dp_marginal(col, epsilon):
    hist = data[col].value_counts()
    noisy_hist = hist.apply(lambda x: laplace_mech(x, sensitivity=1, epsilon=epsilon))
    marginal = hist.clip(lower=0) / hist.clip(lower=0).sum()
    return marginal
def gen_samples(n, marginal):
    sample = marginal.sample(n=n, replace=True, weights='probability')
    return sample.drop(columns='probability')
def dp_synthetic_data(cols, n, epsilon):
    # for each column, create a one-way marginal and sample n from it
    synth = {}
    epsilon_i = epsilon / len(cols)
    for col in cols:
        marginal = dp_marginal(col, epsilon_i).to_frame(name='probability')
        synth_i = gen_samples(n, marginal)
        synth[col] = list(synth_i.index)
    return pd.DataFrame(synth)
def dp_two_marginal(col1, col2, epsilon):
    hist = data[[col1, col2]].value_counts()
    noisy_hist = hist.apply(lambda x: laplace_mech(x, sensitivity=1, epsilon=epsilon)).clip(lower=0)
    marginal = noisy_hist / noisy_hist.sum()
    return marginal.to_frame(name='probability').reset_index()
def dp_synthetic_data_two_marginal(n, epsilon):
    def gen_conditional(s, m, cond, target):
        # limit m to the rows where cond == s
        limited = m[m[cond] == s]
        return limited.sample(n=1, weights='probability')[target].iloc[0]
    
    epsilon_i = epsilon / 4
    age_marginal = dp_marginal('Age', epsilon_i).to_frame(name='probability').reset_index()
    work_age_marginal = dp_two_marginal('Age', 'Workclass', epsilon_i)
    work_occ_marginal = dp_two_marginal('Workclass', 'Occupation', epsilon_i)
    occ_ed_marginal = dp_two_marginal('Occupation', 'Education', epsilon_i)
    
    samples = gen_samples(n, age_marginal)
    samples['Workclass'] = [gen_conditional(s, work_age_marginal, 'Age', 'Workclass') for s in samples['Age']]
    samples['Occupation'] = [gen_conditional(s, work_occ_marginal, 'Workclass', 'Occupation') for s in samples['Workclass']]
    samples['Education'] = [gen_conditional(s, occ_ed_marginal, 'Occupation', 'Education') for s in samples['Occupation']]
    return samples

First we can create a differentially-private one-way marginal for a column in the dataset, such as `quality`

In [435]:
# first create a differentially private one-way marginal for a column in the dataset
dp_marginal('quality', 1.0)

quality
6    0.448755
5    0.297468
7    0.179665
8    0.035729
4    0.033279
3    0.004083
9    0.001021
Name: count, dtype: float64

And using this one-way marginal, we can create synthetic data that **does not** retain correlations between columns of the dataset. Here we will create synthetic data for the columns `quality`, `volatile acidity`, `citric acid`, and `residual sugar`.

In [444]:
print(dp_synthetic_data(['quality', 'volatile acidity', 'citric acid', 'residual sugar'], 20, 1.0))
print(stats.wasserstein_distance(dp_synthetic_data(['quality'], len(data), 1.0)['quality'], data['quality']))

    quality  volatile acidity  citric acid  residual sugar
0         6              0.32         0.01             1.0
1         4              0.19         0.12            16.5
2         6              0.25         0.42            11.9
3         7              0.18         0.70             5.6
4         5              0.38         0.32            14.2
5         5              0.21         0.30            11.4
6         7              0.27         0.42             6.7
7         5              0.28         0.41             6.0
8         6              0.40         0.36             1.3
9         6              0.41         0.46             1.4
10        5              0.16         0.49             2.2
11        6              0.38         0.26            12.3
12        6              0.28         0.49             8.8
13        5              0.15         0.36             1.2
14        7              0.28         0.16             3.3
15        6              0.32         0.21             3

We can create a two-way marginal, and use this to build synthetic data where correlations between the data are preserved. 
For example, we can create a two-way marginal for the `quality` and `citric acid` columns of the dataset. 

In [445]:
dp_two_marginal('quality', 'citric acid', 1.0)

Unnamed: 0,quality,citric acid,probability
0,6,0.30,0.030516
1,6,0.28,0.028149
2,6,0.32,0.026167
3,6,0.29,0.022373
4,6,0.49,0.023375
...,...,...,...
318,5,0.78,0.000187
319,5,0.63,0.000433
320,5,0.60,0.000160
321,4,0.88,0.000000


And using a series of overlapping two-way marginals, we can create some synthetic data where correlations between columns
**are** preserved. 

In [160]:
def dp_synthetic_data_two_marginal(n, epsilon):
    def gen_conditional(s, m, cond, target):
        # limit m to the rows where cond == s
        limited = m[m[cond] == s]
        return limited.sample(n=1, weights='probability')[target].iloc[0]
    
    epsilon_i = epsilon / 12
    alc_marginal = dp_marginal('alcohol', epsilon_i).to_frame(name='probability').reset_index()
    alc_density_marg = dp_two_marginal('alcohol', 'density', epsilon_i)
    density_sugar_marg = dp_two_marginal('density', 'residual sugar', epsilon_i)
    sugar_citric_acid_marg = dp_two_marginal('residual sugar', 'citric acid', epsilon_i)
    citric_acid_vol_acidity_marg = dp_two_marginal('citric acid', 'volatile acidity', epsilon_i)
    vol_acidity_fixed_acidity_marg = dp_two_marginal('volatile acidity', 'fixed acidity', epsilon_i)
    fixed_acidity_ph_marg = dp_two_marginal('fixed acidity', 'pH', epsilon_i)
    ph_sulfate_marg = dp_two_marginal('pH', 'sulphates', epsilon_i)
    sulfate_chloride_marg = dp_two_marginal('sulphates', 'chlorides', epsilon_i)
    chloride_sulfur_marg = dp_two_marginal('chlorides', 'total sulfur dioxide', epsilon_i)
    total_free_sulfur_marg = dp_two_marginal('total sulfur dioxide', 'free sulfur dioxide', epsilon_i)
    free_sulfur_quality_marg = dp_two_marginal('free sulfur dioxide', 'quality', epsilon_i)

    samples = gen_samples(n, alc_marginal)
    samples['density'] = [gen_conditional(s, alc_density_marg, 'alcohol', 'density') for s in samples['alcohol']]
    samples['residual sugar'] = [gen_conditional(s, density_sugar_marg, 'density', 'residual sugar') for s in samples['density']]
    samples['citric acid'] = [gen_conditional(s, sugar_citric_acid_marg, 'residual sugar', 'citric acid') 
                              for s in samples['residual sugar']]
    samples['volatile acidity'] = [gen_conditional(s, citric_acid_vol_acidity_marg, 'citric acid', 'volatile acidity') 
                                   for s in samples['citric acid']]
    samples['fixed acidity'] = [gen_conditional(s, vol_acidity_fixed_acidity_marg, 'volatile acidity', 'fixed acidity') 
                                for s in samples['volatile acidity']]
    samples['pH'] = [gen_conditional(s, fixed_acidity_ph_marg, 'fixed acidity', 'pH') for s in samples['fixed acidity']]
    samples['sulphates'] = [gen_conditional(s, ph_sulfate_marg, 'pH', 'sulphates') for s in samples['pH']]
    samples['chlorides'] = [gen_conditional(s, sulfate_chloride_marg, 'sulphates', 'chlorides') for s in samples['sulphates']]
    samples['total sulfur dioxide'] = [gen_conditional(s, chloride_sulfur_marg, 'chlorides', 'total sulfur dioxide') 
                                      for s in samples['chlorides']]
    samples['free sulfur dioxide'] = [gen_conditional(s, total_free_sulfur_marg, 'total sulfur dioxide', 'free sulfur dioxide') 
                                      for s in samples['total sulfur dioxide']]
    samples['quality'] = [gen_conditional(s, free_sulfur_quality_marg, 'free sulfur dioxide', 'quality') 
                                      for s in samples['free sulfur dioxide']]
    
    return samples

Unnamed: 0,alcohol,density,residual sugar,citric acid,volatile acidity,fixed acidity,pH,sulphates,chlorides,total sulfur dioxide,free sulfur dioxide,quality
25,11.5,0.9928,1.6,0.68,0.2,5.9,3.38,0.54,0.049,173.0,49.0,5
16,10.1,0.99411,3.0,0.49,0.155,7.4,2.89,0.28,0.054,182.0,51.0,6
54,11.45,0.9902,1.2,0.38,0.22,8.4,3.03,0.34,0.037,160.0,9.0,4
1,9.5,0.9952,5.2,0.26,0.24,9.7,2.98,0.61,0.051,162.0,28.0,5
35,11.8,0.9911,1.45,0.39,0.25,5.1,3.22,0.38,0.076,130.0,39.0,8
31,8.7,0.99354,1.3,0.28,0.62,5.5,3.34,0.64,0.038,44.0,10.0,6
7,10.4,0.9934,3.3,0.29,0.25,6.0,3.32,0.55,0.038,131.0,23.0,5
23,10.7,0.99546,8.5,0.33,0.26,6.7,3.31,0.79,0.045,115.0,16.0,7
3,9.0,0.9978,11.9,0.22,0.3,7.5,3.24,0.61,0.028,69.0,24.0,6
25,11.5,0.9909,2.1,0.21,0.23,5.6,3.22,0.39,0.045,152.0,38.0,5


In [455]:
while True:
    try:
        synth = dp_synthetic_data_two_marginal(20, 1.0)
        break
    except ValueError:
        continue
print(stats.wasserstein_distance(synth['quality'], data['quality']))
synth

0.5494079216006532


Unnamed: 0,alcohol,density,residual sugar,citric acid,volatile acidity,fixed acidity,pH,sulphates,chlorides,total sulfur dioxide,free sulfur dioxide,quality
24,8.9,0.99745,13.0,0.36,0.24,6.6,3.32,0.41,0.027,104.0,22.0,6
40,12.1,0.9894,1.4,0.49,0.64,6.6,3.12,0.38,0.059,81.0,23.0,7
47,13.2,0.98836,1.4,0.49,0.47,7.0,3.08,0.48,0.025,107.0,20.0,6
47,13.2,0.991,1.7,0.27,0.32,6.7,3.32,0.38,0.152,120.0,17.0,7
22,11.3,0.9955,9.0,0.16,0.29,7.6,2.99,0.47,0.055,234.0,67.0,6
6,11.0,1.0001,18.15,0.49,0.15,7.2,2.98,0.65,0.037,123.0,32.0,6
42,12.9,0.9892,1.5,0.09,0.67,7.6,3.34,0.35,0.062,203.0,48.0,7
22,11.3,0.9925,6.8,0.26,0.26,8.9,3.1,0.76,0.035,80.0,29.0,8
7,10.4,0.99204,5.3,0.49,0.46,6.6,3.15,0.35,0.062,203.0,70.0,6
14,11.4,0.98988,4.2,0.45,0.24,6.6,3.41,0.64,0.037,108.0,25.0,7
