## Comparing GSD and AIM


In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import subprocess
import utils
from snsynth import Synthesizer
from snsynth.gsd import GSDSynthesizer
from snsynth.aim import AIMSynthesizer
from load_data import load_data
from sklearn.model_selection import train_test_split

import time
import yaml

For GSD support, please install jax: pip install --upgrade  "jax[cuda11_cudnn82]==0.4.6" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html


# Adult (Categorical)

In [2]:
adult_path = 'adult.csv'
datasets = load_data(['adult'])
    
adult_df = datasets['adult']['data']

target =  datasets['adult']['target']
categorical_columns =  datasets['adult']['categorical_columns'].split(',')

adult_df = adult_df[categorical_columns]
print(adult_df.columns)


adult_df_train, adult_df_test = train_test_split(adult_df, test_size=0.2)

loading downloaded_datasets/adult.csv
Memory consumed by adult:4167808
Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country', 'earning-class'],
      dtype='object')


AIM

In [3]:
t0 = time.time()
aim = AIMSynthesizer(epsilon=1.0, delta=1e-9, verbose=True)
aim.fit(adult_df_train, categorical_columns=categorical_columns)
print(f'elapsed time= {time.time() - t0:.3f}')
aim_adult_df = aim.sample()

Fitting with 76204800 dimensions
9
Initial Sigma 73.09534912965321
Selected ('col2', 'col4') Size 42 Budget Used 0.06319444444444443
Selected ('col4', 'col6') Size 12 Budget Used 0.07013888888888886
Selected ('col2', 'col8') Size 14 Budget Used 0.07708333333333331
Selected ('col3', 'col6') Size 30 Budget Used 0.08402777777777774
Selected ('col3', 'col8') Size 30 Budget Used 0.09097222222222219
Selected ('col1', 'col8') Size 32 Budget Used 0.09791666666666662
Selected ('col0', 'col3') Size 135 Budget Used 0.10486111111111107
Selected ('col5', 'col6') Size 10 Budget Used 0.1118055555555555
Selected ('col0', 'col6') Size 18 Budget Used 0.11874999999999994
Selected ('col5', 'col8') Size 10 Budget Used 0.1256944444444444
Selected ('col2', 'col6') Size 14 Budget Used 0.13263888888888883
Selected ('col4', 'col8') Size 12 Budget Used 0.13958333333333328
Selected ('col6', 'col8') Size 4 Budget Used 0.14652777777777776
Selected ('col1', 'col6') Size 32 Budget Used 0.1534722222222222
Selected ('c

 GSD

In [4]:
# Run GSD

t0 = time.time()
gsd = GSDSynthesizer(epsilon=1.0, delta=1e-9, verbose=True)
gsd.fit(adult_df_train, N_prime=5000, categorical_columns=categorical_columns)
print(f'elapsed time= {time.time() - t0:.3f}')
gsd_adult_df = gsd.sample()

privacy budgets: Second moments = 0.014973
Cond.Marginal= ['workclass', 'education'] . Sigma=0.0027. Top.Level=1. Max.Size=None
Cond.Marginal= ['workclass', 'marital-status'] . Sigma=0.0027. Top.Level=1. Max.Size=None
Cond.Marginal= ['workclass', 'occupation'] . Sigma=0.0027. Top.Level=1. Max.Size=None
Cond.Marginal= ['workclass', 'relationship'] . Sigma=0.0027. Top.Level=1. Max.Size=None
Cond.Marginal= ['workclass', 'race'] . Sigma=0.0027. Top.Level=1. Max.Size=None
Cond.Marginal= ['workclass', 'sex'] . Sigma=0.0027. Top.Level=1. Max.Size=None
Cond.Marginal= ['workclass', 'native-country'] . Sigma=0.0027. Top.Level=1. Max.Size=None
Cond.Marginal= ['workclass', 'earning-class'] . Sigma=0.0027. Top.Level=1. Max.Size=None
Cond.Marginal= ['education', 'marital-status'] . Sigma=0.0027. Top.Level=1. Max.Size=None
Cond.Marginal= ['education', 'occupation'] . Sigma=0.0027. Top.Level=1. Max.Size=None
Cond.Marginal= ['education', 'relationship'] . Sigma=0.0027. Top.Level=1. Max.Size=None
Cond.M

In [5]:
import time
rng = np.random.default_rng(0)

d = 1
ordinal_columns = 'categorical_1'
cat_cardinality = 10000
N = 2 * cat_cardinality

data_np = rng.integers(0, cat_cardinality, (N, d ))
data_np[:cat_cardinality, 0] = 0
data_np = data_np.astype(str)
data_df = pd.DataFrame(data_np, columns=[ordinal_columns])


t0 = time.time()
synth = GSDSynthesizer(3000.0, 1e-5, verbose=True)
# Since we are passing the data bounds, we do not need to provide privacy budget for preprocessing.
synth.fit(data_np, N_prime=1000, genetic_operators=['mutate', 'cross'])
print(f'elapsed time = {time.time() - t0}')

error = np.abs(synth.stat_fn(synth.data.to_numpy()) - synth.stat_fn(synth.sync_data.to_numpy()))
print(f'epsilon={epsilon}', f'Statistical error: max={error.max():.4f}, Avg={error.mean():.4f}')

Cond.Marginal= [0] . Sigma=0.0000. Top.Level=1. Max.Size=None
	Total size=6328
Statistics size = 6328
Gen=      1000: fitness=0.00893408622. Strategy wins:  [0.426 0.574] time=3.94 (s)
Gen=      2000: fitness=0.000379580813. Strategy wins:  [1. 0.] time=5.20 (s)
Gen=      3000: fitness=0.000377139804. Strategy wins:  [1. 0.] time=6.30 (s)
Gen=      4000: fitness=0.000376495727. Strategy wins:  [1. 0.] time=7.43 (s)
Gen=      5000: fitness=0.000376359241. Strategy wins:  [1. 0.] time=8.52 (s)
Gen=      6000: fitness=0.000376350038. Strategy wins:  [1. 0.] time=9.60 (s)
		 ### Stop early at 6000 ###
elapsed time = 15.244399785995483


NameError: name 'epsilon' is not defined

In [None]:
# Run this cell to observe slow converge behavior without the cross genetic operator.
t0 = time.time()
synth = GSDSynthesizer(3000.0, 1e-5, verbose=True)
# Since we are passing the data bounds, we do not need to provide privacy budget for preprocessing.
synth.fit(data_df, N_prime=1000, genetic_operators=['mutate'])
print(f'elapsed time = {time.time() - t0}')

In [None]:
rng = np.random.default_rng(1)

d = 1
columns = ['categorical_1', 'categorical_2']
cat_cardinality = 11
N = 200
values = rng.integers(1, cat_cardinality, (N, d ))
values[:190] = 0
data_np = np.column_stack((values, values)).astype(str)
correlated_cat_df = pd.DataFrame(data_np, columns=columns)

t0 = time.time()
synth = GSDSynthesizer(10000000.0, 1e-5, verbose=True)
# Since we are passing the data bounds, we do not need to provide privacy budget for preprocessing.
synth.fit(correlated_cat_df, N_prime=200, genetic_operators=['mutate', 'cross', 'swap'])
print(f'elapsed time = {time.time() - t0}')

error = np.abs(synth.stat_fn(synth.data.to_numpy()) - synth.stat_fn(synth.sync_data.to_numpy()))
print(f'epsilon={epsilon}', f'Statistical error: max={error.max():.4f}, Avg={error.mean():.4f}')

In [None]:
## Run this cell to check slow convergence without the cross operator
t0 = time.time()
synth = GSDSynthesizer(10000000.0, 1e-5, verbose=True)
# Since we are passing the data bounds, we do not need to provide privacy budget for preprocessing.
synth.fit(correlated_cat_df, N_prime=200, genetic_operators=['mutate', 'cross'])
print(f'elapsed time = {time.time() - t0}')

error = np.abs(synth.stat_fn(synth.data.to_numpy()) - synth.stat_fn(synth.sync_data.to_numpy()))
print(f'epsilon={epsilon}', f'Statistical error: max={error.max():.4f}, Avg={error.mean():.4f}')

In [None]:
"""
3-d correlated categorical data.
"""

rng = np.random.default_rng(0)

d = 1
columns = ['categorical_1', 'categorical_2', 'categorical_3']
cat_cardinality = 11
N = 200
values = rng.integers(1, cat_cardinality, (N, d ))
values[:190] = 0
data_np = np.column_stack((values, values, values)).astype(str)
correlated_cat_df = pd.DataFrame(data_np, columns=columns)

t0 = time.time()
synth = GSDSynthesizer(10000000.0, 1e-5, verbose=True)
# Since we are passing the data bounds, we do not need to provide privacy budget for preprocessing.
synth.fit(correlated_cat_df, N_prime=200, genetic_operators=['mutate', 'cross'])
print(f'elapsed time = {time.time() - t0}')

error = np.abs(synth.stat_fn(synth.data.to_numpy()) - synth.stat_fn(synth.sync_data.to_numpy()))
print(f'epsilon={epsilon}', f'Statistical error: max={error.max():.4f}, Avg={error.mean():.4f}')

# ML evaluation



We follow the train on synthetic, test on real approach:

- Given two sets of real data: A train set and ad test set.
- Generate synthetic data using the train set.
- Then we train a ML model using the synthetic data.
- Finally, we validate the model's performance using the holdout real data.

In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import ComplementNB
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import GradientBoostingClassifier

import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.metrics import f1_score

categorical_features = categorical_columns.copy()
label = 'earning-class'
categorical_features.remove(label)


categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
        ("selector", SelectPercentile(chi2, percentile=50)),
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        # ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", GradientBoostingClassifier())]
)

In [43]:
X_real = adult_df_train[categorical_features]
y_real = adult_df_train[label]


X_test = adult_df_test[categorical_features]
y_test = adult_df_test[label]

clf.fit(X_real, y_real)

print(f'Train on real:')
print(f"Accuracy={clf.score(X_test, y_test):.4f}")
print(f"F1-score = {f1_score(y_test, clf.predict(X_test), average='macro'):.4f}")

Train on real:
Accuracy=0.8349
F1-score = 0.7528


In [42]:
X_aim = aim_adult_df[categorical_features]
y_aim = aim_adult_df[label]

clf.fit(X_aim, y_aim)


print(f'Train on AIM synthetic data:')
print(f"Accuracy = {clf.score(X_test, y_test):.4f}")
print(f"F1-score = {f1_score(y_test, clf.predict(X_test), average='macro'):.4f}")

Train on AIM synthetic data:
Accuracy = 0.8303
F1-score = 0.7461


In [41]:
X_gsd = gsd_adult_df[categorical_features]
y_gsd = gsd_adult_df[label]

clf.fit(X_gsd, y_gsd)

print(f'Train on GSD synthetic data:')
print(f"Accuracy = {clf.score(X_test, y_test):.4f}")
print(f"F1-score = {f1_score(y_test, clf.predict(X_test), average='macro'):.4f}")

Train on GSD synthetic data:
Accuracy = 0.8349
F1-score = 0.7494


In [None]:
import matplotlib.pyplot as plt
rng = np.random.default_rng(0)

collect = []

mean = [0.5, 0.5]
cov = [[.01, 0.0099], 
       [0.0099, .01]]  # diagonal covariance

N = 200

x, y = np.random.multivariate_normal(mean, cov, N).T
values_cont = np.column_stack((x, y))
values_cont[100:300, 0] = 0.03
values_cont[100:300, 1] = 0.93

cont_cols = ['c1', 'c2']
data_cont_df = pd.DataFrame(values_cont, columns=cont_cols)
meta_data = {'c1': {'type': 'float', 'lower': 0, 'upper': 1}, 'c2': {'type': 'float', 'lower': 0, 'upper': 1}}

plot_data = data_cont_df.copy()
plot_data.loc[:, 'Type'] = 'Original'
collect.append(plot_data)


for epsilon in [1000, 100]:
    print(f'epsilon={epsilon}')
    synth = GSDSynthesizer(float(epsilon), 1e-5, tree_height=12, verbose=True)
    synth.fit(data_cont_df, meta_data=meta_data, 
              genetic_operators=['mutate', 'continuous'])
    error = np.abs(synth.stat_fn(synth.data.to_numpy()) - synth.stat_fn(synth.sync_data.to_numpy()))
    print(f'epsilon={epsilon}', f'Statistical error: max={error.max():.4f}, Avg={error.mean():.4f}')

    sync_df = synth.sample()
    sync_df.loc[:, 'Type'] = f'eps={epsilon}'
    collect.append(sync_df)

    synth = GSDSynthesizer(float(epsilon), 1e-5, tree_height=12, verbose=True)
    # Since we are passing the data bounds, we do not need to provide privacy budget for preprocessing.
    synth.fit(data_cont_df, meta_data=meta_data, 
              genetic_operators=['mutate', 'continuous', 'swap'])
    error = np.abs(synth.stat_fn(synth.data.to_numpy()) - synth.stat_fn(synth.sync_data.to_numpy()))
    print(f'epsilon={epsilon}', f'Statistical error: max={error.max():.4f}, Avg={error.mean():.4f}')
    sync_df = synth.sample()
    sync_df.loc[:, 'Type'] = f'eps={epsilon}/swap'
    collect.append(sync_df)



In [None]:
all_data = pd.concat(collect)
g= sns.FacetGrid(data=all_data, col='Type', sharey=True,sharex=True)
g.map(sns.scatterplot, "c1", "c2", alpha=0.1)
plt.show()

# End