## GSD 


In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import subprocess
import utils
from snsynth import Synthesizer
from snsynth.gsd import GSDSynthesizer
import time

from load_data import load_data
from sklearn.model_selection import train_test_split



For GSD support, please install jax: pip install --upgrade  "jax[cuda11_cudnn82]==0.4.6" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html


In [2]:
adult_path = 'adult.csv'
datasets = load_data(['adult'])
    
adult_df = datasets['adult']['data']

target =  datasets['adult']['target']
categorical_columns =  datasets['adult']['categorical_columns'].split(',')
print(adult_df.columns)
print(categorical_columns)


# Create config file. Note that we know the lower bound of each ordinal feature is 0.
# Let's assume the column upper bound is know.
ordinal_columns = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week', 'fnlwgt']
continuous_columns = []
config = {}
for c in adult_df.columns:
    if c in categorical_columns:
        config[c] = {'type': 'string'}
    else:
        config[c] = {'type': 'int', 'lower': 0, 'upper': adult_df[c].max()}


# Split into train/test sets for machine learning evaluation.
adult_df_train, adult_df_test = train_test_split(adult_df, test_size=0.2)

loading downloaded_datasets/adult.csv
Memory consumed by adult:4167808
Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'earning-class'],
      dtype='object')
['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'earning-class']


# Adult

In [None]:
# Still need to implement 
epsilon= 1.0
N_prime = 10000

synth = GSDSynthesizer(epsilon, 1e-5, tree_height=13, verbose=True)
synth.fit(adult_df_train,  meta_data=config,
          N_prime=N_prime)
    
max_error = np.abs(synth.stat_fn(synth.data.to_numpy()) - synth.stat_fn(synth.sync_data.to_numpy())).max()
print(f'Statistical error:', max_error)

adult_sync_df = synth.sample()

os.makedirs('downloaded_datasets', exist_ok=True)
adult_sync_df.to_csv(f'downloaded_datasets/adult_sync_{N_prime}_{epsilon:.2f}.csv')

privacy budgets: First moments = 0.001652. Second moments = 0.028905
Cond.Marginal= ['age'] . Sigma=0.0087. Top.Level=7. Max.Size=None
Cond.Marginal= ['fnlwgt'] . Sigma=0.0118. Top.Level=13. Max.Size=None
Cond.Marginal= ['education-num'] . Sigma=0.0073. Top.Level=5. Max.Size=None
Cond.Marginal= ['capital-gain'] . Sigma=0.0118. Top.Level=13. Max.Size=None
Cond.Marginal= ['capital-loss'] . Sigma=0.0118. Top.Level=13. Max.Size=None
Cond.Marginal= ['hours-per-week'] . Sigma=0.0087. Top.Level=7. Max.Size=None
	Total size=49716
       age.tree_height = 5. Thresholds=53
    fnlwgt.tree_height = 6. Thresholds=81
education-num.tree_height = 4. Thresholds=21
capital-gain.tree_height = 3. Thresholds=8
capital-loss.tree_height = 2. Thresholds=5
hours-per-week.tree_height = 4. Thresholds=30
Cond.Marginal= ['age', 'fnlwgt'] . Sigma=0.0080. Top.Level=6. Max.Size=None
Cond.Marginal= ['age', 'education-num'] . Sigma=0.0073. Top.Level=5. Max.Size=None
Cond.Marginal= ['age', 'capital-gain'] . Sigma=0.007

In [None]:


# adult_sync_df['Type'] = 'Sync'
# adult_df_copy = adult_df.sample(n=5000).copy()
# adult_df_copy['Type'] = 'Real'

# df = pd.concat([adult_sync_df, adult_df_copy])

# """
# Plot subgroups distributions:
# """
# g = sns.FacetGrid(data=df, col='Type',  hue='earning-class', sharey=False)
# g.map(sns.histplot, 'capital-gain')
# g.add_legend()
# plt.show()

In [8]:
data_path = 'downloaded_datasets/adult_sync_10.00.csv'

adult_sync_df = pd.read_csv(data_path, index_col=0)


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import ComplementNB
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import GradientBoostingClassifier

import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.metrics import f1_score

categorical_features = categorical_columns.copy()
numeric_features = ordinal_columns.copy()
label = 'earning-class'
categorical_features.remove(label)


numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
        ("selector", SelectPercentile(chi2, percentile=50)),
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", GradientBoostingClassifier())]
)

In [10]:
adult_df_train.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'earning-class'],
      dtype='object')

In [21]:
X_real = adult_df_train[categorical_features+numeric_features]
y_real = adult_df_train[label]


X_test = adult_df_test[categorical_features+numeric_features]
y_test = adult_df_test[label]


clf.fit(X_real, y_real)

print(f'Train on real:')
print(f"Accuracy={clf.score(X_test, y_test):.4f}")
print(f"F1-score = {f1_score(y_test, clf.predict(X_test), average='macro'):.4f}")

Train on real:
Accuracy=0.8715
F1-score = 0.8073


In [23]:
X_sync = adult_sync_df[categorical_features+numeric_features]
y_sync = adult_sync_df[label]

clf_sync = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", GradientBoostingClassifier())]
)

clf_sync.fit(X_sync, y_sync)

print(f'Train on real:')
print(f"Accuracy={clf_sync.score(X_test, y_test):.4f}")
print(f"F1-score = {f1_score(y_test, clf_sync.predict(X_test), average='macro'):.4f}")

Train on real:
Accuracy=0.8592
F1-score = 0.7886


In [32]:
def sub_group(clf, X_test_arg, y_test_arg, group_feature):
    group_names = X_test_arg[group_feature].unique()

    for g in group_names:
        sub_ids = X_test_arg[group_feature] == g
        
        X_sub = X_test_arg[sub_ids]
        y_sub = y_test_arg[sub_ids]
        sz = len(X_sub)
        print(f"{group_feature}={g} with size={sz}:\tF1-score = {f1_score(y_sub, clf.predict(X_sub), average='macro'):.4f}")
        




In [37]:
protected_group = 'race'

print(f'Real data:')
sub_group(clf, X_test, y_test, protected_group)
print(f'Sync data:')

sub_group(clf_sync, X_test, y_test, protected_group)

Real data:
race=2 with size=626:	F1-score = 0.7118
race=4 with size=5563:	F1-score = 0.7729
race=1 with size=201:	F1-score = 0.7072
race=0 with size=57:	F1-score = 0.7808
race=3 with size=66:	F1-score = 0.8167
Sync data:
race=2 with size=626:	F1-score = 0.7327
race=4 with size=5563:	F1-score = 0.7910
race=1 with size=201:	F1-score = 0.7545
race=0 with size=57:	F1-score = 0.6984
race=3 with size=66:	F1-score = 0.7521


# End