In [1]:
import pandas as pd
import numpy as np

from importlib.machinery import SourceFileLoader
# Load the DCR/NNDR privacy-metric helpers straight from their source file
# (path is relative to the notebook's working directory) and bind the resulting
# module to `metrics`.
# NOTE(review): `load_module()` is deprecated in the importlib documentation;
# consider `importlib.util.spec_from_file_location` + `exec_module` instead.
metrics = SourceFileLoader('dcr_nndr', '../metrics/privacy_metrics/dcr_nndr.py').load_module()

## AIDS

In [13]:
# Compare three synthetic versions of the AIDS dataset (Avatar, Synthpop, CTGAN)
# against the original data with the DCR and NNDR privacy tests, then export one
# long-format CSV per metric.

# Number of points on the quantile grid, per method.
# NOTE(review): assumes every results[...]['details']['syn'] holds exactly
# N_QUANTILES values -- zip() below silently truncates otherwise; TODO confirm.
N_QUANTILES = 20
# Method labels, in the same order in which the per-method results are concatenated.
METHOD_NAMES = ['Avatar', 'Synthpop', 'CTGAN']

# Load the data; missing values are replaced by 0 for metric computation.
original = (
    pd.read_csv('../datasets/AIDS/aids_original_data.csv', sep=';')
    .drop(columns=["pidnum"])
    .fillna(0)
)
avatar = pd.read_csv('../datasets/AIDS/aids_avatarized_base_k20_nf5.csv').fillna(0)
synthpop = pd.read_csv('../datasets/AIDS/aids_synthpop_base.csv').fillna(0)
ctgan = pd.read_csv('../datasets/AIDS/aids_CTGAN_base_2.csv').fillna(0)

# The categorical/continuous split is derived from the original data only and
# then applied to every frame so that all four share the same column typing.
categorical_val, continuous_val = metrics.get_categorical_continuous(original)
for frame in (original, avatar, synthpop, ctgan):
    frame[categorical_val] = frame[categorical_val].astype("category")
    # Add id column for metric computation
    frame['id'] = frame.index

# Compute DCR and NNDR metrics
target = metrics.prepare_common_data_format(original, cat_columns=categorical_val, num_columns=continuous_val)
synthetic_avatar = metrics.prepare_common_data_format(avatar, cat_columns=categorical_val, num_columns=continuous_val)
synthetic_synthpop = metrics.prepare_common_data_format(synthpop, cat_columns=categorical_val, num_columns=continuous_val)
synthetic_ctgan = metrics.prepare_common_data_format(ctgan, cat_columns=categorical_val, num_columns=continuous_val)

status_avatar, results_avatar, df_avatar = metrics.compare(target, synthetic_avatar, metrics_to_return="privacy-tests")
status_synthpop, results_synthpop, df_synthpop = metrics.compare(target, synthetic_synthpop, metrics_to_return="privacy-tests")
status_ctgan, results_ctgan, df_ctgan = metrics.compare(target, synthetic_ctgan, metrics_to_return="privacy-tests")

# Gather results in df
# Quantile grid repeated once per method, and the matching method label for each
# row. The label list is deliberately NOT called `type`: rebinding that name in
# the kernel namespace would shadow the built-in for every later cell.
quantile = list(np.linspace(0.05, 0.5, N_QUANTILES)) * len(METHOD_NAMES)
method_labels = list(np.repeat(np.array(METHOD_NAMES), N_QUANTILES, axis=0))

## DCR
dcr_avatar = list(results_avatar['DCR']['details']['syn'])
dcr_synthpop = list(results_synthpop['DCR']['details']['syn'])
dcr_ctgan = list(results_ctgan['DCR']['details']['syn'])
dc = dcr_avatar + dcr_synthpop + dcr_ctgan

df_dcr = pd.DataFrame(zip(quantile, dc, method_labels), columns=['holdout_proportion', 'DCR', 'type'])

df_dcr.to_csv('../datasets/results_df/AIDS_DCR_comparison_results.csv', index=False)

## NNDR
nndr_avatar = list(results_avatar['NNDR']['details']['syn'])
nndr_synthpop = list(results_synthpop['NNDR']['details']['syn'])
nndr_ctgan = list(results_ctgan['NNDR']['details']['syn'])
nnd = nndr_avatar + nndr_synthpop + nndr_ctgan

df_nndr = pd.DataFrame(zip(quantile, nnd, method_labels), columns=['holdout_proportion', 'NNDR', 'type'])

df_nndr.to_csv('../datasets/results_df/AIDS_NNDR_comparison_results.csv', index=False)


  tgt_data_p = tgt_data_p.reset_index().drop("id", 1)
  syn_data_p = syn_data_p.reset_index().drop("id", 1)
  tgt_data_p = tgt_data_p.reset_index().drop("id", 1)
  syn_data_p = syn_data_p.reset_index().drop("id", 1)
  tgt_data_p = tgt_data_p.reset_index().drop("id", 1)
  syn_data_p = syn_data_p.reset_index().drop("id", 1)


## WBCD

In [12]:
# Compare three synthetic versions of the WBCD dataset (Avatar, Synthpop, CTGAN)
# against the original data with the DCR and NNDR privacy tests and gather the
# results in one long-format DataFrame per metric.

# Number of points on the quantile grid, per method.
# NOTE(review): assumes every results[...]['details']['syn'] holds exactly
# N_QUANTILES values -- zip() below silently truncates otherwise; TODO confirm.
N_QUANTILES = 20
# Method labels, in the same order in which the per-method results are concatenated.
METHOD_NAMES = ['Avatar', 'Synthpop', 'CTGAN']

# Load the data (no missing-value replacement is applied in this cell).
original = (
    pd.read_csv('../datasets/WBCD/breast_cancer_wisconsin.csv')
    .drop(columns=["Sample_code_number"])
)
avatar = pd.read_csv('../datasets/WBCD/breast_cancer_wisconsin_avatarized_k20.csv')
synthpop = pd.read_csv('../datasets/WBCD/wbcd_synthpop_base.csv')
ctgan = pd.read_csv('../datasets/WBCD/wbcd_CTGAN_base_2.csv')

# The categorical/continuous split is derived from the original data only and
# then applied to every frame so that all four share the same column typing.
categorical_val, continuous_val = metrics.get_categorical_continuous(original)
for frame in (original, avatar, synthpop, ctgan):
    frame[categorical_val] = frame[categorical_val].astype("category")
    # Add id column for metric computation
    frame['id'] = frame.index

# Compute DCR and NNDR metrics
target = metrics.prepare_common_data_format(original, cat_columns=categorical_val, num_columns=continuous_val)
synthetic_avatar = metrics.prepare_common_data_format(avatar, cat_columns=categorical_val, num_columns=continuous_val)
synthetic_synthpop = metrics.prepare_common_data_format(synthpop, cat_columns=categorical_val, num_columns=continuous_val)
synthetic_ctgan = metrics.prepare_common_data_format(ctgan, cat_columns=categorical_val, num_columns=continuous_val)

status_avatar, results_avatar, df_avatar = metrics.compare(target, synthetic_avatar, metrics_to_return="privacy-tests")
status_synthpop, results_synthpop, df_synthpop = metrics.compare(target, synthetic_synthpop, metrics_to_return="privacy-tests")
status_ctgan, results_ctgan, df_ctgan = metrics.compare(target, synthetic_ctgan, metrics_to_return="privacy-tests")

# Gather results in df
# Quantile grid repeated once per method, and the matching method label for each
# row. The label list is deliberately NOT called `type`: rebinding that name in
# the kernel namespace would shadow the built-in for every later cell.
quantile = list(np.linspace(0.05, 0.5, N_QUANTILES)) * len(METHOD_NAMES)
method_labels = list(np.repeat(np.array(METHOD_NAMES), N_QUANTILES, axis=0))

## DCR
dcr_avatar = list(results_avatar['DCR']['details']['syn'])
dcr_synthpop = list(results_synthpop['DCR']['details']['syn'])
dcr_ctgan = list(results_ctgan['DCR']['details']['syn'])
dc = dcr_avatar + dcr_synthpop + dcr_ctgan

df_dcr = pd.DataFrame(zip(quantile, dc, method_labels), columns=['holdout_proportion', 'DCR', 'type'])

# CSV export is currently disabled for this dataset.
# df_dcr.to_csv('../datasets/results_df/WBCD_DCR_comparison_results.csv',index=False)

## NNDR
nndr_avatar = list(results_avatar['NNDR']['details']['syn'])
nndr_synthpop = list(results_synthpop['NNDR']['details']['syn'])
nndr_ctgan = list(results_ctgan['NNDR']['details']['syn'])
nnd = nndr_avatar + nndr_synthpop + nndr_ctgan

df_nndr = pd.DataFrame(zip(quantile, nnd, method_labels), columns=['holdout_proportion', 'NNDR', 'type'])

# CSV export is currently disabled for this dataset.
# df_nndr.to_csv('../datasets/results_df/WBCD_NNDR_comparison_results.csv',index=False)


  tgt_data_p = tgt_data_p.reset_index().drop("id", 1)
  syn_data_p = syn_data_p.reset_index().drop("id", 1)
  tgt_data_p = tgt_data_p.reset_index().drop("id", 1)
  syn_data_p = syn_data_p.reset_index().drop("id", 1)
  tgt_data_p = tgt_data_p.reset_index().drop("id", 1)
  syn_data_p = syn_data_p.reset_index().drop("id", 1)


In [11]:
# Display whatever DataFrame is currently bound to `original` (rich repr).
# NOTE(review): this cell's execution count precedes the cells above, so the
# saved output may reflect an earlier kernel state -- re-run after a restart.
original

Unnamed: 0,Sample_code_number,Clump_Thickness,Uniformity_of_Cell_Size,Uniformity_of_Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class,id,sequence_pos
0,1000025,5,1,1,1,2,1,3,1,1,2,0,0
1,1002945,5,4,4,5,7,10,3,2,1,2,1,0
2,1015425,3,1,1,1,2,2,3,1,1,2,2,0
3,1016277,6,8,8,1,3,4,3,7,1,2,3,0
4,1017023,4,1,1,3,2,1,3,1,1,2,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
678,776715,3,1,1,1,3,2,1,1,1,2,678,0
679,841769,2,1,1,1,2,1,1,1,1,2,679,0
680,888820,5,10,10,3,7,3,8,10,2,4,680,0
681,897471,4,8,6,4,3,4,10,6,1,4,681,0
