# Environment

In [1]:
import os
import pprint
import sys

path_petsard = os.path.dirname(os.path.dirname(os.getcwd()))
print(path_petsard)
sys.path.append(path_petsard)

pp = pprint.PrettyPrinter(depth=2)

# Functional Test: import PETsARD

In [2]:
import PETsARD

ImportError: cannot import name 'Reporter' from 'PETsARD.reporter' (unknown location)

# Functional Test: Module-by-Module

## Loader

In [None]:
from PETsARD import Loader


load = Loader(
    filepath='benchmark://adult',
    na_values={k: '?' for k in [
        'workclass',
        'occupation',
        'native-country'
    ]}
)
load.load()
print(load.data.head(1))

Loader - Benchmarker: file benchmark\adult.csv already exist and match SHA-256.
                      PETsARD will ignore download and use local data directly.
   age workclass  fnlwgt education  educational-num marital-status  \
0   25   Private  226802      11th                7  Never-married   

          occupation relationship   race gender  capital-gain  capital-loss  \
0  Machine-op-inspct    Own-child  Black   Male             0             0   

   hours-per-week native-country income  
0              40  United-States  <=50K  


## metadata

In [None]:
pp.pprint(load.metadata.metadata)

{'col': {'age': {...},
         'capital-gain': {...},
         'capital-loss': {...},
         'education': {...},
         'educational-num': {...},
         'fnlwgt': {...},
         'gender': {...},
         'hours-per-week': {...},
         'income': {...},
         'marital-status': {...},
         'native-country': {...},
         'occupation': {...},
         'race': {...},
         'relationship': {...},
         'workclass': {...}},
 'global': {'col_num': 15,
            'na_percentage': 0.07411653904426518,
            'row_num': 48842}}


## Splitter

In [None]:
from PETsARD import Splitter


split = Splitter(
    num_samples=30,
    train_split_ratio=0.1
)
split.split(data=load.data)
print(split.data[1]['train'].shape[0])
print(split.data[1]['validation'].shape[0])
print(split.data[1]['train'].head(1))
print(split.data[1]['validation'].head(1))


4884
43958
   age  workclass  fnlwgt   education  educational-num      marital-status  \
0   28  Local-gov  336951  Assoc-acdm               12  Married-civ-spouse   

        occupation relationship   race gender  capital-gain  capital-loss  \
0  Protective-serv      Husband  White   Male             0             0   

   hours-per-week native-country income  
0              40  United-States   >50K  
   age workclass  fnlwgt education  educational-num marital-status  \
0   25   Private  226802      11th                7  Never-married   

          occupation relationship   race gender  capital-gain  capital-loss  \
0  Machine-op-inspct    Own-child  Black   Male             0             0   

   hours-per-week native-country income  
0              40  United-States  <=50K  


## Processor: transform()

In [None]:
from PETsARD import Processor


proc = Processor(
    metadata=load.metadata,
)
proc.fit(
    data=split.data[1]['train'],
)
preproc_data = proc.transform(
    data=split.data[1]['train']
)
print(preproc_data.head(1))

        age  workclass    fnlwgt  education  educational-num  marital-status  \
0 -0.780418   0.883733  1.454353   0.901467         0.764544        0.351167   

   occupation  relationship      race    gender  capital-gain  capital-loss  \
0    0.976728       0.00389  0.023586  0.146383     -0.150177       -0.2158   

   hours-per-week  native-country    income  
0       -0.017024        0.492324  0.765818  


## Synthesizer

### Normal: Cont. as 0~1

In [None]:
from PETsARD import Synthesizer


sdv_methods = [
    # 'sdv-single_table-copulagan',
    # 'sdv-single_table-ctgan',
    'sdv-single_table-gaussiancopula',
    # 'sdv-single_table-tvae'
]

smartnoise_methods = [
    # 'smartnoise-mwem',
]
# 可能由於版本限制，無法執行 aim
 # 'smartnoise-aim',
# GAN系未支援
 # 'smartnoise-dpctgan',
 # 'smartnoise-patectgan',
 # 'smartnoise-dpgan',
 # 'smartnoise-pategan',

for synthesizing_method in sdv_methods + smartnoise_methods:
    print(f"Synthesizing method: {synthesizing_method}")
    syn = Synthesizer(
        method=synthesizing_method,
        epsilon=10.0,
    )
    syn.create(data=preproc_data)
    syn.fit_sample()
    print(syn.data_syn.head(1))

Synthesizing method: sdv-single_table-gaussiancopula
Synthesizer (SDV - SingleTable): Metafile loading time: 0.0236 sec.
Synthesizer (SDV - SingleTable): Fitting GaussianCopula.
Synthesizer (SDV - SingleTable): Fitting  GaussianCopula spent 1.8214 sec.
Synthesizer (SDV - SingleTable): Sampling GaussianCopula # 2708 rows (same as raw) in 0.3409 sec.
        age  workclass   fnlwgt  education  educational-num  marital-status  \
0  0.124381   0.514998 -0.65838   0.515023         1.705128        0.615378   

   occupation  relationship      race    gender  capital-gain  capital-loss  \
0    0.840579      0.143538  0.322134  0.154478     -0.150177       -0.2158   

   hours-per-week  native-country    income  
0          0.0683        0.710517  0.861915  


### Categorical
for mst, pacsynth in smartnoise

`ValueError: The transformer appears to have some continuous columns. Please provide only categorical or ordinal.`

In [None]:
# proc_cate = Processor(
#     metadata=loader.metadata,
# )

# metadata_col = loader.metadata.metadata['col']
# colnames_discrete = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country', 'income']
# proc_cate.update_config(
#     {'encoder': {col: 'encoder_label' if col in colnames_discrete else None for col in metadata_col},
#      'scaler': {col: None for col in metadata_col},
#      }
# )

# proc_cate.fit(
#     data=splitter.data[1]['train'],
#     sequence=None
# )
# preproc_data_cate = proc_cate.transform(
#     data=splitter.data[1]['train']
# )
# print(preproc_data_cate.head(1))

In [None]:
# smartnoise_methods_cate = [
#     'smartnoise-mst',
#     'smartnoise-pacsynth',
# ]

# for col in preproc_data_cate.columns:
#     preproc_data_cate[col] = preproc_data_cate[col].astype('category')

# for synthesizing_method in smartnoise_methods_cate:
#     print(f"Synthesizing method: {synthesizing_method}")
#     synthesizer_cate = PETsARD.Synthesizer(
#         data=preproc_data_cate,
#         synthesizing_method=synthesizing_method,
#         epsilon=1.0,
#     )
#     synthesizer_cate.fit_sample()
#     print(synthesizer_cate.data_syn.head(1))

## Processor: inverse_transform()

In [None]:
postproc_data = proc.inverse_transform(
    data=syn.data_syn
)
print(postproc_data.head(1))

         age workclass         fnlwgt     education  educational-num  \
0  40.257969   Private  119697.654351  Some-college        14.405845   

  marital-status        occupation relationship   race gender  capital-gain  \
0  Never-married  Transport-moving      Husband  White   Male  1.136868e-13   

   capital-loss  hours-per-week native-country income  
0           0.0       41.055941  United-States   >50K  


## Evaluator

### Anonymeter

In [None]:
from PETsARD import Evaluator


eval = Evaluator(
    method='anonymeter-singlingout_univariate',
    n_attacks=2 # 2000
)
eval.create(
    data={
        'ori': split.data[1]['train'],
        'syn': postproc_data,
        'control': split.data[1]['validation']
    }
)
eval.eval()
eval.get_global()

  self._sanity_check()


Unnamed: 0,risk,risk_CI_btm,risk_CI_top,attack_rate,attack_rate_err,baseline_rate,baseline_rate_err,control_rate,control_rate_err
result,0.0,0.0,0.69281,0.32881,0.32881,0.32881,0.32881,0.32881,0.32881


In [None]:
eval = PETsARD.Evaluator(
    method='anonymeter-linkability',
    n_attacks=2, # 2000,
    n_neighbors=10,
    aux_cols=[
        ['age', 'fnlwgt', 'race', 'gender', 'native-country'],
        ['workclass', 'education', 'capital-gain', 'capital-loss', 'hours-per-week']
    ]
)
eval.create(
    data={
        'ori': split.data[1]['train'],
        'syn': postproc_data,
        'control': split.data[1]['validation']
    }
)
eval.eval()
eval.get_global()

  self._sanity_check()


Unnamed: 0,risk,risk_CI_btm,risk_CI_top,attack_rate,attack_rate_err,baseline_rate,baseline_rate_err,control_rate,control_rate_err
result,0.0,0.0,0.69281,0.32881,0.32881,0.32881,0.32881,0.32881,0.32881


In [None]:
eval = PETsARD.Evaluator(
    method='anonymeter-inference',
    n_attacks=2, #2000,
    secret='age'
)
eval.create(
    data={
        'ori': split.data[1]['train'],
        'syn': postproc_data,
        'control': split.data[1]['validation']
    }
)
eval.eval()
eval.get_global()

  self._sanity_check()


Unnamed: 0,risk,risk_CI_btm,risk_CI_top,attack_rate,attack_rate_err,baseline_rate,baseline_rate_err,control_rate,control_rate_err
result,0.0,0.0,0.69281,0.32881,0.32881,0.32881,0.32881,0.32881,0.32881


### SDMetrics

In [None]:
eval = PETsARD.Evaluator(
    method='sdmetrics-single_table-qualityreport',
)
eval.create(
    data={
        'ori': split.data[1]['train'],
        'syn': postproc_data,
    }
)
eval.eval()
eval.get_global()

Generating report ...
(1/2) Evaluating Column Shapes: :   0%|          | 0/15 [00:00<?, ?it/s]

(1/2) Evaluating Column Shapes: : 100%|██████████| 15/15 [00:00<00:00, 63.06it/s]
(2/2) Evaluating Column Pair Trends: : 100%|██████████| 105/105 [00:02<00:00, 38.04it/s]

Overall Score: 74.44%

Properties:
- Column Shapes: 87.95%
- Column Pair Trends: 60.93%


Unnamed: 0,Score,Column Shapes,Column Pair Trends
result,0.744405,0.879508,0.609302


In [None]:
eval = PETsARD.Evaluator(
    method='sdmetrics-single_table-diagnosticreport',
)
eval.create(
    data={
        'ori': split.data[1]['train'],
        'syn': postproc_data,
    }
)
eval.eval()
eval.get_global()

Generating report ...
(1/2) Evaluating Data Validity: : 100%|██████████| 15/15 [00:00<00:00, 646.26it/s]
(2/2) Evaluating Data Structure: : 100%|██████████| 1/1 [00:00<00:00, 174.82it/s]

Overall Score: 100.0%

Properties:
- Data Validity: 100.0%
- Data Structure: 100.0%


Unnamed: 0,Score,Data Validity,Data Structure
result,1.0,1.0,1.0


## Describer

In [None]:
from PETsARD import Describer

desc = PETsARD.Describer(
    config={'method': 'default'},
)
desc.create(
    data={
        'data': split.data[1]['train'],
    }
)
desc.eval()
desc.get_global()

Unnamed: 0,row_count,col_count,na_count
0,4884,15,369


In [None]:
from PETsARD.operator import DescriberOperator

desc = DescriberOperator(
    config={'method': 'default'},
)
desc.run(input={'data': {'data': split.data[1]['train']}})
desc.get_result()

{'global':    row_count  col_count  na_count
 0       4884         15       369,
 'columnwise':                           mean    median            std      min        max  \
 age                  38.572891      37.0      13.549109     17.0       90.0   
 fnlwgt           187399.196970  177120.5  102840.990508  19793.0  1455435.0   
 educational-num      10.044431      10.0       2.558085      1.0       16.0   
 capital-gain        883.443284       0.0    5883.267006      0.0    99999.0   
 capital-loss         87.815930       0.0     406.972818      0.0     4356.0   
 hours-per-week       40.210688      40.0      12.376924      1.0       99.0   
 workclass                  NaN       NaN            NaN      NaN        NaN   
 education                  NaN       NaN            NaN      NaN        NaN   
 marital-status             NaN       NaN            NaN      NaN        NaN   
 occupation                 NaN       NaN            NaN      NaN        NaN   
 relationship            

## Reporter

In [1]:
import os
import pprint
import sys

path_petsard = os.path.dirname(os.path.dirname(os.getcwd()))
print(path_petsard)
sys.path.append(path_petsard)

pp = pprint.PrettyPrinter(depth=2)

d:\Dropbox\89_other_application\GitHub\PETsARD


In [2]:
from PETsARD import Loader, Evaluator


load_ori = Loader(filepath='../ori.csv')
load_syn = Loader(filepath='../syn.csv')
load_control = Loader(filepath='../control.csv')

load_ori.load()
load_syn.load()
load_control.load()

eval_data = {
    'ori': load_ori.data,
    'syn': load_syn.data,
    'control': load_control.data
}
eval = Evaluator(method='anonymeter-singlingout_univariate', n_attacks=2)
eval.create(data=eval_data)
eval.eval()

  self._sanity_check()


In [3]:
report_config = {
    'method': 'save_data',
    'output': 'test',
    'source': ['Synthesizer', 'sd-qlt'],
}
report_data = {
    ('Evaluator', 'sd-qlt_[global]'): eval.get_global(),
    ('Evaluator', 'sd-qlt_[columnwise]'): eval.get_columnwise(),
    ('Evaluator', 'sd-qlt_[pairwise]'): eval.get_pairwise()
}
report_data


{('Evaluator',
  'sd-qlt_[global]'):         risk  risk_CI_btm  risk_CI_top  attack_rate  attack_rate_err  \
 result   0.0          0.0      0.69281      0.32881          0.32881   
 
         baseline_rate  baseline_rate_err  control_rate  control_rate_err  
 result        0.32881            0.32881       0.32881           0.32881  ,
 ('Evaluator', 'sd-qlt_[columnwise]'): None,
 ('Evaluator', 'sd-qlt_[pairwise]'): None}

In [4]:
from PETsARD.operator import ReporterOperator

rpt = ReporterOperator(config=report_config)
rpt.run(input={'data': report_data})

Now is test_Evaluator[sd-qlt_[global]] save to csv...


In [6]:
rpt.reporter.reporter.data

{'Evaluator[sd-qlt_[global]]':         risk  risk_CI_btm  risk_CI_top  attack_rate  attack_rate_err  \
 result   0.0          0.0      0.69281      0.32881          0.32881   
 
         baseline_rate  baseline_rate_err  control_rate  control_rate_err  
 result        0.32881            0.32881       0.32881           0.32881  ,
 'Evaluator[sd-qlt_[columnwise]]': None,
 'Evaluator[sd-qlt_[pairwise]]': None}

In [7]:
rpt.reporter.reporter.data

{}

In [2]:
from copy import deepcopy

from PETsARD import Config, Status


filename = 'Exec_Yaml.yaml'

cfg = Config(filename=filename)
sts: Status = Status(config=cfg)
result = {}

while cfg.config.qsize() > 0:
    ops    = cfg.config.get()
    module = cfg.module_flow.get()
    expt   = cfg.expt_flow.get()
    exclude_index: list = []

    print(f"module {module}")

    ops.run(ops.set_input(status=sts))

    if module == 'Reporter':
        break

    sts.put(module, expt, ops)

    if module == cfg.sequence[-1]:
        full_expt = sts.get_full_expt()
        full_expt_str = '_'.join(
            [f"{module}[{expt}]" for module, expt in full_expt.items()]
        )
        result[full_expt_str] = deepcopy(sts.get_result(module=module))


module Loader
Loader - Benchmarker: file benchmark\adult.csv already exist and match SHA-256.
                      PETsARD will ignore download and use local data directly.
module Splitter
module Preprocessor
module Synthesizer
Synthesizer (SDV - SingleTable): Metafile loading time: 0.0161 sec.
Synthesizer (SDV - SingleTable): Fitting GaussianCopula.
Synthesizer (SDV - SingleTable): Fitting  GaussianCopula spent 9.4189 sec.
Synthesizer (SDV - SingleTable): Sampling GaussianCopula # 21462 rows (same as raw) in 1.5624 sec.
module Postprocessor
module Evaluator
Generating report ...
(1/2) Evaluating Column Shapes: : 100%|██████████| 15/15 [00:00<00:00, 74.03it/s]
(2/2) Evaluating Column Pair Trends: : 100%|██████████| 105/105 [00:05<00:00, 18.96it/s]

Overall Score: 74.2%

Properties:
- Column Shapes: 90.83%
- Column Pair Trends: 57.56%
module Reporter
Now is Exec_Yaml_Loader[adult]_Splitter[0.8_[2-1]]_Preprocessor[missing-drop]_Synthesizer[sdv-gaussian] save to csv...
Now is Exec_Yaml_L

In [9]:
ops.set_input(status=sts)['data'].keys()

dict_keys([('Loader', 'adult'), ('Splitter', '0.8_[1|2]_[train]_[validation]'), ('Preprocessor', 'missing-drop'), ('Synthesizer', 'sdv-gaussian'), ('Postprocessor', 'missing-drop'), ('Evaluator', 'sd-qlt_[global]_[columnwise]_[pairwise]')])

In [15]:
sts.status

{'Loader': {'expt': 'adult',
  'operator': <PETsARD.operator.LoaderOperator at 0x209715b0b80>},
 'Splitter': {'expt': '0.8_[1|2]',
  'operator': <PETsARD.operator.SplitterOperator at 0x209715b28f0>},
 'Preprocessor': {'expt': 'missing-drop',
  'operator': <PETsARD.operator.PreprocessorOperator at 0x209358725c0>},
 'Synthesizer': {'expt': 'sdv-gaussian',
  'operator': <PETsARD.operator.SynthesizerOperator at 0x2093593cb80>},
 'Postprocessor': {'expt': 'missing-drop',
  'operator': <PETsARD.operator.PostprocessorOperator at 0x20935873370>},
 'Evaluator': {'expt': 'sd-qlt',
  'operator': <PETsARD.operator.EvaluatorOperator at 0x209358907f0>}}

In [14]:

module_idx = sts.sequence.index('Preprocessor') + 1
sub_sequence = sts.sequence[:module_idx]
{
                module: sts.status[seq_module]['expt']
                for seq_module in sub_sequence
            }

{'Reporter': 'missing-drop'}

In [5]:
{
module: sts.status[seq_module]['expt']
                for seq_module in sts.sequence if seq_module in sts.status
            }

{'Reporter': 'sd-qlt'}

In [3]:
ops.set_input(status=sts)

KeyError: None

In [4]:
sts.get_full_expt()

{None: 'sd-qlt'}

In [3]:
sts.get_full_expt()

{None: 'sd-qlt'}

In [6]:
sts.get_full_expt('Synthesizer')

{'Loader': 'adult',
 'Splitter': '0.5_[2|2]',
 'Preprocessor': 'missing-drop',
 'Synthesizer': 'sdv-gaussian'}

'Loader[adult]_Splitter[0.5_[2|2]]_Preprocessor[missing-drop]_Synthesizer[sdv-gaussian]'

In [6]:
full_expt = sts.get_full_expt()
sequence = sts.sequence

input = {}
for module, expt in full_expt.items():
    result = sts.get_result(module=module)
    if isinstance(result, dict):
        for key, value in result.items():
            index = (module, expt, key)
            input[index] = deepcopy(value)
    else:
        index = (module, expt)
        input[index] = deepcopy(result)


source = ['Synthesizer', 'Evaluator', 'missing-drop']

pp.pprint(input)

{('Evaluator', 'sd-qlt', 'columnwise'):                       Property        Metric     Score
age              Column Shapes  KSComplement  0.939937
workclass        Column Shapes  TVComplement  0.993062
fnlwgt           Column Shapes  KSComplement  0.962958
education        Column Shapes  TVComplement  0.550583
educational-num  Column Shapes  KSComplement  0.793711
marital-status   Column Shapes  TVComplement  0.971827
occupation       Column Shapes  TVComplement  0.975564
relationship     Column Shapes  TVComplement  0.964520
race             Column Shapes  TVComplement  0.995613
gender           Column Shapes  TVComplement  0.993206
capital-gain     Column Shapes  KSComplement  0.919209
capital-loss     Column Shapes  KSComplement  0.953401
hours-per-week   Column Shapes  KSComplement  0.684535
native-country   Column Shapes  TVComplement  0.992433
income           Column Shapes  TVComplement  0.990853,
 ('Evaluator', 'sd-qlt', 'global'):            Score  Column Shapes  Column Pai

In [None]:
from PETsARD import Reporter


rpt = Reporter(
    method='save_report'
)
rpt.create(data=eval.get_global())

from PETsARD.report import Reporter



# Functional Test: Executor

## run()

In [None]:
import os
from pprint import pprint
import sys

sys.path.append(os.path.dirname(os.getcwd()))


para_Executor = {
    'Loader': {
        'adult': {
            'filepath': 'benchmark://adult',
            'na_values': {k: '?' for k in [
                'workclass',
                'occupation',
                'native-country'
            ]}
        }
    },
    'Splitter': {
        '0.8': {
            'num_samples': 2,
            'train_split_ratio': 0.8,
        }
    },
    'Processor': {
        'drop-IQR-stanard-label': {
            'missing': {
                'method': 'missing_drop',
                'all': True
            },
            #'method': , # ValueError: y contains previously unseen labels:
            'encoder': [
                {'method': 'encoder_label',
                 'include': ['education','marital-status','relationship','gender']
                },
                {'method': 'encoder_uniform',
                 'include': ['workclass', 'occupation', 'race', 'native-country', 'income']
                }
            ],
            'outlier': {
                'method': 'outlier_iqr',
                'include': 'hours-per-week'
            },
            'scaler': {
                'method': 'scaler_standard',
                'exclude': ['hours-per-week',
                    'workclass', 'education', 'marital-status',
                    'occupation', 'relationship', 'race', 'gender',
                    'native-country', 'income'
                ]
            }
        }
    },
    'Synthesizer': {
        'GaussianCopula': {
            'synthesizing_method': 'sdv-singletable-gaussiancopula'
        }
    },
    'Evaluator': {
        'anonymeter-SinglingOut': {
            'evaluating_method': 'anonymeter-singlingout-univariate',
            'anonymeter_n_attacks': 1,  # 2000'
            'anonymeter_num_samples': 2
        }
    }
}

executor_single = PETsARD.Executor(**para_Executor)
executor_single.run()
pprint(
    executor_single.evaluator[(
        'adult',
        '0.8',
        1,
        'drop-IQR-stanard-label',
        'GaussianCopula',
        'anonymeter-SinglingOut',
        1
    )].Evaluator.evaluation
)

Loader - Benchmarker: file benchmark\adult.csv already exist and match SHA-256.
                      PETsARD will ignore download and use local data directly.
Executor - Loader: adult loading time: 7.0139 sec.
Executor - Splitter: 0.8 splitting time: 0.0689 sec.
No self-defined config passed.  Generate a config automatically.
Executor - Processor (preprocessing): drop-IQR-stanard-label processing time: 0.4141 sec.
Synthesizer (SDV - SingleTable): Metafile loading time: 0.0469 sec.
Synthesizer (SDV - SingleTable): Fitting GaussianCopula.
Synthesizer (SDV - SingleTable): Fitting  GaussianCopula spent 7.8931 sec.
Synthesizer (SDV - SingleTable): Sampling GaussianCopula # 21557 rows (same as raw) in 1.2511 sec.
Executor - Synthesizer: GaussianCopula synthesizing time: 9.1911 sec.
Executor - Processor (postprocessing): drop-IQR-stanard-label processing time: 0.0288 sec.
Evaluator (Anonymeter - SinglingOut - Univariate): Now is SinglingOut - Univariate Evaluator
Evaluator (Anonymeter - Sing

## .run_parallel()
Not applicable now

In [None]:
import os
from pprint import pprint
import sys

sys.path.append('D:\\Dropbox\\89_其他應用\\GitHub\\PETsARD')
os.chdir('D:\\Dropbox\\89_其他應用\\GitHub\\PETsARD\\demo')

import PETsARD


para_Executor = {
    'Loader': {
        'adult': {
            'filepath': 'benchmark://adult',
            'na_values': {k: '?' for k in [
                'workclass',
                'occupation',
                'native-country'
            ]}
        }
    },
    'Splitter': {
        '0.8': {
            'num_samples': 2,
            'train_split_ratio': 0.8,
        }
    },
    'Processor': {
        'drop-IQR-stanard-label': {
            'missingist': {
                'method': 'missingist_drop',
                'all': True
            },
            #'method': , # ValueError: y contains previously unseen labels:
            'encoder': [
                {'method': 'encoder_label',
                 'include': ['education','marital-status','relationship','gender']
                },
                {'method': 'encoder_uniform',
                 'include': ['workclass', 'occupation', 'race', 'native-country', 'income']
                }
            ],
            'outlierist': {
                'method': 'outlierist_iqr',
                'include': 'hours-per-week'
            },
            'scaler': {
                'method': 'scaler_standard',
                'exclude': ['hours-per-week',
                    'workclass', 'education', 'marital-status',
                    'occupation', 'relationship', 'race', 'gender',
                    'native-country', 'income'
                ]
            }
        }
    },
    'Synthesizer': {
        'GaussianCopula': {
            'synthesizing_method': 'sdv-singletable-gaussiancopula'
        }
    },
    'Evaluator': {
        'anonymeter-SinglingOut': {
            'evaluating_method': 'anonymeter-singlingout-univariate',
            'anonymeter_n_attacks': 1,  # 2000'
            'anonymeter_num_samples': 2
        }
    }
}

# Processor contains lambda function, but python couldn't pickle it.
# so Processor .run_parallel() didn't valid after Processor migration.
executor_parallel = PETsARD.Executor(**para_Executor)
executor_parallel.run_parallel()
pprint(
    executor_parallel.evaluator[(
        'adult',
        '0.8',
        1,
        'drop-IQR-stanard-label',
        'GaussianCopula',
        'anonymeter-SinglingOut',
        1
    )].Evaluator.evaluation
)

Loading:   0%|          | 0/1 [00:00<?, ?it/s]
Processing:   0%|          | 0/1 [00:20<?, ?it/s]s/it]
Splitting: 100%|██████████| 1/1 [00:20<00:00, 20.74s/it]
Loading: 100%|██████████| 1/1 [00:20<00:00, 20.74s/it]


AttributeError: Can't pickle local object 'Processor.__init__.<locals>.<lambda>'

# Un arrangment

In [None]:
import pandas as pd


def Result(__evaluator):
    import numpy as np
    __dict_result = {}
    __para_to_handle = [('Risk', ['risk()', 'value']), ('Risk_CI_btm', ['risk()', 'ci[0]']), ('Risk_CI_top', ['risk()', 'ci[1]']), ('Attack_Rate', ['results()', 'attack_rate', 'value']), ('Attack_Rate_err', ['results()', 'attack_rate', 'error']), ('Baseline_Rate', ['results()', 'baseline_rate', 'value']), ('Baseline_Rate_err', ['results()', 'baseline_rate', 'error']), ('Control_Rate', ['results()', 'control_rate', 'value']), ('Control_Rate_err', ['results()', 'control_rate', 'error'])
                        ]
    for __key, __attrs in __para_to_handle:
        try:
            __attr_value = __evaluator
            for __attr in __attrs:
                if '()' in __attr:
                    __method_name = __attr.split('(')[0]
                    if hasattr(__attr_value, __method_name):
                        __method = getattr(__attr_value, __method_name)
                        if callable(__method):
                            __attr_value = __method()
                        else:
                            __dict_result[__key] = np.nan
                            break
                    else:
                        __dict_result[__key] = np.nan
                        break
                elif '[' in __attr:
                    __attr_name = __attr.split('[')[0]
                    __index = int(__attr.split('[')[1].rstrip(']'))
                    if hasattr(__attr_value, __attr_name)\
                            and isinstance(getattr(__attr_value, __attr_name), (list, dict, tuple)):
                        try:
                            __attr_value = getattr(
                                __attr_value, __attr_name)[__index]
                        except (IndexError, KeyError):
                            __dict_result[__key] = np.nan
                            break
                    else:
                        __dict_result[__key] = np.nan
                        break
                else:
                    __attr_value = getattr(__attr_value, __attr)
            __dict_result[__key] = __attr_value
        except Exception as e:
            __dict_result[__key] = np.nan
    return __dict_result


for i in ['01', '02', '03']:  # ,'04','05','06'
    from anonymeter.evaluators import SinglingOutEvaluator
    print(f"PETsARD[20231224-085805]_Trial[{i}][Ori].csv")
    evaluator = SinglingOutEvaluator(ori=pd.read_csv(f"PETsARD[20231224-085805]_Trial[{i}][Ori].csv"), syn=pd.read_csv(f"PETsARD[20231224-085805]_Trial[{i}-1-1]Postproc.csv"), control=pd.read_csv(f"PETsARD[20231224-085805]_Trial[{i}][Ctrl].csv"), n_attacks=2000
                                     )
    try:
        evaluator.evaluate(mode='univariate')
        print(Result(evaluator))
    except RuntimeError as ex:
        print(f"Singling out evaluation failed with {ex}."
              "Please re-run this cell."
              "For more stable results increase `n_attacks`. Note that this will "
              "make the evaluation slower.")


In [None]:
import itertools
for type in ['][Ori]', '][Ctrl]', '-1-1]Postproc']:
    for combo in itertools.combinations(['01', '02', '03', '04', '05', '06'], 2):
        df_a = pd.read_csv(
            f"PETsARD[20231224-085805]_Trial[{combo[0]}{type}.csv")
        df_b = pd.read_csv(
            f"PETsARD[20231224-085805]_Trial[{combo[1]}{type}.csv")
        if df_a.equals(df_b):
            print(type+': '+str(combo))
            print("They're same!!??")
        # else:
            # print(type+': '+str(combo))
            # print("They're different.")
print('done.')


In [None]:
import pandas as pd


def Result(__evaluator):
    import numpy as np
    __dict_result = {}
    __para_to_handle = [('Risk', ['risk()', 'value']), ('Risk_CI_btm', ['risk()', 'ci[0]']), ('Risk_CI_top', ['risk()', 'ci[1]']), ('Attack_Rate', ['results()', 'attack_rate', 'value']), ('Attack_Rate_err', ['results()', 'attack_rate', 'error']), ('Baseline_Rate', ['results()', 'baseline_rate', 'value']), ('Baseline_Rate_err', ['results()', 'baseline_rate', 'error']), ('Control_Rate', ['results()', 'control_rate', 'value']), ('Control_Rate_err', ['results()', 'control_rate', 'error'])
                        ]
    for __key, __attrs in __para_to_handle:
        try:
            __attr_value = __evaluator
            for __attr in __attrs:
                if '()' in __attr:
                    __method_name = __attr.split('(')[0]
                    if hasattr(__attr_value, __method_name):
                        __method = getattr(__attr_value, __method_name)
                        if callable(__method):
                            __attr_value = __method()
                        else:
                            __dict_result[__key] = np.nan
                            break
                    else:
                        __dict_result[__key] = np.nan
                        break
                elif '[' in __attr:
                    __attr_name = __attr.split('[')[0]
                    __index = int(__attr.split('[')[1].rstrip(']'))
                    if hasattr(__attr_value, __attr_name)\
                            and isinstance(getattr(__attr_value, __attr_name), (list, dict, tuple)):
                        try:
                            __attr_value = getattr(
                                __attr_value, __attr_name)[__index]
                        except (IndexError, KeyError):
                            __dict_result[__key] = np.nan
                            break
                    else:
                        __dict_result[__key] = np.nan
                        break
                else:
                    __attr_value = getattr(__attr_value, __attr)
            __dict_result[__key] = __attr_value
        except Exception as e:
            __dict_result[__key] = np.nan
    return __dict_result


for i in ['01', '02', '03']:  # ,'04','05','06'
    from anonymeter.evaluators import SinglingOutEvaluator
    print(f"PETsARD[20231224-085805]_Trial[{i}][Ori].csv")
    evaluator = SinglingOutEvaluator(ori=pd.read_csv(f"PETsARD[20231224-085805]_Trial[{i}][Ori].csv"), syn=pd.read_csv(f"PETsARD[20231224-085805]_Trial[{i}-1-1]Postproc.csv"), control=pd.read_csv(f"PETsARD[20231224-085805]_Trial[{i}][Ctrl].csv"), n_attacks=2000
                                     )
    try:
        evaluator.evaluate(mode='univariate')
        print(Result(evaluator))
    except RuntimeError as ex:
        print(f"Singling out evaluation failed with {ex}."
              "Please re-run this cell."
              "For more stable results increase `n_attacks`. Note that this will "
              "make the evaluation slower.")


Executor - Loader: adult loading time: 6.8097 sec.
Executor - Splitter: 0.8 splitting time: 0.339 sec.
Preprocessor - Outlierist (IQR): Dropped  1060 rows on fnlwgt         . Kept [-63981.5, 419234.5] only.
Preprocessor - Outlierist (IQR): Dropped   227 rows on educational-num. Kept [3.0, 19.0] only.
Preprocessor - Outlierist (IQR): Dropped  1705 rows on capital-loss   . Kept [0.0, 0.0] only.
Preprocessor - Outlierist (IQR): Dropped  9432 rows on hours-per-week . Kept [32.5, 52.5] only.
Preprocessor - Outlierist (IQR): Dropped   214 rows on age            . Kept [-0.5, 75.5] only.
Preprocessor - Outlierist (IQR): Dropped  3030 rows on capital-gain   . Kept [0.0, 0.0] only.
Preprocessor - Outlierist (IQR): Totally Dropped 13932 in 36207 rows.
Preprocessor - Encoder (Label): Column native-country  been labelized from 0 to 39.
Preprocessor - Encoder (Label): Column gender          been labelized from 0 to  1.
Preprocessor - Encoder (Label): Column race            been labelized from 0 to 

Found 765 failed queries out of 2000. Check DEBUG messages for more details.


Executor - Evaluator: anonymeter-SinglingOut at 1 trials evaluating time: 131.365 sec.
Evaluator (Anonymeter - SinglingOut - Univariate): Now is SinglingOut - Univariate Evaluator
Evaluator (Anonymeter - SinglingOut - Univariate): Evaluator time: 0.0322 sec.
Evaluator (Anonymeter): Evaluating  SinglingOut - Univariate.


Found 802 failed queries out of 2000. Check DEBUG messages for more details.


Executor - Evaluator: anonymeter-SinglingOut at 2 trials evaluating time: 131.1331 sec.
Evaluator (Anonymeter - SinglingOut - Univariate): Now is SinglingOut - Univariate Evaluator
Evaluator (Anonymeter - SinglingOut - Univariate): Evaluator time: 0.0336 sec.
Evaluator (Anonymeter): Evaluating  SinglingOut - Univariate.


Found 830 failed queries out of 2000. Check DEBUG messages for more details.


Executor - Evaluator: anonymeter-SinglingOut at 3 trials evaluating time: 131.5346 sec.
Evaluator (Anonymeter - SinglingOut - Univariate): Now is SinglingOut - Univariate Evaluator
Evaluator (Anonymeter - SinglingOut - Univariate): Evaluator time: 0.0356 sec.
Evaluator (Anonymeter): Evaluating  SinglingOut - Univariate.


Found 794 failed queries out of 2000. Check DEBUG messages for more details.


Executor - Evaluator: anonymeter-SinglingOut at 4 trials evaluating time: 131.4821 sec.
Evaluator (Anonymeter - SinglingOut - Univariate): Now is SinglingOut - Univariate Evaluator
Evaluator (Anonymeter - SinglingOut - Univariate): Evaluator time: 0.0351 sec.
Evaluator (Anonymeter): Evaluating  SinglingOut - Univariate.


Found 821 failed queries out of 2000. Check DEBUG messages for more details.


Executor - Evaluator: anonymeter-SinglingOut at 5 trials evaluating time: 132.587 sec.
Evaluator (Anonymeter - SinglingOut - Univariate): Now is SinglingOut - Univariate Evaluator
Evaluator (Anonymeter - SinglingOut - Univariate): Evaluator time: 0.036 sec.
Evaluator (Anonymeter): Evaluating  SinglingOut - Univariate.


Found 800 failed queries out of 2000. Check DEBUG messages for more details.


Executor - Evaluator: anonymeter-SinglingOut at 6 trials evaluating time: 131.8783 sec.
Evaluator (Anonymeter - SinglingOut - Univariate): Now is SinglingOut - Univariate Evaluator
Evaluator (Anonymeter - SinglingOut - Univariate): Evaluator time: 0.0352 sec.
Evaluator (Anonymeter): Evaluating  SinglingOut - Univariate.


Found 799 failed queries out of 2000. Check DEBUG messages for more details.


In [None]:
import pandas as pd
for i in ['01', '02', '03']:  # ,'04','05','06'
    eval = PETsARD.Evaluator(evaluating_method='anonymeter-singlingout-univariate', data={'ori': pd.read_csv(f"PETsARD[20231224-085805]_Trial[{i}][Ori].csv"), 'syn': pd.read_csv(f"PETsARD[20231224-085805]_Trial[{i}-1-1]Postproc.csv"), 'control': pd.read_csv(f"PETsARD[20231224-085805]_Trial[{i}][Ctrl].csv")
                                                                                          }, anonymeter_n_attacks=500
                             )
    eval.eval()
    print(eval.Evaluator.evaluation)


In [None]:
import itertools
for type in ['][Ori]', '][Ctrl]', '-1-1]Postproc']:
    for combo in itertools.combinations(['01', '02', '03', '04', '05', '06'], 2):
        df_a = pd.read_csv(
            f"PETsARD[20231224-085805]_Trial[{combo[0]}{type}.csv")
        df_b = pd.read_csv(
            f"PETsARD[20231224-085805]_Trial[{combo[1]}{type}.csv")
        if df_a.equals(df_b):
            print(type+': '+str(combo))
            print("They're same!!??")
        # else:
            # print(type+': '+str(combo))
            # print("They're different.")
print('done.')


Executor - Evaluator: anonymeter-SinglingOut at 7 trials evaluating time: 131.5421 sec.
Evaluator (Anonymeter - SinglingOut - Univariate): Now is SinglingOut - Univariate Evaluator
Evaluator (Anonymeter - SinglingOut - Univariate): Evaluator time: 0.0354 sec.
Evaluator (Anonymeter): Evaluating  SinglingOut - Univariate.


In [None]:
{
    'Loader': {
        'NHANES': {
            'filepath': '../[sunset]/data/[NHANES] B.csv',
            'header_exist': False,
            'header_names': ['gen', 'age', 'race', 'edu', 'mar', 'bmi', 'dep', 'pir', 'gh', 'mets', 'qm', 'dia']
        }
    }
}
