# Environment

In [1]:
from copy import deepcopy
import os
import pprint
import sys

import yaml

# Make the parent of the current working directory importable (presumably the
# repository root that contains the PETsARD package — confirm for your layout).
parent_dir = os.path.dirname(os.getcwd())
sys.path.append(parent_dir)


# Config

## Raw YAML

In [2]:
filename = 'DevTest_Exec_Yaml.yaml'

# Parse the experiment YAML into plain Python objects, without any PETsARD
# processing. The encoding is pinned to UTF-8 so the result does not depend on
# the platform's default locale encoding.
with open(filename, 'r', encoding='utf-8') as yaml_file:
    yaml_raw: dict = yaml.safe_load(yaml_file)

# depth=3 limits how many nesting levels are printed; deeper levels show as {...}.
pp = pprint.PrettyPrinter(depth=3)
pp.pprint(yaml_raw)
# print(yaml_raw.keys())

{'Evaluator': {'anony-inference': {'method': 'anonymeter-inference'},
               'anony-linkability': {'method': 'anonymeter-linkability'},
               'anony-singlingout': {'method': 'anonymeter_singlingout_univariate'},
               'sd-diag': {'method': 'sdmetrics-diagnosticreport'},
               'sd-qlt': {'method': 'sdmetrics-qualityreport'}},
 'Loader': {'adult': {'filepath': 'benchmark://adult', 'na_values': {...}},
            'adult_local': {'filepath': 'benchmark/adult.csv',
                            'na_values': {...}}},
 'Postprocessor': {'missing-drop': None},
 'Preprocessor': {'missing-drop': {'missing': {...}}},
 'Splitter': {'0.8': {'num_samples': 2, 'train_split_ratio': 0.8}},
 'Synthesizer': {'sdv-copulagan': {'method': 'sdv-ctgan'},
                 'sdv-gaussian': {'method': 'sdv-gaussiancopula'},
                 'sdv-tvae': {'method': 'sdv-tvae'}}}


## _splitter_handler

In [3]:
from PETsARD import Config


# Build the PETsARD Config from the same YAML file that was parsed raw above.
cfg = Config(filename=filename)

# Print the Splitter section as written in the YAML, then as exposed by
# Config, so the two can be compared side by side.
for splitter_section in (yaml_raw['Splitter'], cfg.yaml['Splitter']):
    pp.pprint(splitter_section)

{'0.8': {'num_samples': 2, 'train_split_ratio': 0.8}}
{'0.8_[1|2]': {'num_samples': 1, 'train_split_ratio': 0.8},
 '0.8_[2|2]': {'num_samples': 1, 'train_split_ratio': 0.8}}


# Operator

## LoaderOperator

In [4]:
from PETsARD.operator import LoaderOperator


# Run the Loader stage on the 'adult' experiment; it takes an empty input dict.
loader_config = cfg.yaml['Loader']['adult']
load = LoaderOperator(config=loader_config)
load.run(input={})

# Preview the first two loaded rows.
loaded_preview = load.get_result().head(2)
pp.pprint(loaded_preview)

Loader - Benchmarker: file benchmark\adult.csv already exist and match SHA-256.
                      PETsARD will ignore download and use local data directly.
   age workclass  fnlwgt education  educational-num      marital-status  \
0   25   Private  226802      11th                7       Never-married   
1   38   Private   89814   HS-grad                9  Married-civ-spouse   

          occupation relationship   race gender  capital-gain  capital-loss  \
0  Machine-op-inspct    Own-child  Black   Male             0             0   
1    Farming-fishing      Husband  White   Male             0             0   

   hours-per-week native-country income  
0              40  United-States  <=50K  
1              50  United-States  <=50K  


## SplitterOperator

In [5]:
from PETsARD.operator import SplitterOperator


# Split the loaded data using the first expanded Splitter experiment.
split = SplitterOperator(config=cfg.yaml['Splitter']['0.8_[1|2]'])
split.run(input={'data': load.get_result()})

# Preview the first two rows of each subset, labelled by subset name.
for subset in ('train', 'validation'):
    print(subset)
    pp.pprint(split.get_result()[subset].head(2))


train
   age workclass  fnlwgt education  educational-num      marital-status  \
0   25   Private  226802      11th                7       Never-married   
1   38   Private   89814   HS-grad                9  Married-civ-spouse   

          occupation relationship   race gender  capital-gain  capital-loss  \
0  Machine-op-inspct    Own-child  Black   Male             0             0   
1    Farming-fishing      Husband  White   Male             0             0   

   hours-per-week native-country income  
0              40  United-States  <=50K  
1              50  United-States  <=50K  
validation
   age workclass  fnlwgt     education  educational-num      marital-status  \
0   27   Private  205145       HS-grad                9  Married-civ-spouse   
1   18       NaN  103497  Some-college               10       Never-married   

          occupation relationship   race  gender  capital-gain  capital-loss  \
0  Machine-op-inspct      Husband  White    Male             0             

## PreprocessorOperator

In [6]:
from PETsARD.operator import PreprocessorOperator


# Run the 'missing-drop' Preprocessor experiment on the training split,
# passing the loader's metadata along with the data.
preproc = PreprocessorOperator(config=cfg.yaml['Preprocessor']['missing-drop'])
preproc_input = {
    'data': split.get_result()['train'],
    'metadata': load.loader.metadata,
}
preproc.run(input=preproc_input)

# Preview the first two preprocessed rows.
pp.pprint(preproc.get_result().head(2))

        age  workclass    fnlwgt  education  educational-num  marital-status  \
0 -0.998707   0.230785  0.350681   0.811873        -1.201444        0.488192   
1 -0.049314   0.141095 -0.947105   0.191809        -0.423285        0.213126   

   occupation  relationship      race    gender  capital-gain  capital-loss  \
0    0.773858      0.780477  0.948399  0.566120     -0.145298     -0.216261   
1    0.932510      0.300713  0.610816  0.135063     -0.145298     -0.216261   

   hours-per-week  native-country    income  
0       -0.035942        0.414725  0.692520  
1        0.778276        0.126433  0.198601  


## SynthesizerOperator

In [7]:
from PETsARD.operator import SynthesizerOperator


# Run the 'sdv-gaussian' Synthesizer experiment on the preprocessed data.
syn = SynthesizerOperator(config=cfg.yaml['Synthesizer']['sdv-gaussian'])
syn.run(input=dict(data=preproc.get_result()))

# Preview the first two synthetic rows (still in preprocessed form).
synthetic_preview = syn.get_result().head(2)
pp.pprint(synthetic_preview)

Synthesizer (SDV - SingleTable): Metafile loading time: 0.1065 sec.
Synthesizer (SDV - SingleTable): Fitting GaussianCopula.
Synthesizer (SDV - SingleTable): Fitting  GaussianCopula spent 8.128 sec.
Synthesizer (SDV - SingleTable): Sampling GaussianCopula # 21637 rows (same as raw) in 2.0305 sec.
        age  workclass    fnlwgt  education  educational-num  marital-status  \
0  0.664219   0.877736  0.196313   0.561844        -0.340629        0.267078   
1 -1.142178   0.961890 -0.184600   0.173993        -0.799866        0.479704   

   occupation  relationship      race    gender  capital-gain  capital-loss  \
0    0.025309      0.593232  0.264700  0.447200     -0.145298     -0.216261   
1    0.805191      0.552443  0.126146  0.628984     -0.145298     -0.216261   

   hours-per-week  native-country    income  
0        0.422792        0.374513  0.788171  
1        0.149619        0.702036  0.774617  


## PostprocessorOperator

In [8]:
from PETsARD.operator import PostprocessorOperator


# Run the Postprocessor on the synthetic data, handing it the processor
# object held by the Preprocessor operator.
postproc = PostprocessorOperator(config=cfg.yaml['Postprocessor']['missing-drop'])
postproc_input = {
    'preprocessor': preproc.processor,
    'data': syn.get_result(),
}
postproc.run(input=postproc_input)

# Preview the first two postprocessed rows.
pp.pprint(postproc.get_result().head(2))

         age  workclass         fnlwgt  education  educational-num  \
0  47.770363  Local-gov  210507.672420  Bachelors         9.212439   
1  23.035469        NaN  170300.356166    HS-grad         8.032123   

       marital-status      occupation   relationship   race gender  \
0  Married-civ-spouse  Prof-specialty  Not-in-family  White   Male   
1       Never-married             NaN  Not-in-family  White   Male   

   capital-gain  capital-loss  hours-per-week native-country income  
0           0.0           0.0       45.634048  United-States   >50K  
1           0.0           0.0       42.279006  United-States   >50K  


## EvaluatorOperator

In [2]:
# Bundle the three frames handed to the evaluators:
#   ori     - training split
#   syn     - postprocessed synthetic data
#   control - validation split
eval_data = dict(
    ori=split.get_result()['train'],
    syn=postproc.get_result(),
    control=split.get_result()['validation'],
)

Load existing data

In [None]:
# Optional (disabled): write the three evaluation frames to CSV files
# ('ori.csv', 'syn.csv', 'control.csv'). Uncomment to use.
# split.get_result()['train'].to_csv('ori.csv', index=False, encoding='utf-8')
# postproc.get_result().to_csv('syn.csv', index=False, encoding='utf-8')
# split.get_result()['validation'].to_csv('control.csv', index=False, encoding='utf-8')

In [1]:
# Optional (disabled): rebuild `eval_data`, `cfg` and the `Evaluator` import
# from 'ori.csv', 'syn.csv' and 'control.csv' instead of re-running the
# Loader -> Postprocessor cells. Uncomment to use.
# NOTE(review): the Evaluator cells below use `Evaluator`, which is imported
# only in this commented-out cell.

# from copy import deepcopy
# import os
# import pprint
# import sys

# import yaml

# sys.path.append(os.path.dirname(os.getcwd()))


# from PETsARD import Loader


# ori_load = Loader('ori.csv')
# syn_load = Loader('syn.csv')
# control_load = Loader('control.csv')
# ori_load.load()
# syn_load.load()
# control_load.load()

# NOTE(review): 'ori' previously read `syn_load.data`, which made 'ori' and
# 'syn' the same frame and left `ori_load` unused. That looks like a
# copy-paste slip — confirm.
# eval_data = {
#     'ori':     ori_load.data,
#     'syn':     syn_load.data,
#     'control': control_load.data,
# }

# filename = 'DevTest_Exec_Yaml.yaml'

# from PETsARD import Config


# cfg = Config(filename=filename)

# from PETsARD import Evaluator

In [10]:
# `Evaluator` is not imported by any live cell above (only inside a
# commented-out cell), so import it here to make this cell runnable after a
# kernel restart.
from PETsARD import Evaluator

# Evaluate with the 'anony-singlingout' experiment on the ori/syn/control frames.
eval_singl = Evaluator(**cfg.yaml['Evaluator']['anony-singlingout'])
eval_singl.create(data=eval_data)
eval_singl.eval()

# Show the first row of each result getter, or report that it returned None.
for method in ['get_global', 'get_columnwise', 'get_pairwise']:
    # getattr() is the idiomatic by-name lookup, replacing the direct dunder call.
    result = getattr(eval_singl, method)()
    if result is None:
        print(f'{method} is None')
    else:
        print(result.head(1))

            risk  risk_CI_btm  risk_CI_top  attack_rate  attack_rate_err  \
result  0.998999     0.997997          1.0     0.999041         0.000959   

        baseline_rate  baseline_rate_err  control_rate  control_rate_err  
result       0.013934           0.005047      0.042753          0.008814  
get_columnwise is None
get_pairwise is None


In [11]:
# Evaluate with the 'anony-linkability' experiment on the ori/syn/control frames.
eval_link = Evaluator(**cfg.yaml['Evaluator']['anony-linkability'])
eval_link.create(data=eval_data)
eval_link.eval()

# Show the first row of each result getter, or report that it returned None.
for method in ['get_global', 'get_columnwise', 'get_pairwise']:
    # getattr() is the idiomatic by-name lookup, replacing the direct dunder call.
    result = getattr(eval_link, method)()
    if result is None:
        print(f'{method} is None')
    else:
        print(result.head(1))

            risk  risk_CI_btm  risk_CI_top  attack_rate  attack_rate_err  \
result  0.289472      0.26902     0.309924      0.29689         0.020001   

        baseline_rate  baseline_rate_err  control_rate  control_rate_err  
result       0.006448           0.003374       0.01044           0.00435  
get_columnwise is None
get_pairwise is None


In [9]:
# Evaluate with the 'anony-inference' experiment on the ori/syn/control frames.
eval_infer = Evaluator(**cfg.yaml['Evaluator']['anony-inference'])
eval_infer.create(data=eval_data)
eval_infer.eval()

# Show the first row of each result getter, or report that it returned None.
for method in ['get_global', 'get_columnwise', 'get_pairwise']:
    # getattr() is the idiomatic by-name lookup, replacing the direct dunder call.
    result = getattr(eval_infer, method)()
    if result is None:
        print(f'{method} is None')
    else:
        print(result.head(1))

            risk  risk_CI_btm  risk_CI_top  attack_rate  attack_rate_err  \
result  0.998962     0.997923          1.0     0.999041         0.000959   

        baseline_rate  baseline_rate_err  control_rate  control_rate_err  
result       0.024413           0.006695      0.076813          0.011631  
get_columnwise is None
get_pairwise is None


In [6]:
# Evaluate with the 'sd-diag' experiment on the ori/syn/control frames.
eval_diag = Evaluator(**cfg.yaml['Evaluator']['sd-diag'])
eval_diag.create(data=eval_data)
eval_diag.eval()

# Show the first row of each result getter, or report that it returned None.
for method in ['get_global', 'get_columnwise', 'get_pairwise']:
    # getattr() is the idiomatic by-name lookup, replacing the direct dunder call.
    result = getattr(eval_diag, method)()
    if result is None:
        print(f'{method} is None')
    else:
        print(result.head(1))

Generating report ...
(1/2) Evaluating Data Validity: :   0%|          | 0/15 [00:00<?, ?it/s]

(1/2) Evaluating Data Validity: : 100%|██████████| 15/15 [00:00<00:00, 381.73it/s]
(2/2) Evaluating Data Structure: : 100%|██████████| 1/1 [00:00<00:00, 332.59it/s]

Overall Score: 100.0%

Properties:
- Data Validity: 100.0%
- Data Structure: 100.0%
        Score  Data Validity  Data Structure
result    1.0            1.0             1.0
          Property             Metric  Score
age  Data Validity  BoundaryAdherence    1.0
get_pairwise is None


In [8]:
# Evaluate with the 'sd-qlt' experiment on the ori/syn/control frames.
eval_qlt = Evaluator(**cfg.yaml['Evaluator']['sd-qlt'])
eval_qlt.create(data=eval_data)
eval_qlt.eval()

# Show the first row of each result getter, or report that it returned None.
for method in ['get_global', 'get_columnwise', 'get_pairwise']:
    # getattr() is the idiomatic by-name lookup, replacing the direct dunder call.
    result = getattr(eval_qlt, method)()
    if result is None:
        print(f'{method} is None')
    else:
        print(result.head(1))

Generating report ...
(1/2) Evaluating Column Shapes: :   0%|          | 0/15 [00:00<?, ?it/s]

(1/2) Evaluating Column Shapes: : 100%|██████████| 15/15 [00:00<00:00, 195.87it/s]
(2/2) Evaluating Column Pair Trends: : 100%|██████████| 105/105 [00:04<00:00, 25.20it/s]

Overall Score: 100.0%

Properties:
- Column Shapes: 100.0%
- Column Pair Trends: 100.0%
        Score  Column Shapes  Column Pair Trends
result    1.0            1.0                 1.0
          Property        Metric  Score
age  Column Shapes  KSComplement    1.0
                            Property                 Metric  Score  \
(age, workclass)  Column Pair Trends  ContingencySimilarity    1.0   

                  Real Correlation  Synthetic Correlation  
(age, workclass)               NaN                    NaN  


In [1]:
from PETsARD.operator import EvaluatorOperator


# Run the Evaluator stage through its operator wrapper on `eval_data`.
# NOTE(review): `eval` shadows the Python built-in of the same name for the
# rest of the kernel session — consider a different variable name.
eval = EvaluatorOperator(config=cfg.yaml['Evaluator']['anony-singlingout'])
eval.run(input={'data': eval_data})
# NOTE(review): the operator is configured with 'anony-singlingout', but this
# line reads the 'Column Pair Trends' details, which elsewhere in this notebook
# appear only in 'sd-qlt' results — confirm which experiment is intended.
pp.pprint(eval.get_result()['details']['Column Pair Trends'].head(2))

ModuleNotFoundError: No module named 'PETsARD'

In [2]:
from copy import deepcopy
import sys
from pprint import pprint


# NOTE(review): machine-specific absolute path; this only works on the
# author's machine. Consider deriving the location relative to the notebook.
sys.path.append('D:/Dropbox/89_other_application/GitHub/PETsARD')
filename = 'DevTest_Exec_Yaml.yaml'

from PETsARD.config import Config, Status

cfg = Config(filename=filename)

# Print the full contents of each flow by calling get() once per queued item.
# Presumably this drains the queues, so `cfg` must be rebuilt before reuse.
for flow in (cfg.config, cfg.module_flow, cfg.expt_flow):
    pprint([flow.get() for _ in range(flow.qsize())])

AttributeError: 'AnonymeterSinglingOutUnivariate' object has no attribute 'data_ori'

In [3]:
# Start from a fresh Config and a Status object that tracks finished steps.
cfg = Config(filename=filename)
sts: Status = Status(config=cfg)
pre_module: str = ''  # not used anywhere in the visible cells
result = {}

# The three queues are consumed in lock-step: one operator, its module name
# and its experiment name per iteration.
while cfg.config.qsize() > 0:
    ops, module, expt = (
        cfg.config.get(),
        cfg.module_flow.get(),
        cfg.expt_flow.get(),
    )
    exclude_index: list = []  # not used anywhere in the visible cells

    print(f"module {module}")

    # The operator builds its own input from the current status, then runs.
    step_input = ops.set_input(status=sts)
    ops.run(step_input)

    sts.put(module, expt, ops)

    # Results are only recorded once the last module of the sequence has run.
    if module != cfg.sequence[-1]:
        continue

    # Key format: "<Module>[<experiment>]" segments joined by underscores.
    full_expt = sts.get_full_expt()
    full_expt_str = '_'.join(
        f"{mod_name}[{expt_name}]" for mod_name, expt_name in full_expt.items()
    )
    result[full_expt_str] = deepcopy(sts.get_result(module=module))


module Loader
Loader - Benchmarker: file benchmark\adult.csv already exist and match SHA-256.
                      PETsARD will ignore download and use local data directly.
module Splitter
module Preprocessor
module Synthesizer
Synthesizer (SDV - SingleTable): Metafile loading time: 0.0786 sec.
Synthesizer (SDV - SingleTable): Fitting GaussianCopula.
Synthesizer (SDV - SingleTable): Fitting  GaussianCopula spent 10.2012 sec.
Synthesizer (SDV - SingleTable): Sampling GaussianCopula # 21661 rows (same as raw) in 1.4333 sec.
module Postprocessor
module Evaluator
Evaluator (SDMetrics): Evaluating QualityReport.
Generating report ...
(1/2) Evaluating Column Shapes: : 100%|██████████| 15/15 [00:00<00:00, 72.67it/s]
(2/2) Evaluating Column Pair Trends: : 100%|██████████| 105/105 [00:04<00:00, 24.42it/s]

Overall Score: 73.44%

Properties:
- Column Shapes: 90.43%
- Column Pair Trends: 56.45%
Evaluator (SDMetrics): Evaluating QualityReport spent 4.5076 sec.
module Splitter
module Preprocessor


In [7]:
# NOTE(review): this rebinds `pprint` to the module; an earlier cell bound the
# same name to the function via `from pprint import pprint`.
import pprint
# depth=1 asks the printer to show only the first nesting level of `result`.
pp = pprint.PrettyPrinter(depth=1)
pp.pprint(result)

{'Loader[adult]_Splitter[0.5_[1|2]]_Preprocessor[missing-drop]_Synthesizer[sdv-gaussian]_Postprocessor[missing-drop]_Evaluator[sd-qlt]': {'details': {'Column Pair Trends':            Column 1         Column 2                 Metric     Score  \
0               age        workclass  ContingencySimilarity  0.864082   
1               age           fnlwgt  CorrelationSimilarity  0.997330   
2               age        education  ContingencySimilarity  0.387922   
3               age  educational-num  CorrelationSimilarity  0.994071   
4               age   marital-status  ContingencySimilarity  0.752679   
..              ...              ...                    ...       ...   
100    capital-loss   native-country  ContingencySimilarity  0.006675   
101    capital-loss           income  ContingencySimilarity  0.006675   
102  hours-per-week   native-country  ContingencySimilarity  0.523088   
103  hours-per-week           income  ContingencySimilarity  0.512204   
104  native-country      

In [10]:
from copy import deepcopy
import sys
from pprint import pprint


# NOTE(review): machine-specific absolute path; this only works on the
# author's machine. Consider deriving the location relative to the notebook.
sys.path.append('D:/Dropbox/89_other_application/GitHub/PETsARD')
filename = 'DevTest_Exec_Yaml.yaml'

from PETsARD import Executor

# Run the experiment through the Executor, passing an explicit module sequence.
# NOTE(review): `exec` shadows the Python built-in of the same name.
module_sequence = ['Loader', 'Evaluator']
exec = Executor(config=filename, sequence=module_sequence)
exec.run()


Now is Loader with adult...
Loader - Benchmarker: file benchmark\adult.csv already exist and match SHA-256.
                      PETsARD will ignore download and use local data directly.
Now is Splitter with 0.8_[1|2]...
Now is Preprocessor with missing-drop...
Now is Synthesizer with sdv-gaussian...
Synthesizer (SDV - SingleTable): Metafile loading time: 0.0156 sec.
Synthesizer (SDV - SingleTable): Fitting GaussianCopula.
Synthesizer (SDV - SingleTable): Fitting  GaussianCopula spent 8.7702 sec.
Synthesizer (SDV - SingleTable): Sampling GaussianCopula # 21595 rows (same as raw) in 1.7304 sec.
Now is Postprocessor with missing-drop...
Now is Evaluator with sd-qlt...
Evaluator (SDMetrics): Evaluating QualityReport.
Generating report ...
(1/2) Evaluating Column Shapes: : 100%|██████████| 15/15 [00:00<00:00, 83.92it/s]
(2/2) Evaluating Column Pair Trends: : 100%|██████████| 105/105 [00:05<00:00, 18.26it/s]

Overall Score: 73.91%

Properties:
- Column Shapes: 90.85%
- Column Pair Trends: 

ValueError: bins must increase monotonically.

In [None]:
# NOTE(review): this cell has no execution count and looks like a sketch for
# resuming an Executor run from a previously saved status object.
# `load` is not defined as a function in any visible cell (earlier it is bound
# to a LoaderOperator instance), and '.pkl' has no file stem — confirm the
# intended loader and path before running.
sts = load('.pkl')
# NOTE(review): `exec` shadows the Python built-in of the same name.
exec = Executor(config=filename)
exec.status = sts
exec.run()

In [11]:
# Display the results held by the Executor (`exec` is the Executor instance
# created in an earlier cell, not the built-in).
exec.get_result()

{'Loader[adult]_Splitter[0.8_[1|2]]_Preprocessor[missing-drop]_Synthesizer[sdv-gaussian]_Postprocessor[missing-drop]_Evaluator[sd-qlt]': {'score': 0.7391440757657324,
  'properties': {'Column Shapes': {'Score': 0.9085121891000267},
   'Column Pair Trends': {'Score': 0.5697759624314381}},
  'details': {'Column Shapes':              Column        Metric     Score
   0               age  KSComplement  0.939809
   1         workclass  TVComplement  0.993492
   2            fnlwgt  KSComplement  0.955519
   3         education  TVComplement  0.504252
   4   educational-num  KSComplement  0.794910
   5    marital-status  TVComplement  0.967206
   6        occupation  TVComplement  0.978661
   7      relationship  TVComplement  0.963878
   8              race  TVComplement  0.994605
   9            gender  TVComplement  0.997304
   10     capital-gain  KSComplement  0.917974
   11     capital-loss  KSComplement  0.953523
   12   hours-per-week  KSComplement  0.692639
   13   native-country  T