# Environment setting
環境設定

In [1]:
import os
import requests
from pathlib import Path


# Git branch of nics-tw/petsard that the demo assets are fetched from.
# NOTE(review): currently pinned to a feature branch; switch back to 'main' once it is merged.
branch = '628-guide---tutorial'  # 'main'

# A Colab runtime is detected by the presence of the COLAB_GPU environment variable.
is_colab = 'COLAB_GPU' in os.environ

if is_colab:
    # Download utils.py from GitHub so the next cell can `from utils import ...`.
    utils_url = f"https://raw.githubusercontent.com/nics-tw/petsard/{branch}/demo/utils.py"
    # Bound the request: without a timeout a stalled connection would hang the cell indefinitely.
    response = requests.get(utils_url, timeout=30)

    # Fail loudly on anything other than a plain 200 instead of writing an error page to disk.
    if response.status_code != 200:
        raise RuntimeError(f"Failed to download utils.py. Status code: {response.status_code}")

    # Explicit encoding so non-ASCII content is written the same way on every platform.
    with open('utils.py', 'w', encoding='utf-8') as f:
        f.write(response.text)

    # Create an empty __init__.py next to the downloaded module.
    Path('__init__.py').touch()

In [2]:
# Import the demo helpers from utils.py (the previous cell downloads it when running on Colab).
from utils import get_yaml_path, setup_environment

# Prepare the environment for this demo. The two benchmark names correspond to the
# 'benchmark/adult-income.csv' and 'benchmark/adult-income_syn.csv' paths used by the
# YAML configurations shown later in this notebook.
setup_environment(
    is_colab,
    branch,
    benchmark_data=['adult-income', 'adult-income_syn'],
)

Obtaining file:///Users/justyn.chen/Dropbox/310_Career_%E5%B7%A5%E4%BD%9C/20231016_NICS_%E8%B3%87%E5%AE%89%E9%99%A2/41_PETsARD/petsard
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Checking if build backend supports build_editable: started
  Checking if build backend supports build_editable: finished with status 'done'
  Getting requirements to build editable: started
  Getting requirements to build editable: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
  Preparing editable metadata (pyproject.toml): started
  Preparing editable metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: petsard
  Building editable for petsard (pyproject.toml): started
  Building editable for petsard (pyproject.toml): finished with status 'done'
  Created wheel for petsard: filename=petsard-1.0.0-py3-none-any.whl size=6548 

In [3]:
from petsard import Executor

# YAML Configuration for PETsARD
PETsARD 的 YAML 設定

## Case 1: Default Synthesis
情境一：預設合成

In [4]:
# Case 1: name the config file, then resolve it to a path for the current environment.
# In the recorded run this call also echoed the YAML content shown below the cell.
yaml_file_case1: str = 'default-synthesis.yaml'
yaml_path_case1: str = get_yaml_path(
    yaml_file=yaml_file_case1,
    branch=branch,
    is_colab=is_colab,
)

Configuration content:
---
Loader:
  data:
    filepath: 'benchmark/adult-income.csv'
Preprocessor:
  demo:
    method: 'default'
Synthesizer:
  demo:
    method: 'default' # sdv-single_table-gaussiancopula
Postprocessor:
  demo:
    method: 'default'
Reporter:
  output:
    method: 'save_data'
    output: 'result'
    source: 'Synthesizer'
...


### Execution and Result
執行與結果

In [5]:
# Build an Executor from the Case 1 YAML config and run the workflow it describes.
# The results stay on the executor and are read back with get_result() in the next cell.
exec_case1 = Executor(config=yaml_path_case1)
exec_case1.run()



Synthesizer (SDV): Fitting GaussianCopula.
Synthesizer (SDV): Fitting GaussianCopula spent 2.1609 sec.


INFO:root:age changes data dtype from float64 to int8 for metadata alignment.
INFO:root:workclass changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:fnlwgt changes data dtype from float64 to int32 for metadata alignment.
INFO:root:education changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:educational-num changes data dtype from float64 to int8 for metadata alignment.
INFO:root:marital-status changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:occupation changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:relationship changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:race changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:gender changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:capi

Synthesizer (SDV): Sampling GaussianCopula # 48842 rows (same as Loader data) in 0.5427 sec.
Now is result_Loader[data]_Preprocessor[demo]_Synthesizer[demo] save to csv...


INFO:PETsARD.Reporter:Completed Reporter execution (elapsed: 0:00:00)
INFO:PETsARD.Executor:Completed PETsARD execution workflow (elapsed: 0:00:04)


In [6]:
# get_result() is indexed twice: first by the full experiment name (every module in the
# pipeline, in order), then by the name of the saved data set inside that experiment.
case1_experiment_key = 'Loader[data]_Preprocessor[demo]_Synthesizer[demo]_Postprocessor[demo]_Reporter[output]'
case1_data_key = 'Loader[data]_Preprocessor[demo]_Synthesizer[demo]'

exec_case1.get_result()[case1_experiment_key][case1_data_key]

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,-0.089914,0.851651,0.623325,0.446855,2.129094,0.461449,0.507857,0.121504,0.531420,0.356102,-0.144804,-0.217127,0.096103,0.680793,0.534446
1,1.121258,0.638616,0.585764,0.544370,-0.104452,0.643015,0.732614,0.406424,0.245271,0.168775,-0.144804,-0.217127,-0.216195,0.786480,0.011851
2,-0.021205,0.884037,0.752404,0.767231,-1.313219,0.707065,0.785810,0.861048,0.916445,0.794554,-0.144804,-0.217127,0.068897,0.414325,0.486825
3,-0.850037,0.245680,-0.038767,0.185151,0.147001,0.391011,0.768122,0.292332,0.514725,0.304694,-0.144804,-0.217127,-0.151420,0.436926,0.749135
4,1.313946,0.655230,-1.453690,0.470895,0.611696,0.880348,0.090954,0.946271,0.811353,0.673674,-0.144804,-0.217127,0.212980,0.490961,0.682207
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,-0.899996,0.120701,-0.788104,0.705796,-0.935946,0.490961,0.764303,0.330513,0.729479,0.543267,-0.144804,-0.217127,0.360741,0.763389,0.427716
48838,-0.607098,0.809675,0.128032,0.392157,0.564407,0.731233,0.935676,0.664955,0.445275,0.927688,-0.144804,-0.217127,0.928830,0.883687,0.001289
48839,-1.266886,0.406154,-0.795331,0.105796,0.248872,0.166496,0.088820,0.535187,0.661078,0.344270,-0.144804,-0.217127,-0.006344,0.089440,0.100547
48840,0.818770,0.445396,-0.672565,0.681976,0.751209,0.466357,0.779889,0.211444,0.912063,0.146872,-0.144804,-0.217127,0.435805,0.838158,0.583783


## Case 2: Default Synthesis and Default Evaluation
情境二：預設合成與預設評測

In [23]:
# Case 2: name the config file, then resolve it to a path for the current environment.
# In the recorded run this call also echoed the YAML content shown below the cell.
yaml_file_case2: str = 'default-synthesis-default-evaluation.yaml'
yaml_path_case2: str = get_yaml_path(
    yaml_file=yaml_file_case2,
    branch=branch,
    is_colab=is_colab,
)

Configuration content:
---
Loader:
  data:
    filepath: 'benchmark/adult-income.csv'
Splitter:
  demo:
    num_samples: 1
    train_split_ratio: 0.8
Preprocessor:
  demo:
    method: 'default'
Synthesizer:
  demo:
    method: 'default'
Postprocessor:
  demo:
    method: 'default'
Evaluator:
  demo-diagnostic:
    method: 'sdmetrics-diagnosticreport'
  demo-quality:
    method: 'sdmetrics-qualityreport'
  demo-singlingout:
    method: 'anonymeter-singlingout'
    max_attempts: 10
  demo-linkability:
    method: 'anonymeter-linkability'
    aux_cols:
      -
        - 'age'
        - 'marital-status'
        - 'relationship'
        - 'gender'
      -
        - 'workclass'
        - 'educational-num'
        - 'occupation'
        - 'income'
  demo-inference:
    method: 'anonymeter-inference'
    secret: 'income'
  demo-classification:
    method: 'mlutility-classification'
    target: 'income'
Reporter:
  output:
    method: 'save_data'
    output: 'result'
    source: 'Synthesizer'
 

### Execution and Result
執行與結果

In [24]:
# Build an Executor from the Case 2 YAML config and run the workflow it describes.
# In the recorded run the whole workflow took about 4 minutes (0:03:57), almost all of it
# spent in the demo-classification evaluator (0:03:45).
exec_case2 = Executor(config=yaml_path_case2)
exec_case2.run()



Synthesizer (SDV): Fitting GaussianCopula.
Synthesizer (SDV): Fitting GaussianCopula spent 1.6952 sec.


INFO:root:age changes data dtype from float64 to int8 for metadata alignment.
INFO:root:workclass changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:fnlwgt changes data dtype from float64 to int32 for metadata alignment.
INFO:root:education changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:educational-num changes data dtype from float64 to int8 for metadata alignment.
INFO:root:marital-status changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:occupation changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:relationship changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:race changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:gender changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:capi

Synthesizer (SDV): Sampling GaussianCopula # 39073 rows (same as Splitter data) in 0.467 sec.
Generating report ...

(1/2) Evaluating Data Validity: |██████████| 15/15 [00:00<00:00, 1096.80it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 879.49it/s]|
Data Structure Score: 100.0%



INFO:PETsARD.Evaluator:Completed Evaluator execution (elapsed: 0:00:00)
INFO:PETsARD.Executor:Executing Reporter with output
INFO:PETsARD.Reporter:Starting Reporter execution


Overall Score (Average): 100.0%

Now is result_Loader[data]_Splitter[demo_[1-1]]_Preprocessor[demo]_Synthesizer[demo] save to csv...


INFO:PETsARD.Reporter:Completed Reporter execution (elapsed: 0:00:00)
INFO:PETsARD.Executor:Executing Reporter with save_report_global
INFO:PETsARD.Reporter:Starting Reporter execution
INFO:PETsARD.Reporter:Completed Reporter execution (elapsed: 0:00:00)
INFO:PETsARD.Executor:Executing Evaluator with demo-quality
INFO:PETsARD.Evaluator:Starting Evaluator execution
INFO:root:workclass changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:education changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:marital-status changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:occupation changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:relationship changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:race changes data dtype from category[object] to category[object] for metadata alignment.

Now is petsard[Report]_[global] save to csv...
Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 15/15 [00:00<00:00, 107.26it/s]|
Column Shapes Score: 95.28%

(2/2) Evaluating Column Pair Trends: |          | 0/105 [00:00<?, ?it/s]|

  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).

(2/2) Evaluating Column Pair Trends: |█████▏    | 55/105 [00:00<00:00, 276.61it/s]|

  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).

(2/2) Evaluating Column Pair Trends: |██████████| 105/105 [00:00<00:00, 288.73it/s]|
Column Pair Trends Score: 61.58%



  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size(

Overall Score (Average): 78.43%

Now is result_Loader[data]_Splitter[demo_[1-1]]_Preprocessor[demo]_Synthesizer[demo] save to csv...


INFO:PETsARD.Reporter:Completed Reporter execution (elapsed: 0:00:00)
INFO:PETsARD.Executor:Executing Reporter with save_report_global
INFO:PETsARD.Reporter:Starting Reporter execution
INFO:PETsARD.Reporter:Completed Reporter execution (elapsed: 0:00:00)
INFO:PETsARD.Executor:Executing Evaluator with demo-singlingout
INFO:PETsARD.Evaluator:Starting Evaluator execution
INFO:root:workclass changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:education changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:marital-status changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:occupation changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:relationship changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:race changes data dtype from category[object] to category[object] for metadata alignm

Now is petsard[Report]_[global] save to csv...


  self._sanity_check()
INFO:PETsARD.Evaluator:Completed Evaluator execution (elapsed: 0:00:03)
INFO:PETsARD.Executor:Executing Reporter with output
INFO:PETsARD.Reporter:Starting Reporter execution


Now is result_Loader[data]_Splitter[demo_[1-1]]_Preprocessor[demo]_Synthesizer[demo] save to csv...


INFO:PETsARD.Reporter:Completed Reporter execution (elapsed: 0:00:00)
INFO:PETsARD.Executor:Executing Reporter with save_report_global
INFO:PETsARD.Reporter:Starting Reporter execution
INFO:PETsARD.Reporter:Completed Reporter execution (elapsed: 0:00:00)
INFO:PETsARD.Executor:Executing Evaluator with demo-linkability
INFO:PETsARD.Evaluator:Starting Evaluator execution
INFO:root:workclass changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:education changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:marital-status changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:occupation changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:relationship changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:race changes data dtype from category[object] to category[object] for metadata alignm

Now is petsard[Report]_[global] save to csv...


  self._sanity_check()
INFO:PETsARD.Evaluator:Completed Evaluator execution (elapsed: 0:00:02)
INFO:PETsARD.Executor:Executing Reporter with output
INFO:PETsARD.Reporter:Starting Reporter execution


Now is result_Loader[data]_Splitter[demo_[1-1]]_Preprocessor[demo]_Synthesizer[demo] save to csv...


INFO:PETsARD.Reporter:Completed Reporter execution (elapsed: 0:00:00)
INFO:PETsARD.Executor:Executing Reporter with save_report_global
INFO:PETsARD.Reporter:Starting Reporter execution
INFO:PETsARD.Reporter:Completed Reporter execution (elapsed: 0:00:00)
INFO:PETsARD.Executor:Executing Evaluator with demo-inference
INFO:PETsARD.Evaluator:Starting Evaluator execution
INFO:root:workclass changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:education changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:marital-status changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:occupation changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:relationship changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:race changes data dtype from category[object] to category[object] for metadata alignmen

Now is petsard[Report]_[global] save to csv...


INFO:PETsARD.Evaluator:Completed Evaluator execution (elapsed: 0:00:02)
INFO:PETsARD.Executor:Executing Reporter with output
INFO:PETsARD.Reporter:Starting Reporter execution


Now is result_Loader[data]_Splitter[demo_[1-1]]_Preprocessor[demo]_Synthesizer[demo] save to csv...


INFO:PETsARD.Reporter:Completed Reporter execution (elapsed: 0:00:00)
INFO:PETsARD.Executor:Executing Reporter with save_report_global
INFO:PETsARD.Reporter:Starting Reporter execution
INFO:PETsARD.Reporter:Completed Reporter execution (elapsed: 0:00:00)
INFO:PETsARD.Executor:Executing Evaluator with demo-classification
INFO:PETsARD.Evaluator:Starting Evaluator execution
INFO:root:workclass changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:education changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:marital-status changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:occupation changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:relationship changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:race changes data dtype from category[object] to category[object] for metadata ali

Now is petsard[Report]_[global] save to csv...


INFO:PETsARD.Evaluator:Completed Evaluator execution (elapsed: 0:03:45)
INFO:PETsARD.Executor:Executing Reporter with output
INFO:PETsARD.Reporter:Starting Reporter execution


Now is result_Loader[data]_Splitter[demo_[1-1]]_Preprocessor[demo]_Synthesizer[demo] save to csv...


INFO:PETsARD.Reporter:Completed Reporter execution (elapsed: 0:00:00)
INFO:PETsARD.Executor:Executing Reporter with save_report_global
INFO:PETsARD.Reporter:Starting Reporter execution
INFO:PETsARD.Reporter:Completed Reporter execution (elapsed: 0:00:00)
INFO:PETsARD.Executor:Completed PETsARD execution workflow (elapsed: 0:03:57)


Now is petsard[Report]_[global] save to csv...


In [27]:
# get_result() is indexed twice: first by the full experiment name, then by the report
# granularity. The recorded output under this key holds one row with columns from every
# evaluator in the config (demo-diagnostic ... demo-classification).
case2_experiment_key = 'Loader[data]_Splitter[demo_[1-1]]_Preprocessor[demo]_Synthesizer[demo]_Postprocessor[demo]_Evaluator[demo-classification]_Reporter[save_report_global]'
case2_report_key = '[global]'

exec_case2.get_result()[case2_experiment_key][case2_report_key]

Unnamed: 0,full_expt_name,Loader,Splitter,Preprocessor,Synthesizer,Postprocessor,Evaluator,demo-diagnostic_Score,demo-diagnostic_Data Validity,demo-diagnostic_Data Structure,...,demo-inference_attack_rate_err,demo-inference_baseline_rate,demo-inference_baseline_rate_err,demo-inference_control_rate,demo-inference_control_rate_err,demo-classification_ori_mean,demo-classification_ori_std,demo-classification_syn_mean,demo-classification_syn_std,demo-classification_diff
0,Loader[data]_Splitter[demo_[1-1]]_Preprocessor...,data,demo_[1-1],demo,demo,demo,[global],1.0,1.0,1.0,...,0.020583,0.652707,0.020844,0.676162,0.020486,0.859684,0.005887,0.776205,0.009994,-0.083479


## Case 3: External Synthesis with Default Evaluation
情境三：外部合成與預設評測

In [10]:
# Case 3: name the config file, then resolve it to a path for the current environment.
# In the recorded run this call also echoed the YAML content shown below the cell.
yaml_file_case3: str = 'external-synthesis-default-evaluation.yaml'
yaml_path_case3: str = get_yaml_path(
    yaml_file=yaml_file_case3,
    branch=branch,
    is_colab=is_colab,
)

Configuration content:
---
Loader:
  data:
    filepath: 'benchmark/adult-income.csv'
Synthesizer:
  custom:
    method: 'custom_data'
    filepath: 'benchmark/adult-income_syn.csv'
Evaluator:
  demo:
    method: 'default'
Reporter:
  save_report_global:
    method: 'save_report'
    output: 'evaluation'
    eval: 'demo'
    granularity: 'global'
...


### Execution and Result
執行與結果

In [None]:
# Build an Executor from the Case 3 YAML config and run the workflow it describes.
# The following cell reads the evaluation report back from exec_case3, so this cell
# must be executed first.
exec_case3 = Executor(config=yaml_path_case3)
exec_case3.run()

In [12]:
# get_result() is indexed twice: first by the full experiment name, then by the
# '<evaluator>_[<granularity>]' name of the saved report.
case3_experiment_key = 'Loader[data]_Synthesizer[custom]_Evaluator[demo]_Reporter[save_report_global]'
case3_report_key = 'demo_[global]'

exec_case3.get_result()[case3_experiment_key][case3_report_key]

Unnamed: 0,full_expt_name,Loader,Synthesizer,Evaluator,demo_Score,demo_Column Shapes,demo_Column Pair Trends
result,Loader[data]_Synthesizer[custom]_Evaluatordemo...,data,custom,demo_[global],0.784236,0.952582,0.61589
