# Environment setting
環境設定

In [1]:
import os
import requests
from pathlib import Path


# Branch of the nics-tw/petsard repository the demo helpers are fetched from;
# default is main.
branch = 'main'

# Treat the session as Google Colab when the COLAB_GPU environment variable
# is present.
is_colab = 'COLAB_GPU' in os.environ

if is_colab:
    # On Colab, download demo/utils.py from GitHub so that the following cell
    # can `from utils import ...`.
    utils_url = f"https://raw.githubusercontent.com/nics-tw/petsard/{branch}/demo/utils.py"
    # requests has no default timeout; without one a stalled connection would
    # block this cell forever.
    response = requests.get(utils_url, timeout=30)

    if response.status_code == 200:
        # Write with an explicit encoding so the saved module does not depend
        # on the platform's default text encoding.
        with open('utils.py', 'w', encoding='utf-8') as f:
            f.write(response.text)

        # Create an empty __init__.py next to the downloaded module
        Path('__init__.py').touch()
    else:
        raise RuntimeError(f"Failed to download utils.py. Status code: {response.status_code}")

In [2]:
# Import the demo helpers from utils.py (imported here rather than in the
# first cell because, on Colab, utils.py only exists after that cell runs).
from utils import get_yaml_path, setup_environment

# Prepare the runtime for this demo; 'adult-income' is the benchmark dataset
# requested. NOTE(review): per the recorded output this performs an editable
# pip install of petsard — confirm the details in utils.setup_environment.
setup_environment(is_colab, branch, benchmark_data=['adult-income'])

Obtaining file:///<local-path>/petsard
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Checking if build backend supports build_editable: started
  Checking if build backend supports build_editable: finished with status 'done'
  Getting requirements to build editable: started
  Getting requirements to build editable: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
  Preparing editable metadata (pyproject.toml): started
  Preparing editable metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: petsard
  Building editable for petsard (pyproject.toml): started
  Building editable for petsard (pyproject.toml): finished with status 'done'
  Created wheel for petsard: filename=petsard-1.0.0-py3-none-any.whl size=6548 

In [3]:
from petsard import Executor

# YAML Configuration for PETsARD
PETsARD 的 YAML 設定

## Comparing Synthesizers
比較合成演算法

In [4]:
# File name of the YAML configuration used for the synthesizer comparison.
yaml_file_case: str = 'comparing-synthesizers.yaml'

# Look up the configuration for the current environment and branch.
# NOTE(review): the cell output shows get_yaml_path also prints the
# configuration content; the `str` annotation assumes it returns a string
# path — confirm against utils.get_yaml_path.
yaml_path_case: str = get_yaml_path(is_colab=is_colab, yaml_file=yaml_file_case, branch=branch)

Configuration content:
---
Loader:
  data:
    filepath: 'benchmark/adult-income.csv'
Preprocessor:
  demo:
    method: 'default'
Synthesizer:
  gaussian-copula:
    method: 'sdv-single_table-gaussiancopula'
  ctgan:
    method: 'sdv-single_table-ctgan'
  tvae:
    method: 'sdv-single_table-tvae'
Postprocessor:
  demo:
    method: 'default'
Evaluator:
  demo-quality:
    method: 'sdmetrics-qualityreport'
Reporter:
  output:
    method: 'save_data'
    source: 'Synthesizer'
  save_report_global:
    method: 'save_report'
    granularity: 'global'
...


### Execution and Result
執行與結果

In [5]:
# Build a PETsARD Executor from the YAML configuration path and run the whole
# workflow it describes.
# Expect a long-running cell: in the recorded output CTGAN fitting took about
# 254 s, TVAE fitting about 102 s, and the full workflow about 6 minutes.
exec_case = Executor(config=yaml_path_case)
exec_case.run()



Synthesizer (SDV): Fitting GaussianCopula.
Synthesizer (SDV): Fitting GaussianCopula spent 1.9517 sec.


INFO:root:age changes data dtype from float64 to int8 for metadata alignment.
INFO:root:workclass changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:fnlwgt changes data dtype from float64 to int32 for metadata alignment.
INFO:root:education changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:educational-num changes data dtype from float64 to int8 for metadata alignment.
INFO:root:marital-status changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:occupation changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:relationship changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:race changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:gender changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:capi

Synthesizer (SDV): Sampling GaussianCopula # 48842 rows (same as Loader data) in 0.5241 sec.
Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 15/15 [00:00<00:00, 93.19it/s]|
Column Shapes Score: 95.4%

(2/2) Evaluating Column Pair Trends: |          | 0/105 [00:00<?, ?it/s]|

  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(


(2/2) Evaluating Column Pair Trends: |█████▏    | 54/105 [00:00<00:00, 267.76it/s]|

  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).

(2/2) Evaluating Column Pair Trends: |██████████| 105/105 [00:00<00:00, 279.13it/s]|
Column Pair Trends Score: 60.69%



  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).

Overall Score (Average): 78.05%

Now is petsard_Loader[data]_Preprocessor[demo]_Synthesizer[gaussian-copula] save to csv...


INFO:PETsARD.Reporter:Completed Reporter execution (elapsed: 0:00:00)
INFO:PETsARD.Executor:Executing Reporter with save_report_global
INFO:PETsARD.Reporter:Starting Reporter execution
INFO:PETsARD.Reporter:Completed Reporter execution (elapsed: 0:00:00)
INFO:PETsARD.Executor:Executing Synthesizer with ctgan
INFO:PETsARD.Synthesizer:Starting Synthesizer execution
INFO:SingleTableSynthesizer:{'EVENT': 'Instance', 'TIMESTAMP': datetime.datetime(2025, 2, 2, 22, 31, 26, 284633), 'SYNTHESIZER CLASS NAME': 'CTGANSynthesizer', 'SYNTHESIZER ID': 'CTGANSynthesizer_1.17.4_669aaaabc1b94ad79d089760b1570564'}
INFO:SingleTableSynthesizer:{'EVENT': 'Fit', 'TIMESTAMP': datetime.datetime(2025, 2, 2, 22, 31, 26, 285001), 'SYNTHESIZER CLASS NAME': 'CTGANSynthesizer', 'SYNTHESIZER ID': 'CTGANSynthesizer_1.17.4_669aaaabc1b94ad79d089760b1570564', 'TOTAL NUMBER OF TABLES': 1, 'TOTAL NUMBER OF ROWS': 28558, 'TOTAL NUMBER OF COLUMNS': 15}
INFO:sdv.data_processing.data_processor:Fitting table  metadata
INFO:sdv

Now is petsard[Report]_[global] save to csv...
Synthesizer (SDV): Fitting CTGAN.


INFO:rdt.transformers.utils:No rounding scheme detected for column 'education'. Data will not be rounded.
INFO:rdt.transformers.utils:No rounding scheme detected for column 'educational-num'. Data will not be rounded.
INFO:rdt.transformers.utils:No rounding scheme detected for column 'marital-status'. Data will not be rounded.
INFO:rdt.transformers.utils:No rounding scheme detected for column 'occupation'. Data will not be rounded.
INFO:rdt.transformers.utils:No rounding scheme detected for column 'relationship'. Data will not be rounded.
INFO:rdt.transformers.utils:No rounding scheme detected for column 'race'. Data will not be rounded.
INFO:rdt.transformers.utils:No rounding scheme detected for column 'gender'. Data will not be rounded.
INFO:rdt.transformers.utils:No rounding scheme detected for column 'capital-gain'. Data will not be rounded.
INFO:rdt.transformers.utils:No rounding scheme detected for column 'capital-loss'. Data will not be rounded.
INFO:rdt.transformers.utils:No ro

Synthesizer (SDV): Fitting CTGAN spent 253.9789 sec.


INFO:SingleTableSynthesizer:{'EVENT': 'Sample', 'TIMESTAMP': datetime.datetime(2025, 2, 2, 22, 35, 40, 263985), 'SYNTHESIZER CLASS NAME': 'CTGANSynthesizer', 'SYNTHESIZER ID': 'CTGANSynthesizer_1.17.4_669aaaabc1b94ad79d089760b1570564', 'TOTAL NUMBER OF TABLES': 1, 'TOTAL NUMBER OF ROWS': 48842, 'TOTAL NUMBER OF COLUMNS': 15}
INFO:PETsARD.Synthesizer:Completed Synthesizer execution (elapsed: 0:04:14)
INFO:PETsARD.Executor:Executing Postprocessor with demo
INFO:PETsARD.Postprocessor:Starting Postprocessor execution
INFO:PETsARD.Processor:MediatorEncoder is created.
INFO:PETsARD.Processor:MediatorScaler is created.
INFO:PETsARD.Processor:MediatorScaler transformation done.
INFO:PETsARD.Processor:str inverse transformation done.
INFO:PETsARD.Processor:MediatorEncoder transformation done.
INFO:PETsARD.Processor:str inverse transformation done.
INFO:PETsARD.Processor:str inverse transformation done.
INFO:root:age changes data dtype from float64 to int8 for metadata alignment.
INFO:root:workc

Synthesizer (SDV): Sampling CTGAN # 48842 rows (same as Loader data) in 0.3445 sec.
Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 15/15 [00:00<00:00, 89.08it/s]|
Column Shapes Score: 91.99%

(2/2) Evaluating Column Pair Trends: |          | 0/105 [00:00<?, ?it/s]|

  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(


(2/2) Evaluating Column Pair Trends: |████▊     | 50/105 [00:00<00:00, 253.19it/s]|

  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).

(2/2) Evaluating Column Pair Trends: |██████████| 105/105 [00:00<00:00, 268.59it/s]|
Column Pair Trends Score: 56.69%



  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size(

Overall Score (Average): 74.34%

Now is petsard_Loader[data]_Preprocessor[demo]_Synthesizer[ctgan] save to csv...


INFO:PETsARD.Reporter:Completed Reporter execution (elapsed: 0:00:00)
INFO:PETsARD.Executor:Executing Reporter with save_report_global
INFO:PETsARD.Reporter:Starting Reporter execution
INFO:PETsARD.Reporter:Completed Reporter execution (elapsed: 0:00:00)
INFO:PETsARD.Executor:Executing Synthesizer with tvae
INFO:PETsARD.Synthesizer:Starting Synthesizer execution
INFO:SingleTableSynthesizer:{'EVENT': 'Instance', 'TIMESTAMP': datetime.datetime(2025, 2, 2, 22, 35, 41, 672468), 'SYNTHESIZER CLASS NAME': 'TVAESynthesizer', 'SYNTHESIZER ID': 'TVAESynthesizer_1.17.4_a060cd1b17044fb2a7e85ddee1f77234'}
INFO:SingleTableSynthesizer:{'EVENT': 'Fit', 'TIMESTAMP': datetime.datetime(2025, 2, 2, 22, 35, 41, 672761), 'SYNTHESIZER CLASS NAME': 'TVAESynthesizer', 'SYNTHESIZER ID': 'TVAESynthesizer_1.17.4_a060cd1b17044fb2a7e85ddee1f77234', 'TOTAL NUMBER OF TABLES': 1, 'TOTAL NUMBER OF ROWS': 28558, 'TOTAL NUMBER OF COLUMNS': 15}
INFO:sdv.data_processing.data_processor:Fitting table  metadata
INFO:sdv.data

Now is petsard[Report]_[global] save to csv...
Synthesizer (SDV): Fitting TVAE.


INFO:rdt.transformers.utils:No rounding scheme detected for column 'occupation'. Data will not be rounded.
INFO:rdt.transformers.utils:No rounding scheme detected for column 'relationship'. Data will not be rounded.
INFO:rdt.transformers.utils:No rounding scheme detected for column 'race'. Data will not be rounded.
INFO:rdt.transformers.utils:No rounding scheme detected for column 'gender'. Data will not be rounded.
INFO:rdt.transformers.utils:No rounding scheme detected for column 'capital-gain'. Data will not be rounded.
INFO:rdt.transformers.utils:No rounding scheme detected for column 'capital-loss'. Data will not be rounded.
INFO:rdt.transformers.utils:No rounding scheme detected for column 'hours-per-week'. Data will not be rounded.
INFO:rdt.transformers.utils:No rounding scheme detected for column 'native-country'. Data will not be rounded.
INFO:rdt.transformers.utils:No rounding scheme detected for column 'income'. Data will not be rounded.
INFO:SingleTableSynthesizer:{'EVENT':

Synthesizer (SDV): Fitting TVAE spent 101.8508 sec.
Synthesizer (SDV): Sampling TVAE # 48842 rows (same as Loader data) in 0.1932 sec.


INFO:PETsARD.Processor:str inverse transformation done.
INFO:PETsARD.Processor:str inverse transformation done.
INFO:root:age changes data dtype from float64 to int8 for metadata alignment.
INFO:root:workclass changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:fnlwgt changes data dtype from float64 to int32 for metadata alignment.
INFO:root:education changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:educational-num changes data dtype from float64 to int8 for metadata alignment.
INFO:root:marital-status changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:occupation changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:relationship changes data dtype from category[object] to category[object] for metadata alignment.
INFO:root:race changes data dtype from category[object] to category[object] for metadata alignment.
INFO

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 15/15 [00:00<00:00, 122.54it/s]|
Column Shapes Score: 81.97%

(2/2) Evaluating Column Pair Trends: |          | 0/105 [00:00<?, ?it/s]|

  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).

(2/2) Evaluating Column Pair Trends: |████▊     | 51/105 [00:00<00:00, 258.29it/s]|

  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size(

(2/2) Evaluating Column Pair Trends: |██████████| 105/105 [00:00<00:00, 263.74it/s]|
Column Pair Trends Score: 45.91%



  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
  contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
  contingency_synthetic = synthetic.groupby(list(columns), dropna=False).

Overall Score (Average): 63.94%



INFO:PETsARD.Reporter:Starting Reporter execution


Now is petsard_Loader[data]_Preprocessor[demo]_Synthesizer[tvae] save to csv...


INFO:PETsARD.Reporter:Completed Reporter execution (elapsed: 0:00:00)
INFO:PETsARD.Executor:Executing Reporter with save_report_global
INFO:PETsARD.Reporter:Starting Reporter execution
INFO:PETsARD.Reporter:Completed Reporter execution (elapsed: 0:00:00)
INFO:PETsARD.Executor:Completed PETsARD execution workflow (elapsed: 0:06:02)


Now is petsard[Report]_[global] save to csv...


In [9]:
# Full experiment name of the last Reporter step (the tvae run). Per the
# recorded output, its '[global]' report holds one row per synthesizer
# (ctgan, gaussian-copula, tvae), so this single lookup shows the comparison.
report_key = 'Loader[data]_Preprocessor[demo]_Synthesizer[tvae]_Postprocessor[demo]_Evaluator[demo-quality]_Reporter[save_report_global]'

# Leave the lookup as the cell's last expression so the notebook displays it.
exec_case.get_result()[report_key]['[global]']

Unnamed: 0,full_expt_name,Loader,Preprocessor,Synthesizer,Postprocessor,Evaluator,demo-quality_Score,demo-quality_Column Shapes,demo-quality_Column Pair Trends
0,Loader[data]_Preprocessor[demo]_Synthesizer[ct...,data,demo,ctgan,demo,[global],0.743363,0.919852,0.566874
1,Loader[data]_Preprocessor[demo]_Synthesizer[ga...,data,demo,gaussian-copula,demo,[global],0.780469,0.954044,0.606894
2,Loader[data]_Preprocessor[demo]_Synthesizer[tv...,data,demo,tvae,demo,[global],0.639404,0.819683,0.459126
