# Environment setting
環境設定

In [1]:
import os
import requests
from pathlib import Path


# Repository branch the demo helper is fetched from; "main" is the default.
branch = "main"

# The runtime is treated as Google Colab when the COLAB_GPU environment
# variable is present.
is_colab = "COLAB_GPU" in os.environ

if is_colab:
    # Fetch the demo helper module for the selected branch from GitHub.
    utils_url = (
        f"https://raw.githubusercontent.com/nics-tw/petsard/{branch}/demo/utils.py"
    )
    # Without a timeout requests waits indefinitely on a stalled connection,
    # which would leave this cell hanging with no error.
    response = requests.get(utils_url, timeout=30)

    if response.status_code != 200:
        raise RuntimeError(
            f"Failed to download utils.py. Status code: {response.status_code}"
        )

    # Save the helper next to the notebook. The encoding is explicit so the
    # written file does not depend on the platform's default locale.
    with open("utils.py", "w", encoding="utf-8") as f:
        f.write(response.text)

    # Create an empty __init__.py alongside the downloaded module.
    Path("__init__.py").touch()

In [None]:
# Now import and run the setup
from utils import (
    get_yaml_path,
    setup_environment,
)


# Run the demo setup helper for this runtime and branch, requesting the
# "adult-income" benchmark dataset.
setup_environment(is_colab, branch, benchmark_data=["adult-income"])

In [3]:
from petsard import Executor

# YAML Configuration for PETsARD
PETsARD 的 YAML 設定

## Default Synthesis and Default Evaluation
預設合成與預設評測

In [4]:
# Name of the YAML configuration demonstrated in this section.
yaml_file_case: str = "default-synthesis-default-evaluation.yaml"

# Ask the demo helper for the location of that configuration file.
# NOTE(review): the helper also appears to print the file content (see the
# cell output) — confirm against demo/utils.py.
yaml_path_case: str = get_yaml_path(
    is_colab=is_colab, yaml_file=yaml_file_case, branch=branch
)

Configuration content:
---
Loader:
  data:
    filepath: 'benchmark/adult-income.csv'
Splitter:
  demo:
    num_samples: 1
    train_split_ratio: 0.8
Preprocessor:
  demo:
    method: 'default'
Synthesizer:
  demo:
    method: 'default'
Postprocessor:
  demo:
    method: 'default'
Evaluator:
  demo-diagnostic:
    method: 'sdmetrics-diagnosticreport'
  demo-quality:
    method: 'sdmetrics-qualityreport'
  demo-singlingout:
    method: 'anonymeter-singlingout'
  demo-linkability:
    method: 'anonymeter-linkability'
    aux_cols:
      -
        - 'age'
        - 'marital-status'
        - 'relationship'
        - 'gender'
      -
        - 'workclass'
        - 'educational-num'
        - 'occupation'
        - 'income'
  demo-inference:
    method: 'anonymeter-inference'
    secret: 'income'
  demo-classification:
    method: 'mlutility-classification'
    target: 'income'
Reporter:
  output:
    method: 'save_data'
    source: 'Synthesizer'
  save_report_global:
    method: 'save_rep

### Execution and Result
執行與結果

In [5]:
# Build an Executor from the YAML configuration path, then run it.
# The cell output shows the evaluation reports and CSV saves logged during the run.
exec_case = Executor(config=yaml_path_case)
exec_case.run()

Generating report ...

(1/2) Evaluating Data Validity: |██████████| 15/15 [00:00<00:00, 1143.19it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 404.54it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%

Now is petsard_Loader[data]_Splitter[demo_[1-1]]_Preprocessor[demo]_Synthesizer[demo] save to csv...
Now is petsard[Report]_[global] save to csv...
Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 15/15 [00:00<00:00, 133.81it/s]|
Column Shapes Score: 77.55%

(2/2) Evaluating Column Pair Trends: |██████████| 105/105 [00:00<00:00, 353.80it/s]|
Column Pair Trends Score: 66.7%

Overall Score (Average): 72.12%

Now is petsard_Loader[data]_Splitter[demo_[1-1]]_Preprocessor[demo]_Synthesizer[demo] save to csv...
Now is petsard[Report]_[global] save to csv...


Found 1631 failed queries out of 2000. Check DEBUG messages for more details.
Reached maximum number of attempts 500000 when generating singling out queries. Returning 151 instead of the requested 2000.To avoid this, increase the number of attempts or set it to ``None`` to disable The limitation entirely.
Attack `multivariate` could generate only 151 singling out queries out of the requested 2000. This can probably lead to an underestimate of the singling out risk.


Now is petsard_Loader[data]_Splitter[demo_[1-1]]_Preprocessor[demo]_Synthesizer[demo] save to csv...
Now is petsard[Report]_[global] save to csv...


  self._sanity_check()


Now is petsard_Loader[data]_Splitter[demo_[1-1]]_Preprocessor[demo]_Synthesizer[demo] save to csv...
Now is petsard[Report]_[global] save to csv...
Now is petsard_Loader[data]_Splitter[demo_[1-1]]_Preprocessor[demo]_Synthesizer[demo] save to csv...
Now is petsard[Report]_[global] save to csv...
Now is petsard_Loader[data]_Splitter[demo_[1-1]]_Preprocessor[demo]_Synthesizer[demo] save to csv...
Now is petsard[Report]_[global] save to csv...


In [6]:
# Display the report held by the executor's status object; the output shows
# a dict keyed by experiment name.
exec_case.status.get_report()

{'Loader[data]_Splitter[demo_[1-1]]_Preprocessor[demo]_Synthesizer[demo]':        age     workclass  fnlwgt     education  educational-num  \
 0       50  Never-worked  118142       5th-6th               12   
 1       29  Never-worked  235469          10th                9   
 2       27  Never-worked   75531  Some-college                9   
 3       30  Self-emp-inc  155643       7th-8th                8   
 4       37  Never-worked  197256       Masters               11   
 ...    ...           ...     ...           ...              ...   
 39068   31   Without-pay  295510  Some-college                8   
 39069   22       Private  474947  Some-college                7   
 39070   74  Never-worked  107084  Some-college               13   
 39071   46  Never-worked   58603   Prof-school               11   
 39072   34       Private   44354       5th-6th               10   
 
               marital-status         occupation   relationship  \
 0                  Separated     Prof-sp

In [7]:
# Display the result of the Evaluator operator stored in the executor status.
# NOTE(review): this indexes `status.status` directly — confirm whether a
# public accessor exists for the same data.
exec_case.status.status["Evaluator"]["operator"].get_result()

{'global':    ori_mean  ori_std  syn_mean  syn_std  diff
 0      0.86     0.01      0.78     0.02 -0.08,
 'details': {'ori': {'logistic_regression': 0.8516736615825571,
   'svc': 0.8450199611014434,
   'random_forest': 0.8568942573446617,
   'gradient_boosting': 0.8749104309550619},
  'syn': {'logistic_regression': 0.8136963865288156,
   'svc': 0.7632306274951377,
   'random_forest': 0.7671204831610196,
   'gradient_boosting': 0.7663015661787286}}}

In [8]:
# Full experiment name under which the global report is stored in the results.
global_report_key = "Loader[data]_Splitter[demo_[1-1]]_Preprocessor[demo]_Synthesizer[demo]_Postprocessor[demo]_Evaluator[demo-classification]_Reporter[save_report_global]"

# Display the "[global]" entry of that experiment's result.
exec_case.get_result()[global_report_key]["[global]"]

Unnamed: 0,full_expt_name,Loader,Splitter,Preprocessor,Synthesizer,Postprocessor,Evaluator,demo-diagnostic_Score,demo-diagnostic_Data Validity,demo-diagnostic_Data Structure,...,demo-inference_attack_rate_err,demo-inference_baseline_rate,demo-inference_baseline_rate_err,demo-inference_control_rate,demo-inference_control_rate_err,demo-classification_ori_mean,demo-classification_ori_std,demo-classification_syn_mean,demo-classification_syn_std,demo-classification_diff
0,Loader[data]_Splitter[demo_[1-1]]_Preprocessor...,data,demo_[1-1],demo,demo,demo,[global],1.0,,,...,0.02,0.65,0.02,0.71,0.02,0.86,0.01,0.78,0.02,-0.08
