# Environment Setup
環境設定

In [1]:
import os
import requests
from pathlib import Path


# Branch of the nics-tw/petsard repository that the demo helper is fetched
# from; the default is "main".
branch = "main"

# The notebook is treated as running in Google Colab when the COLAB_GPU
# environment variable is present.
is_colab = "COLAB_GPU" in os.environ

if is_colab:
    # Download demo/utils.py from GitHub so later cells can `from utils import ...`.
    utils_url = (
        f"https://raw.githubusercontent.com/nics-tw/petsard/{branch}/demo/utils.py"
    )
    # Bound the request: without a timeout a stalled connection would hang
    # the notebook indefinitely.
    response = requests.get(utils_url, timeout=30)

    # Fail loudly on anything other than a successful download.
    if response.status_code != 200:
        raise RuntimeError(
            f"Failed to download utils.py. Status code: {response.status_code}"
        )

    # Save the helper with an explicit encoding so non-ASCII characters are
    # written identically regardless of the runtime's default locale.
    Path("utils.py").write_text(response.text, encoding="utf-8")

    # Create an empty __init__.py next to the downloaded helper.
    Path("__init__.py").touch()

In [None]:
# Now import and run the setup
from utils import (
    get_yaml_path,
    setup_environment,
)


# Prepare the demo environment using the helper imported from utils.py.
# `is_colab` and `branch` come from the first cell; "adult-income" is the
# benchmark dataset requested for this demo.
# NOTE(review): what the helper actually installs or downloads is defined in
# utils.py, which is not visible here — confirm there before relying on it.
setup_environment(
    is_colab,
    branch,
    benchmark_data=[
        "adult-income",
    ],
)

In [3]:
from petsard import Executor

# YAML Configuration for PETsARD
PETsARD 的 YAML 設定

## Default Synthesis and Default Evaluation
預設合成與預設評測

In [4]:
# File name of the YAML configuration demonstrated in this section.
yaml_file_case: str = "default-synthesis-default-evaluation.yaml"

# Resolve the path to that YAML file through the utils.py helper, passing the
# Colab flag and branch chosen in the first cell. The cell output shows that
# the helper also prints the configuration content.
# NOTE(review): how the path differs between Colab and local runs is decided
# inside utils.py — not visible here.
yaml_path_case: str = get_yaml_path(
    is_colab=is_colab,
    yaml_file=yaml_file_case,
    branch=branch,
)

Configuration content:
---
Loader:
  data:
    filepath: 'benchmark/adult-income.csv'
Splitter:
  demo:
    num_samples: 1
    train_split_ratio: 0.8
Preprocessor:
  demo:
    method: 'default'
Synthesizer:
  demo:
    method: 'default'
Postprocessor:
  demo:
    method: 'default'
Evaluator:
  demo-diagnostic:
    method: 'sdmetrics-diagnosticreport'
  demo-quality:
    method: 'sdmetrics-qualityreport'
  demo-singlingout:
    method: 'anonymeter-singlingout'
  demo-linkability:
    method: 'anonymeter-linkability'
    aux_cols:
      -
        - 'age'
        - 'marital-status'
        - 'relationship'
        - 'gender'
      -
        - 'workclass'
        - 'educational-num'
        - 'occupation'
        - 'income'
  demo-inference:
    method: 'anonymeter-inference'
    secret: 'income'
  demo-classification:
    method: 'mlutility-classification'
    target: 'income'
Reporter:
  output:
    method: 'save_data'
    source: 'Synthesizer'
  save_report_global:
    method: 'save_rep

### Execution and Results
執行與結果

In [5]:
# Build a PETsARD Executor from the YAML path resolved above, then run it.
# The captured output below shows evaluation reports being generated and
# results being saved to CSV during the run.
exec_case = Executor(config=yaml_path_case)
exec_case.run()

Generating report ...

(1/2) Evaluating Data Validity: |██████████| 15/15 [00:00<00:00, 781.62it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 601.33it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%

Now is petsard_Loader[data]_Splitter[demo_[1-1]]_Preprocessor[demo]_Synthesizer[demo] save to csv...
Now is petsard[Report]_[global] save to csv...
Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 15/15 [00:00<00:00, 91.08it/s]|
Column Shapes Score: 95.2%

(2/2) Evaluating Column Pair Trends: |██████████| 105/105 [00:00<00:00, 264.16it/s]|
Column Pair Trends Score: 61.61%

Overall Score (Average): 78.4%

Now is petsard_Loader[data]_Splitter[demo_[1-1]]_Preprocessor[demo]_Synthesizer[demo] save to csv...
Now is petsard[Report]_[global] save to csv...


Found 1620 failed queries out of 2000. Check DEBUG messages for more details.
Reached maximum number of attempts 500000 when generating singling out queries. Returning 244 instead of the requested 2000.To avoid this, increase the number of attempts or set it to ``None`` to disable The limitation entirely.
Attack `multivariate` could generate only 244 singling out queries out of the requested 2000. This can probably lead to an underestimate of the singling out risk.


Now is petsard_Loader[data]_Splitter[demo_[1-1]]_Preprocessor[demo]_Synthesizer[demo] save to csv...
Now is petsard[Report]_[global] save to csv...


  self._sanity_check()


Now is petsard_Loader[data]_Splitter[demo_[1-1]]_Preprocessor[demo]_Synthesizer[demo] save to csv...
Now is petsard[Report]_[global] save to csv...
Now is petsard_Loader[data]_Splitter[demo_[1-1]]_Preprocessor[demo]_Synthesizer[demo] save to csv...
Now is petsard[Report]_[global] save to csv...
Now is petsard_Loader[data]_Splitter[demo_[1-1]]_Preprocessor[demo]_Synthesizer[demo] save to csv...
Now is petsard[Report]_[global] save to csv...


In [6]:
# Display the report held by the executor's status object. The output below is
# a dict keyed by experiment name whose values are tabular data.
exec_case.status.get_report()

{'Loader[data]_Splitter[demo_[1-1]]_Preprocessor[demo]_Synthesizer[demo]':             age  workclass    fnlwgt  education  educational-num  \
 0     -0.617659   0.814321 -1.174308   0.783699        -0.218824   
 1      0.252768   0.770765 -0.592789   0.365872        -0.957753   
 2      0.452182   0.317650 -1.325093   0.387372         0.450148   
 3     -0.692329   0.241160 -0.166026   0.432247        -0.766983   
 4      1.228550   0.941172  0.657746   0.438060         1.106206   
 ...         ...        ...       ...        ...              ...   
 22832 -0.078614   0.176434  0.406921   0.351331        -0.795212   
 22833  0.785289   0.241195 -1.410539   0.232494        -0.120027   
 22834  1.986224   0.133010  0.355151   0.289892        -0.200398   
 22835 -0.104737   0.494938  0.801831   0.726470        -1.262237   
 22836 -0.318656   0.685318  0.086907   0.701775         0.594130   
 
        marital-status  occupation  relationship      race    gender  \
 0            0.401799  

In [None]:
# Display the result of the Evaluator operator currently stored in the
# executor's status. The output below has a 'global' summary table and a
# 'details' dict of per-model scores for 'ori' and 'syn'.
# NOTE(review): this indexes `status.status` directly, which looks like
# internal state — confirm whether a public accessor should be used instead.
exec_case.status.status["Evaluator"]["operator"].get_result()

{'global':    ori_mean   ori_std  syn_mean   syn_std     diff
 0   0.85367  0.005518   0.77902  0.011626 -0.07465,
 'details': {'ori': {'logistic_regression': 0.8509571092230526,
   'svc': 0.8485003582761798,
   'random_forest': 0.8522878493192753,
   'gradient_boosting': 0.8629337700890573},
  'syn': {'logistic_regression': 0.7970109530146381,
   'svc': 0.7645613675913604,
   'random_forest': 0.7763332992117924,
   'gradient_boosting': 0.778175862421947}}}

In [8]:
# Look up the global-granularity report table for the classification
# evaluator. The experiment key is the chain of module names, written here as
# adjacent string literals (joined by Python into the one original key) so
# each stage is readable.
exec_case.get_result()[
    "Loader[data]_Splitter[demo_[1-1]]_Preprocessor[demo]_Synthesizer[demo]"
    "_Postprocessor[demo]_Evaluator[demo-classification]"
    "_Reporter[save_report_global]"
]["[global]"]

Unnamed: 0,full_expt_name,Loader,Splitter,Preprocessor,Synthesizer,Postprocessor,Evaluator,Score,Data Validity,Data Structure,...,attack_rate_err,baseline_rate,baseline_rate_err,control_rate,control_rate_err,ori_mean,ori_std,syn_mean,syn_std,diff
0,Loader[data]_Splitter[demo_[1-1]]_Preprocessor...,data,demo_[1-1],demo,demo,demo,[global],0.784044,1.0,1.0,...,0.019841,0.646718,0.020926,0.707102,0.019922,0.85367,0.005518,0.77902,0.011626,-0.07465
