In [1]:
import os
import pprint
import sys


# Make the in-repo PETsARD package importable: take the notebook's working
# directory and climb two levels (in the recorded run this resolves to the
# PETsARD checkout directory).
path_cwd = os.getcwd()
print(path_cwd)
path_petsard = os.path.dirname(os.path.dirname(path_cwd))
print(path_petsard)
# Guard the append so that re-running this cell does not keep stacking
# duplicate entries onto sys.path.
if path_petsard not in sys.path:
    sys.path.append(path_petsard)

# Shared pretty-printer for later cells; depth=2 collapses anything nested
# deeper than two levels to `{...}`.
pp = pprint.PrettyPrinter(depth=2)

d:\Dropbox\89_other_application\GitHub\PETsARD\demo\dev
d:\Dropbox\89_other_application\GitHub\PETsARD


In [2]:
from PETsARD import (
    Loader,
    Splitter,
    Processor,
    Synthesizer,
    Describer,
)


# Manual (non-Executor) walk through the pipeline:
# Loader -> Splitter -> Processor -> Synthesizer -> inverse transform.

# Columns the Loader is asked to treat as categorical.
category_columns = [
    'PUMA', 'SEX', 'MSP', 'HISP', 'RAC1P',
    'NOC', 'NPF', 'HOUSING_TYPE', 'OWN_RENT', 'INDP',
    'INDP_CAT', 'EDU', 'PINCP_DECILE', 'DVET', 'DREM',
    'DPHY', 'DEYE', 'DEAR', 'PWGTP', 'WGTP',
]

# 1) Load the benchmark dataset; the string 'N' is passed as the NA marker.
load = Loader(
    filepath='benchmark://nist-national2018',
    na_values='N',
    column_types={'category': category_columns},
)
load.load()
print(f"Raw data (Loader) # rn = {load.data.shape}")

# 2) Split with a fixed seed; the training part is read from
#    split.data[1]['train'].
split = Splitter(train_split_ratio=0.8, random_state=42)
split.split(data=load.data, metadata=load.metadata)
print(f"Split data (Splitter) # rn = {split.data[1]['train'].shape}")

# 3) Fit the processor on the training part and transform that same frame.
proc = Processor(metadata=split.metadata)
train_data = split.data[1]['train']
proc.fit(data=train_data)
preproc_data = proc.transform(data=train_data)
print(f"Preproc data (Processor) # rn = {preproc_data.shape}")

# 4) Synthesize from the pre-processed frame.
# NOTE(review): `proc._metadata` is a private attribute of Processor —
# confirm whether a public accessor should be used here instead.
syn = Synthesizer(method='smartnoise-aim', epsilon=0.3)
syn.create(data=preproc_data, metadata=proc._metadata)
syn.fit_sample()
print(f"Syn data (Synthesizer) by Preproc data w/ Preproc Metadata # rn = {syn.data_syn.shape}")

# 5) Map the synthetic frame back through the processor.
postproc_data = proc.inverse_transform(data=syn.data_syn)
print(f"Postproc data (Processor) # rn = {postproc_data.shape}")

Loader - Benchmarker: file benchmark\national2018.csv already exist and match SHA-256.
                      PETsARD will ignore download and use local data directly.
Raw data (Loader) # rn = (27111, 24)
Split data (Splitter) # rn = (21688, 24)
Preproc data (Processor) # rn = (100, 24)
Synthesizer (SmartNoise): Fitting aim.
24
Initial Sigma 380.0813430149839
Synthesizer (SmartNoise): Fitting aim spent 193.5992 sec.
Synthesizer (SmartNoise): Sampling aim # 100 rows (same as input data) in 0.3518 sec.
Syn data (Synthesizer) by Preproc data w/ Preproc Metadata # rn = (100, 24)
Postproc data (Processor) # rn = (100, 24)


AttributeError: 'Describer' object has no attribute 'create'

In [8]:
# Draw 5000 rows from `syn`, the Synthesizer that was created and fitted in
# the manual pipeline cell; this cell cannot run on a fresh kernel without it.
syn.sample(sample_num_rows=5000)

Synthesizer (SmartNoise): Sampling aim # 5000 rows (same as manual input) in 0.108 sec.


In [17]:
import yaml

from PETsARD import Executor


# Executor configuration: load the nist-national2018 benchmark, split with
# train_split_ratio 0.8, assign `encoder_label` to every categorical column,
# synthesize with the `default` method and post-process with `default`.
# NOTE(review): `sample_num_rows: 5000` is set under Synthesizer, yet the
# recorded run reports sampling 21688 rows ("same as Splitter data") —
# confirm where this option is expected to be declared.
yaml_text: str = """---
Loader:
    national2018:
        filepath: 'benchmark://nist-national2018'
        na_values:
            'N'
        column_types:
            category:
                - PUMA
                - SEX
                - MSP
                - HISP
                - RAC1P
                - NOC
                - NPF
                - HOUSING_TYPE
                - OWN_RENT
                - INDP
                - INDP_CAT
                - EDU
                - PINCP_DECILE
                - DVET
                - DREM
                - DPHY
                - DEYE
                - DEAR
                - PWGTP
                - WGTP
Splitter:
    p0.8:
        train_split_ratio: 0.8
        random_state: 123
Preprocessor:
    encoder_label:
        encoder:
            PUMA: encoder_label
            SEX: encoder_label
            MSP: encoder_label
            HISP: encoder_label
            RAC1P: encoder_label
            NOC: encoder_label
            NPF: encoder_label
            HOUSING_TYPE: encoder_label
            OWN_RENT: encoder_label
            INDP: encoder_label
            INDP_CAT: encoder_label
            EDU: encoder_label
            PINCP_DECILE: encoder_label
            DVET: encoder_label
            DREM: encoder_label
            DPHY: encoder_label
            DEYE: encoder_label
            DEAR: encoder_label
            PWGTP: encoder_label
            WGTP: encoder_label
        sequence:
            - 'missing'
            - 'outlier'
            - 'encoder'
            - 'scaler'
Synthesizer:
    default:
        method: default
        sample_num_rows: 5000

Postprocessor:
    default:
        method: 'default'
..."""

# Parse the text locally only to display its structure (`pp` truncates
# anything nested deeper than two levels).
cfg = yaml.safe_load(yaml_text)
pp.pprint(cfg)

# Executor is given a config file path, so the YAML text is written to a
# temporary file first.
yaml_file: str = 'temp_eval.yaml'
with open(yaml_file, 'w') as f:
    f.write(yaml_text)

# try/finally guarantees the temporary file is removed even if constructing
# or running the Executor raises.
# NOTE: `exec` shadows the Python builtin of the same name; the name is kept
# unchanged so anything else referring to it keeps working.
try:
    exec = Executor(config=yaml_file)
    exec.run()
finally:
    os.remove(yaml_file)

# Shape of the Synthesizer step's result as recorded in the Executor status.
print(exec.status.status['Synthesizer']['operator'].get_result().shape)

{'Loader': {'national2018': {...}},
 'Postprocessor': {'default': {...}},
 'Preprocessor': {'encoder_label': {...}},
 'Splitter': {'p0.8': {...}},
 'Synthesizer': {'default': {...}}}
Now is Loader with national2018...
Loader - Benchmarker: file benchmark\national2018.csv already exist and match SHA-256.
                      PETsARD will ignore download and use local data directly.
Now is Splitter with p0.8_[1-1]...
Now is Preprocessor with encoder_label...
Now is Synthesizer with default...
Synthesizer (SDV): Fitting GaussianCopula.
Synthesizer (SDV): Fitting GaussianCopula spent 1.6175 sec.
Synthesizer (SDV): Sampling GaussianCopula # 21688 rows (same as Splitter data) in 2.1073 sec.
Now is Postprocessor with default...
(21688, 24)


In [20]:
import yaml

from PETsARD import Executor


# Executor configuration: same Loader / Splitter / Preprocessor /
# Postprocessor settings as a default run, but the Synthesizer uses
# `smartnoise-aim` with epsilon 0.3.
# NOTE(review): the experiment is named `aim_5.0` while `epsilon` is 0.3 —
# confirm which value is intended.
# NOTE(review): `sample_num_rows: 5000` is set, yet the recorded run reports
# sampling 104 rows ("same as input data") — confirm where this option is
# expected to be declared.
yaml_text: str = """---
Loader:
    national2018:
        filepath: 'benchmark://nist-national2018'
        na_values:
            'N'
        column_types:
            category:
                - PUMA
                - SEX
                - MSP
                - HISP
                - RAC1P
                - NOC
                - NPF
                - HOUSING_TYPE
                - OWN_RENT
                - INDP
                - INDP_CAT
                - EDU
                - PINCP_DECILE
                - DVET
                - DREM
                - DPHY
                - DEYE
                - DEAR
                - PWGTP
                - WGTP
Splitter:
    p0.8:
        train_split_ratio: 0.8
        random_state: 123
Preprocessor:
    encoder_label:
        encoder:
            PUMA: encoder_label
            SEX: encoder_label
            MSP: encoder_label
            HISP: encoder_label
            RAC1P: encoder_label
            NOC: encoder_label
            NPF: encoder_label
            HOUSING_TYPE: encoder_label
            OWN_RENT: encoder_label
            INDP: encoder_label
            INDP_CAT: encoder_label
            EDU: encoder_label
            PINCP_DECILE: encoder_label
            DVET: encoder_label
            DREM: encoder_label
            DPHY: encoder_label
            DEYE: encoder_label
            DEAR: encoder_label
            PWGTP: encoder_label
            WGTP: encoder_label
        sequence:
            - 'missing'
            - 'outlier'
            - 'encoder'
            - 'scaler'
Synthesizer:
    aim_5.0:
        method: 'smartnoise-aim'
        epsilon: 0.3
        sample_num_rows: 5000

Postprocessor:
    default:
        method: 'default'
..."""

# Parse the text locally only to display its structure (`pp` truncates
# anything nested deeper than two levels).
cfg = yaml.safe_load(yaml_text)
pp.pprint(cfg)

# Executor is given a config file path, so the YAML text is written to a
# temporary file first.
yaml_file: str = 'temp_eval.yaml'
with open(yaml_file, 'w') as f:
    f.write(yaml_text)

# try/finally guarantees the temporary file is removed even if constructing
# or running the Executor raises (the recorded AIM fit alone took ~193 s, so
# an interrupted run is a realistic case).
# NOTE: `exec` shadows the Python builtin of the same name; the name is kept
# unchanged so anything else referring to it keeps working.
try:
    exec = Executor(config=yaml_file)
    exec.run()
finally:
    os.remove(yaml_file)

# Shape of the Synthesizer step's result as recorded in the Executor status.
print(exec.status.status['Synthesizer']['operator'].get_result().shape)

{'Loader': {'national2018': {...}},
 'Postprocessor': {'default': {...}},
 'Preprocessor': {'encoder_label': {...}},
 'Splitter': {'p0.8': {...}},
 'Synthesizer': {'aim_5.0': {...}}}
Now is Loader with national2018...
Loader - Benchmarker: file benchmark\national2018.csv already exist and match SHA-256.
                      PETsARD will ignore download and use local data directly.
Now is Splitter with p0.8_[1-1]...
Now is Preprocessor with encoder_label...
Now is Synthesizer with aim_5.0...
Synthesizer (SmartNoise): Fitting aim.
24
Initial Sigma 380.0813430149839
Synthesizer (SmartNoise): Fitting aim spent 193.235 sec.
Synthesizer (SmartNoise): Sampling aim # 104 rows (same as input data) in 0.0199 sec.
Now is Postprocessor with default...
(104, 24)


(21688, 24)
