In [1]:
import pandas as pd
import numpy as np

In [2]:
# NOTE(review): hard-coded absolute path on the author's machine; point this at
# your own PETsARD checkout (or drop it if the package is installed) before running.
%cd /Users/alex/PETsARD

/Users/alex/PETsARD


In [26]:
from PETsARD import (
    Loader,
    Processor,
    Synthesizer
)


# Load the 'adult' benchmark dataset, telling the loader to treat '?' as the
# NA marker in the three listed columns.
load = Loader(
    filepath='benchmark://adult',
    na_values=dict.fromkeys(
        ['workclass', 'occupation', 'native-country'],
        '?',
    ),
)
load.load()

Loader - Benchmarker: file benchmark/adult.csv already exist and match SHA-256.
                      PETsARD will ignore download and use local data directly.


In [27]:
# Shrink the dataset so the synthesizer runs quickly.
# NOTE(review): `.loc` slices by label and includes the end label, so with a
# default 0..N-1 index this keeps 1001 rows, not 1000 — TODO confirm intent.
load.data = load.data.loc[:1000, :]

In [28]:
import pprint


def issue332(
        load: Loader,
        synthesizing_method: str,
        scaler_inhibit: bool = False,
        epsilon: float = 10.0
):
    """Run a preprocess -> synthesize -> postprocess round trip and print each stage.

    The preprocessing sequence is 'missing', 'outlier', 'scaler',
    'discretizing'. A one-row preview of the data is printed after
    preprocessing, after synthesis, and after the inverse transform.

    Args:
        load: Loader whose ``metadata`` configures the Processor and whose
            ``data`` is fitted and transformed.
        synthesizing_method: Passed to ``Synthesizer`` as ``method``.
        scaler_inhibit: When True, set the scaler config of every column in
            ``load.data`` to None before fitting, and print the scaler config
            before and after that update.
        epsilon: Passed to ``Synthesizer`` as ``epsilon``. Defaults to 10.0,
            the value that was previously hard-coded.

    Returns:
        None. All results are printed.
    """
    processor = Processor(
        metadata=load.metadata,
    )

    if scaler_inhibit:
        # The pretty-printer is only needed for the config dumps on this branch.
        printer = pprint.PrettyPrinter(depth=2)
        print("Preproc config of Scaler before update as ...")
        printer.pprint(processor.get_config()['scaler'])
        processor.update_config(
            {'scaler': {
                col: None for col in load.data.columns
            }}
        )
        # Label fixed: this dump shows the config *after* the update.
        print("Preproc config of Scaler after update as ...")
        printer.pprint(processor.get_config()['scaler'])

    processor.fit(
        data=load.data,
        sequence=[
            'missing',
            'outlier',
            'scaler',
            'discretizing'
        ]
    )
    preproc_data = processor.transform(
        data=load.data
    )
    print("Preproc data as ...")
    print(preproc_data.head(1))

    syn = Synthesizer(
        method=synthesizing_method,
        epsilon=epsilon,
    )
    syn.create(data=preproc_data)
    syn.fit_sample()
    print("Sync data as ...")
    print(syn.data_syn.head(1))

    postproc_data = processor.inverse_transform(
        data=syn.data_syn
    )
    # Labelled so this preview is distinguishable from the synthetic-data one.
    print("Postproc data as ...")
    print(postproc_data.head(1))

In [24]:
import pprint


def issue332_gan(
        load: Loader,
        synthesizing_method: str,
        scaler_inhibit: bool = False,
        epsilon: float = 10.0
):
    """Run a preprocess -> synthesize -> postprocess round trip and print each stage.

    The preprocessing sequence is 'missing', 'outlier', 'encoder', 'scaler'
    (no discretizing step). A one-row preview of the data is printed after
    preprocessing, after synthesis, and after the inverse transform.

    Args:
        load: Loader whose ``metadata`` configures the Processor and whose
            ``data`` is fitted and transformed.
        synthesizing_method: Passed to ``Synthesizer`` as ``method``.
        scaler_inhibit: When True, set the scaler config of every column in
            ``load.data`` to None before fitting, and print the scaler config
            before and after that update.
        epsilon: Passed to ``Synthesizer`` as ``epsilon``. Defaults to 10.0,
            the value that was previously hard-coded.

    Returns:
        None. All results are printed.
    """
    processor = Processor(
        metadata=load.metadata,
    )

    if scaler_inhibit:
        # The pretty-printer is only needed for the config dumps on this branch.
        printer = pprint.PrettyPrinter(depth=2)
        print("Preproc config of Scaler before update as ...")
        printer.pprint(processor.get_config()['scaler'])
        processor.update_config(
            {'scaler': {
                col: None for col in load.data.columns
            }}
        )
        # Label fixed: this dump shows the config *after* the update.
        print("Preproc config of Scaler after update as ...")
        printer.pprint(processor.get_config()['scaler'])

    processor.fit(
        data=load.data,
        sequence=[
            'missing',
            'outlier',
            'encoder',
            'scaler'
        ]
    )
    preproc_data = processor.transform(
        data=load.data
    )
    print("Preproc data as ...")
    print(preproc_data.head(1))

    syn = Synthesizer(
        method=synthesizing_method,
        epsilon=epsilon,
    )
    syn.create(data=preproc_data)
    syn.fit_sample()
    print("Sync data as ...")
    print(syn.data_syn.head(1))

    postproc_data = processor.inverse_transform(
        data=syn.data_syn
    )
    # Labelled so this preview is distinguishable from the synthetic-data one.
    print("Postproc data as ...")
    print(postproc_data.head(1))

In [29]:
# Run the round trip with the 'smartnoise-aim' method, leaving the scaler config untouched.
issue332(load=load, synthesizing_method='smartnoise-aim')

Preproc data as ...
   age  workclass fnlwgt  education educational-num  marital-status  \
0  0.0          2    0.0          1             0.0               4   

   occupation  relationship  race  gender capital-gain capital-loss  \
0           6             3     2       1          0.0          0.0   

  hours-per-week  native-country  income  
0            0.0              25       0  
Synthesizer (SmartNoise): Fitting aim.
15
Initial Sigma 11.056041228448832




Synthesizer (SmartNoise): Fitting  aim spent 103.6998 sec.
Synthesizer (SmartNoise): Sampling aim # 558 rows (same as raw) in 0.0282 sec.
Sync data as ...
   age  workclass  fnlwgt  education  educational-num  marital-status  \
0  0.0          2     0.0          7              0.0               2   

   occupation  relationship  race  gender  capital-gain  capital-loss  \
0           4             0     4       1           0.0           0.0   

   hours-per-week  native-country  income  
0             0.0              25       1  
          age workclass        fnlwgt   education  educational-num  \
0  371.264583   Private  8.479563e+09  Assoc-acdm         16.57591   

       marital-status       occupation relationship   race gender  \
0  Married-civ-spouse  Farming-fishing      Husband  White   Male   

   capital-gain  capital-loss  hours-per-week native-country income  
0  9.268677e+07  96578.239577       172.18284  United-States   >50K  


上述執行會出現 `RuntimeError: all elements of input should be between 0 and 1`。原因是 input 中含有 NA 值：前處理的 `outlier` 步驟將離群值去除後，`capital-gain` 與 `capital-loss` 兩個欄位內的值皆相同（各欄只剩單一數值），因此 `smartnoise` 進行 `MinMaxTransformer`（即 $(x - \min) / (\max - \min)$）時分母為 0，運算結果為 NA。因此針對 adult 資料集，將前處理中的 `outlier` 拿掉，就可以正常運行了。