In [1]:
import pandas as pd
import numpy as np

In [2]:
# %cd /Users/alex/PETsARD

import os
import sys

path_petsard = os.path.dirname(os.path.dirname(os.getcwd()))
print(path_petsard)
sys.path.append(path_petsard)

d:\Dropbox\89_other_application\GitHub\PETsARD


In [3]:
from PETsARD import (
    Loader,
    Processor,
    Synthesizer
)


load = Loader(
    filepath='benchmark://adult-income',
    na_values={k: '?' for k in [
        'workclass',
        'occupation',
        'native-country'
    ]}
)
load.load()

Loader - Benchmarker : Success download the benchmark dataset from https://petsard-benchmark.s3.amazonaws.com/adult-income.csv.


In [4]:
load.data = load.data.loc[:1000, :]

In [5]:
import pprint


def issue332(
        load: Loader,
        synthesizing_method: str,
        scaler_inhibit: bool = False
):
    pp = pprint.PrettyPrinter(depth=2)

    proc_discretizing = Processor(
        metadata=load.metadata,
    )

    if scaler_inhibit:
        print("Preproc config of Scaler before update as ...")
        pp.pprint(proc_discretizing.get_config()['scaler'])
        proc_discretizing.update_config(
            {'scaler': {
                col: None for col in load.data.columns
            }}
        )
        print("Preproc config of Scaler before after as ...")
        pp.pprint(proc_discretizing.get_config()['scaler'])

    proc_discretizing.fit(
        data=load.data,
        sequence=[
            'missing',
            'outlier',
            'scaler',
            'discretizing'
        ]
    )
    preproc_discretizing_data = proc_discretizing.transform(
        data=load.data
    )
    print("Preproc data as ...")
    print(preproc_discretizing_data.head(1))


    syn = Synthesizer(
        method=synthesizing_method,
        epsilon=10.0,
    )
    syn.create(data=preproc_discretizing_data)
    syn.fit_sample()
    print("Sync data as ...")
    print(syn.data_syn.head(1))
    postproc_discretizing_data = proc_discretizing.inverse_transform(
        data=syn.data_syn
    )
    print(postproc_discretizing_data.head(1))

In [7]:
import pprint


def issue332_gan(
        load: Loader,
        synthesizing_method: str,
        scaler_inhibit: bool = False,
        outlier_inhibit: bool = False
):
    pp = pprint.PrettyPrinter(depth=2)

    proc_discretizing = Processor(
        metadata=load.metadata,
    )

    if scaler_inhibit:
        print("Preproc config of Scaler before update as ...")
        pp.pprint(proc_discretizing.get_config()['scaler'])
        proc_discretizing.update_config(
            {'scaler': {
                col: None for col in load.data.columns
            }}
        )
        print("Preproc config of Scaler before after as ...")
        pp.pprint(proc_discretizing.get_config()['scaler'])

    if outlier_inhibit:
        print("Preproc config of Outlier before update as ...")
        pp.pprint(proc_discretizing.get_config()['scaler'])
        proc_discretizing.update_config(
            {'outlier': {
                col: None for col in load.data.columns
            }}
        )
        print("Preproc config of Outlier before after as ...")
        pp.pprint(proc_discretizing.get_config()['scaler'])


    proc_discretizing.fit(
        data=load.data,
        sequence=[
            'missing',
            'outlier',
            'encoder',
            'scaler'
        ]
    )
    preproc_discretizing_data = proc_discretizing.transform(
        data=load.data
    )
    print("Preproc data as ...")
    print(preproc_discretizing_data.head(1))


    syn = Synthesizer(
        method=synthesizing_method,
        epsilon=10.0,
    )
    syn.create(data=preproc_discretizing_data)
    syn.fit_sample()
    print("Sync data as ...")
    print(syn.data_syn.head(1))
    postproc_discretizing_data = proc_discretizing.inverse_transform(
        data=syn.data_syn
    )
    print(postproc_discretizing_data.head(1))

In [8]:
issue332(
    load=load,
    synthesizing_method='smartnoise-aim'
)

Preproc data as ...
   age  workclass  fnlwgt  education  educational-num  marital-status  \
0  0.0          2     0.0          1              0.0               4   

   occupation  relationship  race  gender  capital-gain  capital-loss  \
0           6             3     2       1           0.0           0.0   

   hours-per-week  native-country  income  
0             0.0              25       0  




Synthesizer (SmartNoise): Fitting aim.
15
Initial Sigma 11.056041228448832
Synthesizer (SmartNoise): Fitting  aim spent 270.8398 sec.
Synthesizer (SmartNoise): Sampling aim # 558 rows (same as raw) in 0.5135 sec.
Sync data as ...
   age  workclass  fnlwgt  education  educational-num  marital-status  \
0  0.0          2     0.0          4              0.0               2   

   occupation  relationship  race  gender  capital-gain  capital-loss  \
0           6             0     4       1           0.0           0.0   

   hours-per-week  native-country  income  
0             0.0              25       1  
          age workclass        fnlwgt education  educational-num  \
0  371.264583   Private  8.479563e+09   5th-6th         16.57591   

       marital-status         occupation relationship   race gender  \
0  Married-civ-spouse  Machine-op-inspct      Husband  White   Male   

   capital-gain  capital-loss  hours-per-week native-country income  
0  9.268677e+07  96578.239577       17

上述執行會出現 `RuntimeError: all elements of input should be between 0 and 1`，原因是 input 中含有 NA 值，來自於前處理過程中將離群值去除掉，導致 `capital-gain`, `capital-loss` 兩個欄位的值皆相同，在 `smartnoise` 中進行 `MinMaxTransformer` 時會讓運算結果為 NA （分母為 0）。因此針對 adult 資料集，將其中的 `outlier` 前處理拿掉，就可以正常運行了。

In [9]:
issue332(
    load=load,
    synthesizing_method='smartnoise-mst'
)

Preproc data as ...
   age  workclass  fnlwgt  education  educational-num  marital-status  \
0  0.0          2     0.0          1              0.0               4   

   occupation  relationship  race  gender  capital-gain  capital-loss  \
0           6             3     2       1           0.0           0.0   

   hours-per-week  native-country  income  
0             0.0              25       0  
Synthesizer (SmartNoise): Fitting mst.
Synthesizer (SmartNoise): Fitting  mst spent 113.5179 sec.
Synthesizer (SmartNoise): Sampling mst # 558 rows (same as raw) in 0.4945 sec.
Sync data as ...
   age  workclass  fnlwgt  education  educational-num  marital-status  \
0  0.0          0     0.0          9              0.0               2   

   occupation  relationship  race  gender  capital-gain  capital-loss  \
0          11             0     4       1           0.0           0.0   

   hours-per-week  native-country  income  
0             0.0              25       0  
          age    workc

 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

In [10]:
issue332(
    load=load,
    synthesizing_method='smartnoise-pacsynth'
)

Preproc data as ...
   age  workclass  fnlwgt  education  educational-num  marital-status  \
0  0.0          2     0.0          1              0.0               4   

   occupation  relationship  race  gender  capital-gain  capital-loss  \
0           6             3     2       1           0.0           0.0   

   hours-per-week  native-country  income  
0             0.0              25       0  
Synthesizer (SmartNoise): Fitting pacsynth.
Synthesizer (SmartNoise): Fitting  pacsynth spent 0.2158 sec.
Synthesizer (SmartNoise): Sampling pacsynth # 558 rows (same as raw) in 1.6921 sec.
Sync data as ...
   age  workclass  fnlwgt  education  educational-num  marital-status  \
0  0.0        2.0     0.0       11.0              0.0             2.0   

   occupation  relationship  race  gender  capital-gain  capital-loss  \
0         6.0           0.0   4.0     1.0           0.0           0.0   

   hours-per-week  native-country  income  
0             0.0            25.0     0.0  


ValueError: Input contains NaN.

In [12]:
issue332_gan(
        load = load,
        synthesizing_method = 'smartnoise-dpctgan',
        outlier_inhibit = True
)

Preproc config of Outlier before update as ...
{'age': <PETsARD.processor.scaler.ScalerStandard object at 0x000001F484229F90>,
 'capital-gain': <PETsARD.processor.scaler.ScalerStandard object at 0x000001F48422B3D0>,
 'capital-loss': <PETsARD.processor.scaler.ScalerStandard object at 0x000001F4842384F0>,
 'education': None,
 'educational-num': <PETsARD.processor.scaler.ScalerStandard object at 0x000001F484229F60>,
 'fnlwgt': <PETsARD.processor.scaler.ScalerStandard object at 0x000001F484228C70>,
 'gender': None,
 'hours-per-week': <PETsARD.processor.scaler.ScalerStandard object at 0x000001F4842385B0>,
 'income': None,
 'marital-status': None,
 'native-country': None,
 'occupation': None,
 'race': None,
 'relationship': None,
 'workclass': None}
Preproc config of Outlier before after as ...
{'age': <PETsARD.processor.scaler.ScalerStandard object at 0x000001F484229F90>,
 'capital-gain': <PETsARD.processor.scaler.ScalerStandard object at 0x000001F48422B3D0>,
 'capital-loss': <PETsARD.proce



Epoch 1, Loss G: 0.7107, Loss D: 1.3900
epsilon is 0.08246410416899833, alpha is 63.0
Epoch 2, Loss G: 0.7097, Loss D: 1.3898
epsilon is 0.6154321240229542, alpha is 21.0
Epoch 3, Loss G: 0.7048, Loss D: 1.3908
epsilon is 0.8717652885709554, alpha is 16.0
Epoch 4, Loss G: 0.7016, Loss D: 1.3897
epsilon is 1.0734433928301887, alpha is 14.0
Epoch 5, Loss G: 0.7059, Loss D: 1.3874
epsilon is 1.2474424384805656, alpha is 12.0
Epoch 6, Loss G: 0.7015, Loss D: 1.3895
epsilon is 1.403078289939324, alpha is 10.9
Epoch 7, Loss G: 0.6987, Loss D: 1.3876
epsilon is 1.5430774752709655, alpha is 10.6
Epoch 8, Loss G: 0.6922, Loss D: 1.3928
epsilon is 1.6748565545536702, alpha is 10.0
Epoch 9, Loss G: 0.6947, Loss D: 1.3898
epsilon is 1.7986930762464894, alpha is 9.4
Epoch 10, Loss G: 0.6898, Loss D: 1.3850
epsilon is 1.9160035051686128, alpha is 9.0
Epoch 11, Loss G: 0.6888, Loss D: 1.3885
epsilon is 2.027907968415127, alpha is 8.7
Epoch 12, Loss G: 0.6854, Loss D: 1.3891
epsilon is 2.1351217811139

In [13]:
issue332_gan(
        load = load,
        synthesizing_method = 'smartnoise-patectgan',
        outlier_inhibit = True
)

Preproc config of Outlier before update as ...
{'age': <PETsARD.processor.scaler.ScalerStandard object at 0x000001F485D4CB20>,
 'capital-gain': <PETsARD.processor.scaler.ScalerStandard object at 0x000001F485D4C100>,
 'capital-loss': <PETsARD.processor.scaler.ScalerStandard object at 0x000001F485D4C2B0>,
 'education': None,
 'educational-num': <PETsARD.processor.scaler.ScalerStandard object at 0x000001F485D4DFF0>,
 'fnlwgt': <PETsARD.processor.scaler.ScalerStandard object at 0x000001F485D4D480>,
 'gender': None,
 'hours-per-week': <PETsARD.processor.scaler.ScalerStandard object at 0x000001F485D4C670>,
 'income': None,
 'marital-status': None,
 'native-country': None,
 'occupation': None,
 'race': None,
 'relationship': None,
 'workclass': None}
Preproc config of Outlier before after as ...
{'age': <PETsARD.processor.scaler.ScalerStandard object at 0x000001F485D4CB20>,
 'capital-gain': <PETsARD.processor.scaler.ScalerStandard object at 0x000001F485D4C100>,
 'capital-loss': <PETsARD.proce