# Environment

In [1]:
import os
import pprint
import sys

# Make the local PETsARD checkout importable. The package root is taken to be
# two directory levels above the current working directory, so this cell
# depends on where the kernel was started.
path_petsard = os.path.dirname(os.path.dirname(os.getcwd()))
print(path_petsard)
# Guard the append: re-running this cell must not stack duplicate entries
# onto sys.path.
if path_petsard not in sys.path:
    sys.path.append(path_petsard)

# Shared pretty-printer; depth=2 abbreviates anything nested deeper than
# two levels.
pp = pprint.PrettyPrinter(depth=2)

d:\Dropbox\89_other_application\GitHub\PETsARD


## Data Loading

In [6]:
from PETsARD import (
    Loader,
    Processor,
    Synthesizer
)


# Load the 'adult' benchmark dataset; for the three listed columns the
# string '?' is passed as the NA marker.
load = Loader(
    filepath='benchmark://adult',
    na_values=dict.fromkeys(
        ['workclass', 'occupation', 'native-country'],
        '?',
    ),
)
load.load()

Loader - Benchmarker: file benchmark\adult.csv already exist and match SHA-256.
                      PETsARD will ignore download and use local data directly.
   age  workclass  fnlwgt  education  educational-num  marital-status  \
0  0.0          3     0.0          1              0.0               4   

   occupation  relationship  race  gender  capital-gain  capital-loss  \
0           6             3     2       1           0.0           0.0   

   hours-per-week  native-country  income  
0             0.0              38       0  


# Executable SmartNoise check

In [16]:
import pkg_resources

import snsynth


# snsynth.__version__
# AttributeError: module 'snsynth' has no attribute '__version__'

# snsynth.__file__
# No version info included there either.

# Look up the installed distribution's metadata, because the snsynth module
# itself exposes no __version__ attribute.
# importlib.metadata (Python 3.8+) replaces the deprecated pkg_resources API;
# it is imported here so this lookup is self-contained.
from importlib import metadata

package_name = 'smartnoise-synth' # = snsynth
try:
    dist = metadata.distribution(package_name)
    print(f"Name: {dist.metadata['Name']}")
    print(f"Version: {dist.version}")
except metadata.PackageNotFoundError:
    print(f"Package '{package_name}' not found")
# Name: smartnoise-synth
# Version: 0.3.3

Name: smartnoise-synth
Version: 0.3.3


In [45]:
import pprint

import pandas as pd


def issue332(
        load: Loader,
        synthesizing_method: str,
        scaler_inhibit: bool = False
):
    """
    Run the preprocessing + synthesizing pipeline used to reproduce issue 332.

    The loaded data is preprocessed with the sequence
    missing -> outlier -> scaler -> discretizing, then handed to a
    Synthesizer (epsilon=10.0) which is fitted and sampled. Previews of the
    preprocessed and synthetic data are printed.

    Args:
        load (Loader): Loader whose ``metadata`` and ``data`` attributes
            are read; it is expected to have been loaded already.
        synthesizing_method (str): Method name passed to ``Synthesizer``,
            e.g. 'smartnoise-mwem'.
        scaler_inhibit (bool, default False): When True, the scaler config
            of every column in ``load.data`` is set to None before fitting,
            and the scaler config is printed before and after that update.

    Returns:
        None. All results are reported through printed output.

    Raises:
        Whatever Processor or Synthesizer raise is propagated unchanged.
    """
    pp = pprint.PrettyPrinter(depth=2)

    proc_discretizing = Processor(
        metadata=load.metadata,
    )

    if scaler_inhibit:
        print("Preproc config of Scaler before update as ...")
        pp.pprint(proc_discretizing.get_config()['scaler'])
        # Disable scaling for every column, not only the numeric ones.
        proc_discretizing.update_config(
            {'scaler': {
                col: None for col in load.data.columns
            }}
        )
        # Message fixed: this is printed after the update
        # (it used to read "before after").
        print("Preproc config of Scaler after update as ...")
        pp.pprint(proc_discretizing.get_config()['scaler'])

    proc_discretizing.fit(
        data=load.data,
        sequence=[
            'missing',
            'outlier',
            'scaler',
            'discretizing'
        ]
    )
    preproc_discretizing_data = proc_discretizing.transform(
        data=load.data
    )
    print("Preproc data as ...")
    print(preproc_discretizing_data.head(1))

    syn = Synthesizer(
        method=synthesizing_method,
        epsilon=10.0,
    )
    syn.create(data=preproc_discretizing_data)
    syn.fit_sample()
    print("Sync data as ...")
    print(syn.data_syn.head(1))

## Pure Discretizing

### 'smartnoise-aim' - aim not found

In [46]:
# This call is expected to fail; the failure is the point of the cell.
# Catch only the documented exception and print it, so a
# "Restart & Run All" is not aborted here and later cells still execute.
try:
    issue332(
        load=load,
        synthesizing_method='smartnoise-aim',
    )
except ValueError as exc:
    print(f"{type(exc).__name__}: {exc}")
# ValueError: Synthesizer aim not found

Preproc data as ...
   age  workclass  fnlwgt  education  educational-num  marital-status  \
0  0.0          3     0.0          1              0.0               4   

   occupation  relationship  race  gender  capital-gain  capital-loss  \
0           6             3     2       1           0.0           0.0   

   hours-per-week  native-country  income  
0             0.0              38       0  


ValueError: Synthesizer aim not found

### 'smartnoise-mwem' - MemoryError

In [47]:
# This call is expected to fail; the failure is the point of the cell.
# Catch only the documented exception and print it, so a
# "Restart & Run All" is not aborted here and later cells still execute.
try:
    issue332(
        load=load,
        synthesizing_method='smartnoise-mwem',
    )
except MemoryError as exc:
    print(f"{type(exc).__name__}: {exc}")
# MemoryError: Unable to allocate 5.30 TiB for an array with shape (728012759040,) and data type int64

Preproc data as ...
   age  workclass  fnlwgt  education  educational-num  marital-status  \
0  0.0          3     0.0          1              0.0               4   

   occupation  relationship  race  gender  capital-gain  capital-loss  \
0           6             3     2       1           0.0           0.0   

   hours-per-week  native-country  income  
0             0.0              38       0  
Synthesizer (SmartNoise): Fitting mwem.


MemoryError: Unable to allocate 5.30 TiB for an array with shape (728012759040,) and data type int64

### 'smartnoise-mst' - No module named 'disjoint_set'

In [49]:
# This call is expected to fail; the failure is the point of the cell.
# Catch only the documented exception and print it, so a
# "Restart & Run All" is not aborted here and later cells still execute.
try:
    issue332(
        load=load,
        synthesizing_method='smartnoise-mst',
    )
except ModuleNotFoundError as exc:
    print(f"{type(exc).__name__}: {exc}")
# Please install mbi with:
#   pip install git+https://github.com/ryan112358/private-pgm.git
# ModuleNotFoundError: No module named 'disjoint_set'

Preproc data as ...
   age  workclass  fnlwgt  education  educational-num  marital-status  \
0  0.0          3     0.0          1              0.0               4   

   occupation  relationship  race  gender  capital-gain  capital-loss  \
0           6             3     2       1           0.0           0.0   

   hours-per-week  native-country  income  
0             0.0              38       0  
Please install mbi with:
   pip install git+https://github.com/ryan112358/private-pgm.git


ModuleNotFoundError: No module named 'disjoint_set'

### 'smartnoise-pacsynth' - continuous columns

In [50]:
# This call is expected to fail; the failure is the point of the cell.
# Catch only the documented exception and print it, so a
# "Restart & Run All" is not aborted here and later cells still execute.
try:
    issue332(
        load=load,
        synthesizing_method='smartnoise-pacsynth',
    )
except ValueError as exc:
    print(f"{type(exc).__name__}: {exc}")
# ValueError: The transformer appears to have some continuous columns. Please provide only categorical or ordinal.

Preproc data as ...
   age  workclass  fnlwgt  education  educational-num  marital-status  \
0  0.0          3     0.0          1              0.0               4   

   occupation  relationship  race  gender  capital-gain  capital-loss  \
0           6             3     2       1           0.0           0.0   

   hours-per-week  native-country  income  
0             0.0              38       0  
Synthesizer (SmartNoise): Fitting pacsynth.


ValueError: The transformer appears to have some continuous columns. Please provide only categorical or ordinal.

## Auto Inhibit Scaler

### 'smartnoise-pacsynth' - auto scaler inhibit

In [52]:
# Same synthesizer with the scaler disabled for every column. The call is
# still expected to fail; catch only the documented exception and print it,
# so a "Restart & Run All" is not aborted here and later cells still execute.
try:
    issue332(
        load=load,
        synthesizing_method='smartnoise-pacsynth',
        scaler_inhibit=True
    )
except ValueError as exc:
    print(f"{type(exc).__name__}: {exc}")
# ValueError: The transformer appears to have some continuous columns. Please provide only categorical or ordinal.

Preproc config of Scaler before update as ...
{'age': <PETsARD.processor.scaler.ScalerStandard object at 0x0000017DC8129DE0>,
 'capital-gain': <PETsARD.processor.scaler.ScalerStandard object at 0x0000017DC812ABC0>,
 'capital-loss': <PETsARD.processor.scaler.ScalerStandard object at 0x0000017DC812ACB0>,
 'education': None,
 'educational-num': <PETsARD.processor.scaler.ScalerStandard object at 0x0000017DC8128D90>,
 'fnlwgt': <PETsARD.processor.scaler.ScalerStandard object at 0x0000017DC8129300>,
 'gender': None,
 'hours-per-week': <PETsARD.processor.scaler.ScalerStandard object at 0x0000017DC812B100>,
 'income': None,
 'marital-status': None,
 'native-country': None,
 'occupation': None,
 'race': None,
 'relationship': None,
 'workclass': None}
Preproc config of Scaler before after as ...
{'age': None,
 'capital-gain': None,
 'capital-loss': None,
 'education': None,
 'educational-num': None,
 'fnlwgt': None,
 'gender': None,
 'hours-per-week': None,
 'income': None,
 'marital-status': N

ValueError: The transformer appears to have some continuous columns. Please provide only categorical or ordinal.

## dtype problem

In [57]:
# Re-run the preprocessing outside issue332, with the processor's default
# config, to inspect the resulting column dtypes.
proc_discretizing = Processor(metadata=load.metadata)
# To repeat the check with the scaler disabled for every column, uncomment:
# proc_discretizing.update_config(
#     {'scaler': {col: None for col in load.data.columns}}
# )
proc_discretizing.fit(
    data=load.data,
    sequence=['missing', 'outlier', 'scaler', 'discretizing'],
)
preproc_discretizing_data = proc_discretizing.transform(data=load.data)

# Show the dtype of each column after preprocessing.
pp.pprint(preproc_discretizing_data.dtypes)

age                float64
workclass            int32
fnlwgt             float64
education            int32
educational-num    float64
marital-status       int32
occupation           int32
relationship         int32
race                 int32
gender               int32
capital-gain       float64
capital-loss       float64
hours-per-week     float64
native-country       int32
income               int32
dtype: object
