In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
from PETsARD import (
    Loader,
    Processor,
    Synthesizer,
    AutoML
)


# Point the PETsARD Loader at the adult-income benchmark and load it.
load = Loader(filepath='benchmark://adult-income')
load.load()

Loader - Benchmarker: file benchmark/adult-income.csv already exist and match SHA-256.
                      PETsARD will ignore download and use local data directly.


In [3]:
# Display the loaded data as the cell's output (the notebook's rich display
# elides the middle rows, as seen in the recorded output).
load.data

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [21]:
# Build an AutoML evaluator configured for a regression task on 'quality'.
# NOTE(review): the adult-income column listing displayed by the previous cell
# has no 'quality' column, and this cell's execution count (21) does not follow
# the preceding cell (3). Presumably `load` held a different dataset when this
# ran — confirm this cell still works after Restart Kernel -> Run All.
ml = AutoML({'target': 'quality', 'task': 'regression'})

In [22]:
# Register the data to compare. The same frame (`load.data`) is passed as both
# 'ori' and 'syn', so both sides of the comparison see identical input.
ml.create({'ori': load.data, 'syn': load.data})

In [23]:
# Run the evaluation on the AutoML instance; this cell records no output.
ml.eval()

In [24]:
# Display the global summary; the recorded output is a one-row table with
# columns Ori_mean, Ori_std, Syn_mean, Syn_std and pct_change.
ml.get_global()

Unnamed: 0,Ori_mean,Ori_std,Syn_mean,Syn_std,pct_change
0,0.412246,0.074767,0.412246,0.074767,0.0


# Functional Test: import PETsARD

In [1]:
import PETsARD

# Functional Test: Module-by-Module

## Loader

In [16]:
from PETsARD import Loader


# Load the winequality_red benchmark and preview its first row.
load = Loader(filepath='benchmark://winequality_red')
load.load()
print(load.data.head(1))

Loader - Benchmarker: file benchmark/winequality_red.csv already exist and match SHA-256.
                      PETsARD will ignore download and use local data directly.
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4               0.7          0.0             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  


## Splitter

In [17]:
from PETsARD import Splitter


# Split the loaded data and inspect split #1: row counts first, then the
# first row of each partition (train, then validation).
split = Splitter(num_samples=30, train_split_ratio=0.1)
split.split(data=load.data)

# One print per group; sep='\n' keeps each item on its own line, matching
# the output of separate print() calls.
print(
    split.data[1]['train'].shape[0],
    split.data[1]['validation'].shape[0],
    sep='\n',
)
print(
    split.data[1]['train'].head(1),
    split.data[1]['validation'].head(1),
    sep='\n',
)


159
1440
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.59         0.08             4.4      0.086   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                  6.0                  29.0   0.9974  3.38        0.5   

   alcohol  quality  
0      9.0        4  
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4               0.7          0.0             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  


## Processor: transform()

In [18]:
from PETsARD import Processor

### Normal - for SDV

In [19]:
# Fit a Processor on the training partition of split #1, then transform that
# same partition and preview the first transformed row.
proc = Processor(metadata=load.metadata)
proc.fit(data=split.data[1]['train'])
preproc_data = proc.transform(data=split.data[1]['train'])
print(preproc_data.head(1))

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0      -1.269733         -0.752102    -0.560903       -0.784902   -0.20122   

   free sulfur dioxide  total sulfur dioxide   density        pH  sulphates  \
0            -0.419224              -0.65025 -0.833587  0.266551  -0.553126   

    alcohol   quality  
0 -1.006923 -0.760122  


## Synthesizer

In [20]:
from PETsARD import Synthesizer

### Normal - for SDV

In [21]:
# Synthesizer methods to exercise; each one is fitted on the preprocessed
# training data and the first synthetic row is printed.
sdv_methods = ['sdv-single_table-gaussiancopula']

# `syn` is intentionally left bound after the loop: the inverse_transform
# cell further down reads `syn.data_syn` from the last method run.
# NOTE(review): whether `epsilon` has any effect for this SDV method is not
# visible here — confirm against the Synthesizer documentation.
# NOTE(review): the recorded output reports 118 sampled rows while the
# Splitter cell's recorded train size is 159; the outputs presumably come
# from different runs — re-run the notebook top to bottom to confirm.
for synthesizing_method in sdv_methods:
    print(f"Synthesizing method: {synthesizing_method}")
    syn = Synthesizer(method=synthesizing_method, epsilon=10.0)
    syn.create(data=preproc_data)
    syn.fit_sample()
    print(syn.data_syn.head(1))

Synthesizing method: sdv-single_table-gaussiancopula
Synthesizer (SDV - SingleTable): Metafile loading time: 0.0145 sec.
Synthesizer (SDV - SingleTable): Fitting GaussianCopula.




Synthesizer (SDV - SingleTable): Fitting  GaussianCopula spent 0.2822 sec.
Synthesizer (SDV - SingleTable): Sampling GaussianCopula # 118 rows (same as raw) in 0.024 sec.
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0      -0.694753         -1.120062     0.434623       -0.133717  -0.582278   

   free sulfur dioxide  total sulfur dioxide  density        pH  sulphates  \
0             0.791371              -0.60667 -0.28701  0.202093  -0.052675   

   alcohol   quality  
0  1.23219  0.767144  


  return _boost._beta_ppf(q, a, b)


## Processor: inverse_transform()

In [22]:
# Map the synthetic data back through the fitted Processor and preview the
# first restored row.
postproc_data = proc.inverse_transform(data=syn.data_syn)
print(postproc_data.head(1))

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0       7.252948          0.316844     0.362156        2.336865   0.063099   

   free sulfur dioxide  total sulfur dioxide   density        pH  sulphates  \
0            22.469097             24.543552  0.996443  3.331012   0.638186   

     alcohol   quality  
0  11.630709  6.200489  


In [23]:
from PETsARD import AutoML

In [24]:
# Configure an AutoML regression evaluation against the 'quality' column.
# NOTE(review): 'k': 8 presumably sets the number of folds — the recorded
# per-model result lists further down each hold 8 scores; confirm.
ml = AutoML(
    {
        'target': 'quality',
        'task': 'regression',
        'k': 8,
    }
)

In [25]:
# Register the data to compare: the full loaded dataset as 'ori' and the
# inverse-transformed synthetic data as 'syn'.
ml.create({'ori': load.data, 'syn': postproc_data})

In [26]:
# Run the evaluation on the AutoML instance; this cell records no output.
ml.eval()

In [27]:
# Display the global summary; the recorded output is a one-row table with
# columns Ori_mean, Ori_std, Syn_mean, Syn_std and pct_change.
ml.get_global()

Unnamed: 0,Ori_mean,Ori_std,Syn_mean,Syn_std,pct_change
0,0.413081,0.084311,0.141892,0.307369,-27.118945


In [28]:
# Show the per-model score lists for the original data; the recorded output
# is a dict keyed by linear_regression, random_forest and gradient_boosting.
# NOTE(review): this reaches into the nested `ml.ml` attribute — presumably an
# internal object; confirm whether a public accessor exists.
ml.ml.result_ori

{'linear_regression': [0.42173980942193723,
  0.3064532412111288,
  0.3179781851278146,
  0.3349721126605292,
  0.21567202689495102,
  0.37658471549020656,
  0.3414297414873978,
  0.4395767169591305],
 'random_forest': [0.5158899055489965,
  0.4707491390491524,
  0.4660693254346062,
  0.5490043664970591,
  0.377468253968254,
  0.548078939909297,
  0.40481440677966096,
  0.5464754745333755],
 'gradient_boosting': [0.42841832519592304,
  0.4234695050338778,
  0.3882905606321122,
  0.4474806712667623,
  0.31412209510112377,
  0.45677288417329287,
  0.3354252650032965,
  0.48700875432112545]}

In [29]:
# Show the per-model score lists for the synthetic data; the recorded output
# is a dict keyed by linear_regression, random_forest and gradient_boosting.
# NOTE(review): this reaches into the nested `ml.ml` attribute — presumably an
# internal object; confirm whether a public accessor exists.
ml.ml.result_syn

{'linear_regression': [0.22328744622346375,
  0.4982837527075503,
  0.26613477963587195,
  0.5927080477989837,
  0.18186792224100956,
  0.2331913586436517,
  0.3163042261901444,
  -0.3412657797143268],
 'random_forest': [0.09946976696355336,
  0.512880511185454,
  0.2235868630682416,
  0.4702312175100255,
  -0.1417479191070703,
  0.1602151320278502,
  0.3391998071550114,
  -0.5562779500476269],
 'gradient_boosting': [-0.059521563414053125,
  0.3399661787756334,
  0.09796035639804057,
  0.3785751308268279,
  -0.34630572626481126,
  -0.12084934693359806,
  0.4091339376046367,
  -0.3716305427330886]}