In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
from PETsARD import (
    Loader,
    Processor,
    Synthesizer,
    AutoML
)


# Point the Loader at the "adult-income" benchmark dataset, then read it so
# the frame is available as `load.data`.
load = Loader(filepath='benchmark://adult-income')
load.load()

Loader - Benchmarker: file benchmark/adult-income.csv already exist and match SHA-256.
                      PETsARD will ignore download and use local data directly.


In [3]:
# Display the loaded frame as this cell's output.
load.data

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [21]:
# Build the AutoML evaluator with 'quality' as the target column.
# NOTE(review): the frame displayed by the preceding cell has no 'quality'
# column (its last column is 'income'), and the later AutoML cell in this
# notebook passes the key 'method' rather than 'task'. Together with the
# out-of-sequence execution count, this cell looks stale — confirm the
# intended dataset and config key before re-running.
ml = AutoML({'target': 'quality', 'task': 'regression'})

In [22]:
# Register the frames to compare. The same frame is passed as both 'ori'
# and 'syn' here, so the two sides of the comparison are identical.
ml.create({'ori': load.data, 'syn': load.data})

In [23]:
# Run the evaluation on the frames registered via ml.create().
ml.eval()

In [24]:
# Show the summary table (mean/std per side and pct_change) as the cell output.
ml.get_global()

Unnamed: 0,Ori_mean,Ori_std,Syn_mean,Syn_std,pct_change
0,0.412246,0.074767,0.412246,0.074767,0.0


# Functional Test: import PETsARD

In [1]:
import PETsARD

# Functional Test: Module-by-Module

## Loader

In [1]:
from PETsARD import Loader


# Load the red-wine-quality benchmark and peek at its first row.
load = Loader(filepath='benchmark://winequality_red')
load.load()
print(load.data.head(1))

Loader - Benchmarker: file benchmark/winequality_red.csv already exist and match SHA-256.
                      PETsARD will ignore download and use local data directly.
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4               0.7          0.0             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  


## Splitter

In [2]:
from PETsARD import Splitter


# Split the loaded frame with the Splitter settings below, then inspect the
# entry stored under key 1 of `split.data`.
split = Splitter(num_samples=30, train_split_ratio=0.1)
split.split(data=load.data)

# Row counts first, then the leading row of each partition
# (train before validation in both passes).
for partition in ('train', 'validation'):
    print(split.data[1][partition].shape[0])
for partition in ('train', 'validation'):
    print(split.data[1][partition].head(1))


159
1440
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.8              0.88          0.0             2.6      0.098   

   free sulfur dioxide  total sulfur dioxide  density   pH  sulphates  \
0                 25.0                  67.0   0.9968  3.2       0.68   

   alcohol  quality  
0      9.8        5  
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4               0.7          0.0             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  


## Processor: transform()

In [3]:
from PETsARD import Processor

### Normal - for SDV

In [4]:
# Fit a Processor (built from the loader's metadata) on the 'train' partition
# of split 1, then transform that same partition and show its first row.
proc = Processor(metadata=load.metadata)
proc.fit(data=split.data[1]['train'])
preproc_data = proc.transform(data=split.data[1]['train'])
print(preproc_data.head(1))

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0      -0.268691          1.932518    -1.326988        0.034392   0.048582   

   free sulfur dioxide  total sulfur dioxide   density        pH  sulphates  \
0             0.907264              0.519488  0.064426 -0.678962   0.108339   

   alcohol   quality  
0 -0.54329 -0.643415  


## Synthesizer

In [5]:
from PETsARD import Synthesizer

### Normal - for SDV

In [6]:
# Synthesizer methods to exercise; each name is handed to Synthesizer(method=...).
sdv_methods = ['sdv-single_table-gaussiancopula']

for synthesizing_method in sdv_methods:
    print(f"Synthesizing method: {synthesizing_method}")
    # NOTE(review): epsilon is forwarded to Synthesizer; whether the SDV
    # GaussianCopula backend actually consumes it is not visible here — confirm.
    syn = Synthesizer(method=synthesizing_method, epsilon=10.0)
    syn.create(data=preproc_data)
    syn.fit_sample()
    # `syn` stays bound after the loop; the inverse_transform cell reads syn.data_syn.
    print(syn.data_syn.head(1))

Synthesizing method: sdv-single_table-gaussiancopula
Synthesizer (SDV - SingleTable): Metafile loading time: 0.0059 sec.
Synthesizer (SDV - SingleTable): Fitting GaussianCopula.
Synthesizer (SDV - SingleTable): Fitting  GaussianCopula spent 0.258 sec.




Synthesizer (SDV - SingleTable): Sampling GaussianCopula # 104 rows (same as raw) in 0.0383 sec.
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0      -0.400554         -0.869795     0.988565       -0.277808  -0.369581   

   free sulfur dioxide  total sulfur dioxide   density        pH  sulphates  \
0             0.812296              0.541134  0.075377 -0.005738  -0.409272   

    alcohol   quality  
0  0.682881  0.246472  


  return _boost._beta_ppf(q, a, b)


## Processor: inverse_transform()

In [7]:
# Map the synthetic sample back through the fitted Processor and show its first row.
postproc_data = proc.inverse_transform(data=syn.data_syn)
print(postproc_data.head(1))

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0       7.547828          0.373109     0.479043        2.246029     0.0692   

   free sulfur dioxide  total sulfur dioxide  density        pH  sulphates  \
0             24.05694             67.765236  0.99682  3.307012   0.587451   

     alcohol   quality  
0  11.287118  5.782869  


In [8]:
from PETsARD import AutoML

In [9]:
# Build the AutoML evaluator: target column 'quality', method 'regression'.
# n_splits presumably sets the number of evaluation rounds — the eval cell's
# progress bars each count to 8; confirm against the AutoML implementation.
ml = AutoML({'target': 'quality', 'method': 'regression', 'n_splits': 8})

In [10]:
# Register the frames to compare: the full loaded frame as 'ori' and the
# inverse-transformed synthetic sample as 'syn'.
ml.create({'ori': load.data, 'syn': postproc_data})

In [11]:
# Run the evaluation on the frames registered via ml.create().
ml.eval()

Regression: 100%|██████████| 8/8 [00:05<00:00,  1.59it/s]
Regression: 100%|██████████| 8/8 [00:00<00:00, 10.32it/s]


In [12]:
# Show the summary table (mean/std per side and pct_change) as the cell output.
ml.get_global()

Unnamed: 0,ori_mean,ori_std,syn_mean,syn_std,pct_change
0,0.413081,0.084311,0.034577,0.519624,-37.8504


In [13]:
# Inspect the raw per-model score lists recorded for the 'ori' side.
ml.ml.result_ori

{'linear_regression': [0.42173980942193723,
  0.3064532412111288,
  0.3179781851278146,
  0.3349721126605292,
  0.21567202689495102,
  0.37658471549020656,
  0.3414297414873978,
  0.4395767169591305],
 'random_forest': [0.5158899055489965,
  0.4707491390491524,
  0.4660693254346062,
  0.5490043664970591,
  0.377468253968254,
  0.548078939909297,
  0.40481440677966096,
  0.5464754745333755],
 'gradient_boosting': [0.42841832519592304,
  0.4234695050338778,
  0.3882905606321122,
  0.4474806712667623,
  0.31412209510112377,
  0.45677288417329287,
  0.3354252650032965,
  0.48700875432112545]}

In [14]:
# Inspect the raw per-model score lists recorded for the 'syn' side.
ml.ml.result_syn

{'linear_regression': [0.3783325857694889,
  -0.19504790664559812,
  0.5368086303397825,
  0.20894869945312,
  0.18931949217493005,
  -0.7099122063536645,
  0.5390040107524253,
  0.01860855415278484],
 'random_forest': [0.20218792650490114,
  -0.1690753209376641,
  0.5377055216832527,
  0.5264781828068367,
  -0.08196793989172235,
  -0.9164868645365081,
  0.5342704707710734,
  -0.22058716608816087],
 'gradient_boosting': [0.1255257931885576,
  -0.12067863994099648,
  0.507851804508025,
  0.528486114983001,
  -0.4408367115847549,
  -1.4687259859184656,
  0.5948827299565028,
  -0.27524803415049015]}