# Loader - Load from csv

In [1]:
from petsard.loader import Loader

load = Loader(filepath="benchmark/adult-income.csv")
data, meta = load.load()
print(f"data: {type(data)}, meta: {type(meta)}")

data: <class 'pandas.core.frame.DataFrame'>, meta: <class 'petsard.metadater.metadata.Schema'>


# Just simplify

In [2]:
data = data.loc[0:99, :]
data.shape

(100, 15)

# Splitter

In [3]:
from petsard.loader import Splitter

split = Splitter(num_samples=5, train_split_ratio=0.8)
split_data, split_meta, exist_index = split.split(data=data, metadata=meta)

train_data = split_data[1]["train"]  # 第一次分割的訓練集
test_data = split_data[1]["validation"]  # 第一次分割的測試集

print(f"train_data: {type(train_data)}, test_data: {type(test_data)}")
print(f"train_data shape: {train_data.shape}, test_data shape: {test_data.shape}")

train_data: <class 'pandas.core.frame.DataFrame'>, test_data: <class 'pandas.core.frame.DataFrame'>
train_data shape: (80, 15), test_data shape: (20, 15)


# Processor (Pre-proc)

In [4]:
from petsard.processor import Processor

default_proc = Processor(
    metadata=meta,
)

default_proc.fit(data=train_data)

default_preproc_data = default_proc.transform(data=train_data)
print(f"default_preproc_data shape: {default_preproc_data.shape}")

default_preproc_data shape: (54, 15)


## Processor - Missing only

In [5]:
proc = Processor(
    metadata=meta,
    config={
        "missing": {
            "age": "missing_mean",
        },
    },
)

proc.fit(
    data=train_data,
    sequence=["missing"],
)

preproc_data = proc.transform(data=train_data)
print(f"preproc_data shape: {preproc_data.shape}")
preproc_data["age"].head(10)

preproc_data shape: (80, 15)


0    25
1    38
2    28
3    18
4    29
5    63
6    24
7    36
8    58
9    20
Name: age, dtype: int64

## Processor - Outlier only

In [6]:
proc = Processor(
    metadata=meta,
    config={
        "outlier": {
            "age": "outlier_zscore",
        },
    },
)

proc.fit(
    data=train_data,
    sequence=["outlier"],
)

preproc_data = proc.transform(data=train_data)
print(f"preproc_data shape: {preproc_data.shape}")
preproc_data["age"].head(10)

preproc_data shape: (54, 15)


0    25
1    38
2    28
3    18
4    29
5    24
6    36
7    58
8    40
9    34
Name: age, dtype: int64

## Processor - Encoder only

In [7]:
proc = Processor(
    metadata=meta,
    config={
        "encoder": {
            "workclass": "encoder_onehot",
        },
    },
)

proc.fit(
    data=train_data,
    sequence=["encoder"],
)

preproc_data = proc.transform(data=train_data)
print(f"preproc_data shape: {preproc_data.shape}")
preproc_data.loc[
    :, [col for col in preproc_data.columns if col.startswith("workclass_")]
].head(10)

preproc_data shape: (80, 20)


Unnamed: 0,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov
0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,1.0,0.0
6,0.0,0.0,1.0,0.0,0.0,0.0
7,1.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,1.0


## Processor - Scaler only

In [8]:
proc = Processor(
    metadata=meta,
    config={
        "scaler": {
            "age": "scaler_minmax",
        },
    },
)

proc.fit(
    data=train_data,
    sequence=["scaler"],
)

preproc_data = proc.transform(data=train_data)
print(f"preproc_data shape: {preproc_data.shape}")
preproc_data["age"].head(10)

preproc_data shape: (80, 15)


0    0.145455
1    0.381818
2    0.200000
3    0.018182
4    0.218182
5    0.836364
6    0.127273
7    0.345455
8    0.745455
9    0.054545
Name: age, dtype: float64

# Synthesizer

In [9]:
from petsard.synthesizer import Synthesizer

syn = Synthesizer(
    method="default",
)

syn.create(metadata=meta)
syn_data = syn.fit_sample(data=default_preproc_data)
print(f"syn_data shape: {syn_data.shape}")

syn_data shape: (54, 15)


# Processor (Post-proc)

In [10]:
postproc_data = default_proc.inverse_transform(data=syn_data)
print(f"postproc_data shape: {postproc_data.shape}")

postproc_data.head(6)

postproc_data shape: (54, 15)


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,49,Private,89836,Some-college,11,Married-civ-spouse,Craft-repair,Husband,White,Female,0,0,39,United-States,<=50K
1,25,Private,173195,Assoc-voc,9,Divorced,Exec-managerial,Wife,White,Female,0,0,36,United-States,>50K
2,25,Private,86727,Bachelors,10,Married-civ-spouse,Tech-support,Husband,White,Male,0,0,35,United-States,>50K
3,32,Private,255791,Masters,9,Never-married,Exec-managerial,Not-in-family,White,Female,0,0,37,United-States,<=50K
4,25,Private,76049,Masters,12,Never-married,Prof-specialty,Wife,White,Male,0,0,56,United-States,<=50K
5,18,Federal-gov,296675,Bachelors,10,Divorced,Sales,Not-in-family,White,Female,0,0,37,United-States,<=50K


# Constrainer

In [11]:
from petsard.constrainer import Constrainer

config = {}
cnst = Constrainer(config=config)
cnst_data = cnst.apply(df=postproc_data)
print(f"cnst_data shape: {cnst_data.shape}")

satisfy_data = cnst.resample_until_satisfy(
    data=postproc_data,
    target_rows=postproc_data.shape[0],
    synthesizer=syn,
    postprocessor=default_proc,
    max_trials=300,
    sampling_ratio=10.0,
    verbose_step=10,
)
print(f"satisfy_data shape: {satisfy_data.shape}")

satisfy_data.head(6)

cnst_data shape: (54, 15)
satisfy_data shape: (54, 15)


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,29,Private,106772,Assoc-voc,14,Married-civ-spouse,Tech-support,Husband,Other,Male,0,0,42,United-States,<=50K
1,18,Private,115535,Masters,12,Divorced,Other-service,Not-in-family,White,Male,0,0,48,United-States,<=50K
2,27,Private,217184,9th,13,Married-civ-spouse,Exec-managerial,Own-child,Black,Male,0,0,47,United-States,<=50K
3,46,Private,174174,HS-grad,13,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,41,United-States,<=50K
4,19,?,193169,Some-college,9,Never-married,?,Own-child,White,Male,0,0,39,United-States,<=50K
5,18,Federal-gov,296675,Bachelors,10,Divorced,Sales,Not-in-family,White,Female,0,0,37,United-States,<=50K


## Constrainer - nan_groups

In [12]:
config = {
    "nan_groups": {
        "gender": "delete",  # gender 缺值的紀錄直接刪除
        "age": {  # age 缺值的紀錄把 workclass, education, education-num 欄位設為缺值
            "erase": ["workclass", "education", "educational-num"],
        },
        "marital-status": {
            "copy": "relationship",  # marital-status 缺值的紀錄把 relationship 欄位的值複製過來
        },
    }
}
cnst = Constrainer(config=config)
cnst_data = cnst.apply(df=postproc_data)
print(f"cnst_data shape: {cnst_data.shape}")

satisfy_data = cnst.resample_until_satisfy(
    data=postproc_data,
    target_rows=postproc_data.shape[0],
    synthesizer=syn,
    postprocessor=default_proc,
    max_trials=300,
    sampling_ratio=10.0,
    verbose_step=10,
)
print(f"satisfy_data shape: {satisfy_data.shape}")

satisfy_data.head(6)

cnst_data shape: (54, 15)
satisfy_data shape: (54, 15)


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,29,Private,106772,Assoc-voc,14.0,Married-civ-spouse,Tech-support,Husband,Other,Male,0,0,42,United-States,<=50K
1,18,Private,115535,Masters,12.0,Divorced,Other-service,Not-in-family,White,Male,0,0,48,United-States,<=50K
2,27,Private,217184,9th,13.0,Married-civ-spouse,Exec-managerial,Own-child,Black,Male,0,0,47,United-States,<=50K
3,46,Private,174174,HS-grad,13.0,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,41,United-States,<=50K
4,19,?,193169,Some-college,9.0,Never-married,?,Own-child,White,Male,0,0,39,United-States,<=50K
5,18,Federal-gov,296675,Bachelors,10.0,Divorced,Sales,Not-in-family,White,Female,0,0,37,United-States,<=50K


## Constrainer - field_constraints

In [13]:
config = {
    "field_constraints": [
        "age >= 20 & age <= 70",  # age 欄位的值必須在 20 到 70 歲之間
    ],
}
cnst = Constrainer(config=config)
cnst_data = cnst.apply(df=postproc_data)
print(f"cnst_data shape: {cnst_data.shape}")

satisfy_data = cnst.resample_until_satisfy(
    data=satisfy_data,
    target_rows=postproc_data.shape[0],
    synthesizer=syn,
    postprocessor=default_proc,
    max_trials=300,
    sampling_ratio=10.0,
    verbose_step=10,
)
print(f"satisfy_data shape: {satisfy_data.shape}")

satisfy_data.head(6)

cnst_data shape: (48, 15)
satisfy_data shape: (54, 15)


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,67,Private,161394,10th,10.0,Never-married,Farming-fishing,Own-child,White,Male,0,0,41,United-States,<=50K
1,61,Private,160580,Bachelors,10.0,Divorced,Adm-clerical,Not-in-family,White,Male,0,0,41,United-States,<=50K
2,34,Private,206022,Some-college,10.0,Married-civ-spouse,Handlers-cleaners,Husband,White,Female,0,0,35,Dominican-Republic,<=50K
3,53,Private,403681,Some-college,6.0,Never-married,Other-service,Unmarried,White,Female,0,0,36,United-States,<=50K
4,40,Private,53363,Masters,12.0,Separated,Exec-managerial,Not-in-family,White,Female,0,0,45,United-States,<=50K
5,25,Private,166961,HS-grad,10.0,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,39,United-States,<=50K


## Constrainer - field_combinations

In [14]:
config = {
    "field_combinations": [
        (
            {
                "marital-status": "relationship"
            },  # 指定 'marital-status' 和 'relationship' 欄位之間的關係
            {  # 定義每個 'marital-status' 對應的 'relationship' 欄位值
                "Divorced": ["Not-in-family", "Own-child", "Unmarried"],
                "Married-civ-spouse": ["Husband", "Wife"],
                "Married-spouse-absent": ["Not-in-family"],
                "Separated": ["Not-in-family", "Unmarried"],
                "Never-married": ["Not-in-family", "Own-child", "Unmarried"],
                "Widowed": ["Not-in-family", "Unmarried"],
            },
        )
    ],
}
cnst = Constrainer(config=config)
cnst_data = cnst.apply(df=postproc_data)
print(f"cnst_data shape: {cnst_data.shape}")

satisfy_data = cnst.resample_until_satisfy(
    data=satisfy_data,
    target_rows=postproc_data.shape[0],
    synthesizer=syn,
    postprocessor=default_proc,
    max_trials=300,
    sampling_ratio=10.0,
    verbose_step=10,
)
print(f"satisfy_data shape: {satisfy_data.shape}")

satisfy_data.head(6)

cnst_data shape: (44, 15)
satisfy_data shape: (54, 15)


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,19,Private,221035,HS-grad,13.0,Never-married,Handlers-cleaners,Not-in-family,White,Female,0,0,43,United-States,<=50K
1,67,Private,161394,10th,10.0,Never-married,Farming-fishing,Own-child,White,Male,0,0,41,United-States,<=50K
2,18,Private,211033,Bachelors,10.0,Never-married,Exec-managerial,Unmarried,White,Female,0,0,31,United-States,<=50K
3,28,Local-gov,99762,Assoc-acdm,15.0,Never-married,Protective-serv,Own-child,White,Male,0,0,41,United-States,>50K
4,30,?,33290,HS-grad,13.0,Never-married,?,Not-in-family,White,Male,0,0,43,United-States,<=50K
5,46,Private,322120,11th,8.0,Divorced,Farming-fishing,Not-in-family,White,Male,0,0,38,United-States,>50K


In [15]:
from petsard.metadater import SchemaMetadater

satisfy_data = SchemaMetadater.align(schema=meta, data=satisfy_data)
satisfy_data.head(6)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,19,Private,221035,HS-grad,13,Never-married,Handlers-cleaners,Not-in-family,White,Female,0,0,43,United-States,<=50K
1,67,Private,161394,10th,10,Never-married,Farming-fishing,Own-child,White,Male,0,0,41,United-States,<=50K
2,18,Private,211033,Bachelors,10,Never-married,Exec-managerial,Unmarried,White,Female,0,0,31,United-States,<=50K
3,28,Local-gov,99762,Assoc-acdm,15,Never-married,Protective-serv,Own-child,White,Male,0,0,41,United-States,>50K
4,30,?,33290,HS-grad,13,Never-married,?,Not-in-family,White,Male,0,0,43,United-States,<=50K
5,46,Private,322120,11th,8,Divorced,Farming-fishing,Not-in-family,White,Male,0,0,38,United-States,>50K


# Evaluator

In [16]:
from petsard.evaluator import Evaluator

evaluation = Evaluator(method="default")
evaluation.create()

train_data = SchemaMetadater.align(schema=meta, data=train_data)
test_data = SchemaMetadater.align(schema=meta, data=test_data)

eval_result = evaluation.eval(
    data={
        "ori": train_data,
        "control": test_data,
        "syn": satisfy_data,
    }
)

print(eval_result["global"].head(1))
print(eval_result["columnwise"].head(1))
print(eval_result["pairwise"].head(1))

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 15/15 [00:00<00:00, 2332.85it/s]|
Column Shapes Score: 87.57%

(2/2) Evaluating Column Pair Trends: |          | 0/105 [00:00<?, ?it/s]|

(2/2) Evaluating Column Pair Trends: |██████████| 105/105 [00:00<00:00, 844.43it/s]|
Column Pair Trends Score: 64.0%

Overall Score (Average): 75.78%

        Score  Column Shapes  Column Pair Trends
result   0.76           0.88                0.64
          Property        Metric     Score
age  Column Shapes  KSComplement  0.922685
                         Property                 Metric     Score  \
age workclass  Column Pair Trends  ContingencySimilarity  0.574074   

               Real Correlation  Synthetic Correlation Error  
age workclass               NaN                    NaN  None  


## Evaluator - Singling-out

In [17]:
evaluation = Evaluator(
    method="anonymeter-singlingout",
    n_attacks=3,
)
evaluation.create()

eval_result = evaluation.eval(
    data={
        "ori": train_data,
        "control": test_data,
        "syn": satisfy_data,
    }
)

print(eval_result["global"].head(1))
print(eval_result["details"])

Found 3 failed queries out of 3. Check DEBUG messages for more details.


        risk  risk_CI_btm  risk_CI_top  attack_rate  attack_rate_err  \
result   0.2          0.0          0.8         0.43             0.37   

        baseline_rate  baseline_rate_err  control_rate  control_rate_err  
result           0.28               0.28          0.28              0.28  
{'attack_queries': ["education== '11th' & race== 'White' & workclass== 'Self-emp-not-inc'"], 'baseline_queries': []}


## Evaluator - Linkability

In [18]:
evaluation = Evaluator(
    method="anonymeter-linkability",
    n_attacks=3,
    aux_cols=[
        ["age", "marital-status", "relationship", "race", "gender"],
        ["workclass", "education", "educational-num", "occupation"],
    ],
)
evaluation.create()

eval_result = evaluation.eval(
    data={
        "ori": train_data,
        "control": test_data,
        "syn": satisfy_data,
    }
)

print(eval_result["global"].head(1))
print(eval_result["details"])

        risk  risk_CI_btm  risk_CI_top  attack_rate  attack_rate_err  \
result   0.0          0.0         0.55         0.28             0.28   

        baseline_rate  baseline_rate_err  control_rate  control_rate_err  
result           0.28               0.28          0.28              0.28  
{'attack_links': {}, 'baseline_links': {}, 'control_links': {}}


  self._sanity_check()


## Evaluator - Inference

In [19]:
evaluation = Evaluator(
    method="anonymeter-inference",
    n_attacks=3,
    secret="income",
)
evaluation.create()

eval_result = evaluation.eval(
    data={
        "ori": train_data,
        "control": test_data,
        "syn": satisfy_data,
    }
)

print(eval_result["global"].head(1))
print(eval_result["details"])

        risk  risk_CI_btm  risk_CI_top  attack_rate  attack_rate_err  \
result  0.34          0.0          1.0         0.72             0.28   

        baseline_rate  baseline_rate_err  control_rate  control_rate_err  
result           0.72               0.28          0.57              0.37  
None


  self._sanity_check()


## Evaluator - Diagnostic

In [20]:
eval_result

{'global':         risk  risk_CI_btm  risk_CI_top  attack_rate  attack_rate_err  \
 result  0.34          0.0          1.0         0.72             0.28   
 
         baseline_rate  baseline_rate_err  control_rate  control_rate_err  
 result           0.72               0.28          0.57              0.37  ,
 'details': None}

In [21]:
evaluation = Evaluator(method="sdmetrics-diagnosticreport")
evaluation.create()

eval_result = evaluation.eval(
    data={
        "ori": train_data,
        "control": test_data,
        "syn": satisfy_data,
    }
)

print(eval_result["global"].head(1))
print(eval_result["columnwise"].head(1))
# print(eval_result["pairwise"].head(1)) - no pairwise

Generating report ...

(1/2) Evaluating Data Validity: |██████████| 15/15 [00:00<00:00, 6757.01it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 1559.80it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%

        Score  Data Validity  Data Structure
result    1.0            1.0             1.0
          Property             Metric  Score
age  Data Validity  BoundaryAdherence    1.0


## Evaluator - Quality

In [22]:
evaluation = Evaluator(method="sdmetrics-qualityreport")
evaluation.create()

eval_result = evaluation.eval(
    data={
        "ori": train_data,
        "control": test_data,
        "syn": satisfy_data,
    }
)

print(eval_result["global"].head(1))
print(eval_result["columnwise"].head(1))
print(eval_result["pairwise"].head(1))

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 15/15 [00:00<00:00, 4150.58it/s]|
Column Shapes Score: 87.57%

(2/2) Evaluating Column Pair Trends: |██████████| 105/105 [00:00<00:00, 824.42it/s]|
Column Pair Trends Score: 64.0%

Overall Score (Average): 75.78%

        Score  Column Shapes  Column Pair Trends
result   0.76           0.88                0.64
          Property        Metric     Score
age  Column Shapes  KSComplement  0.922685
                         Property                 Metric     Score  \
age workclass  Column Pair Trends  ContingencySimilarity  0.574074   

               Real Correlation  Synthetic Correlation Error  
age workclass               NaN                    NaN  None  


In [23]:
evaluation = Evaluator(
    method="mlutility-classification",
    target="income",
)
evaluation.create()

eval_result = evaluation.eval(
    data={
        "ori": train_data,
        "control": test_data,
        "syn": satisfy_data,
    }
)

print(eval_result["global"].head(1))

The cardinality of the column workclass is too high. Ori: Over row numbers 80, column cardinality 7. Syn: Over row numbers 54, column cardinality 6. The column workclass is removed.
The cardinality of the column education is too high. Ori: Over row numbers 80, column cardinality 13. Syn: Over row numbers 54, column cardinality 10. The column education is removed.
The cardinality of the column marital-status is too high. Ori: Over row numbers 80, column cardinality 6. Syn: Over row numbers 54, column cardinality 5. The column marital-status is removed.
The cardinality of the column occupation is too high. Ori: Over row numbers 80, column cardinality 14. Syn: Over row numbers 54, column cardinality 13. The column occupation is removed.


   ori_mean  ori_std  syn_mean  syn_std  diff
0      0.78     0.11      0.65      0.0 -0.13


## Evaluator - Classification

# Describer

In [24]:
from petsard.evaluator import Describer


desc = Describer(method="default")
desc.create()

desc_result = desc.eval(
    data={
        "data": satisfy_data,
    }
)

print(desc_result["global"].head(1))
print(desc_result["columnwise"].head(1))
print(desc_result["pairwise"].head(1))

   row_count  col_count  na_count
0         54         15         0
      mean  median   std   min   max  kurtosis  skew    q1     q3  na_count  \
age  37.43    34.0  15.6  17.0  69.0     -0.99  0.48  25.0  48.75         0   

    nunique  
age    <NA>  
  column1 column2  corr
0     age     age   1.0


# Reporter

## Reporter - Save Data

In [25]:
from petsard.reporter import Reporter

rpt = Reporter(
    method="save_data",
    source="Postprocessor",
)
rpt.create({("Postprocessor", "exp1"): satisfy_data})
rpt.report()

{}

## Reporter - Save Report

In [26]:
rpt = Reporter(
    method="save_report",
    granularity="global",
)
rpt.create(
    {
        ("Evaluator", "eval1_[global]"): desc_result["global"],
    }
)
rpt.report()

{}