# Loader - Benchmark

In [1]:
from petsard.loader import Loader

# Load the "adult-income" dataset through the `benchmark://` filepath scheme.
load = Loader(filepath="benchmark://adult-income")
# load() returns the data together with its metadata (a pandas DataFrame and a
# Schema object in the recorded output).
data, meta = load.load()
print(f"data: {type(data)}, meta: {type(meta)}")

data: <class 'pandas.core.frame.DataFrame'>, meta: <class 'petsard.metadater.metadata.Schema'>


# Loader - Load from CSV

In [2]:
# Load the dataset from a local CSV path instead of the `benchmark://` scheme.
# NOTE(review): presumably this relative path is the copy fetched by the
# benchmark loader in the previous cell — confirm it exists on a fresh checkout.
load = Loader(filepath="benchmark/adult-income.csv")
# Rebinds `data` and `meta` from the previous cell.
data, meta = load.load()
print(f"data: {type(data)}, meta: {type(meta)}")

data: <class 'pandas.core.frame.DataFrame'>, meta: <class 'petsard.metadater.metadata.Schema'>


# Simplify: keep only the first 100 rows

In [3]:
# Keep only the rows labelled 0..99 so the rest of the demo runs quickly.
# `.loc` slicing is label-based and includes the end label, hence 100 rows.
# This rebinds `data`, so the full frame is no longer available afterwards.
data = data.loc[0:99, :]
data.shape

(100, 15)

# Splitter

In [4]:
from petsard.loader import Splitter

# Split the data; train_split_ratio=0.8 yields 80 training and 20 validation
# rows in the recorded output.
# NOTE(review): num_samples=5 presumably requests five independent splits,
# of which only the first is used below — confirm.
split = Splitter(num_samples=5, train_split_ratio=0.8)
split_data, split_meta, exist_index = split.split(data=data, metadata=meta)

# Only the first split (key 1) is used for the rest of the notebook.
first_split = split_data[1]
train_data = first_split["train"]  # training set of the first split
test_data = first_split["validation"]  # held-out (validation) set of the first split

print(
    f"train_data: {type(train_data)}, test_data: {type(test_data)}\n"
    f"train_data shape: {train_data.shape}, test_data shape: {test_data.shape}"
)

train_data: <class 'pandas.core.frame.DataFrame'>, test_data: <class 'pandas.core.frame.DataFrame'>
train_data shape: (80, 15), test_data shape: (20, 15)


# Processor (Pre-proc)

In [5]:
from petsard.processor import Processor

# Processor constructed from the metadata alone, with no explicit `config`.
default_proc = Processor(
    metadata=meta,
)

# Fit on the training split only.
default_proc.fit(data=train_data)

# NOTE(review): the recorded output shows 53 of the 80 training rows surviving;
# presumably the default pipeline drops outlier rows — confirm.
default_preproc_data = default_proc.transform(data=train_data)
print(f"default_preproc_data shape: {default_preproc_data.shape}")

default_preproc_data shape: (53, 15)


## Processor - Missing only

In [6]:
# Configure only the missing-value handler for `age` (`missing_mean`).
missing_only_config = {"missing": {"age": "missing_mean"}}

proc = Processor(metadata=meta, config=missing_only_config)
# `sequence` restricts fitting to the "missing" step.
proc.fit(data=train_data, sequence=["missing"])

preproc_data = proc.transform(data=train_data)
print(f"preproc_data shape: {preproc_data.shape}")
preproc_data["age"].head(10)

preproc_data shape: (80, 15)


0    28
1    44
2    18
3    34
4    63
5    24
6    55
7    65
8    36
9    26
Name: age, dtype: int64

## Processor - Outlier only

In [7]:
# Configure only the outlier handler for `age` (`outlier_zscore`).
outlier_only_config = {"outlier": {"age": "outlier_zscore"}}

proc = Processor(metadata=meta, config=outlier_only_config)
# `sequence` restricts fitting to the "outlier" step.
proc.fit(data=train_data, sequence=["outlier"])

# In the recorded output the row count drops from 80 to 53 after this step.
preproc_data = proc.transform(data=train_data)
print(f"preproc_data shape: {preproc_data.shape}")
preproc_data["age"].head(10)

preproc_data shape: (53, 15)


0    28
1    18
2    34
3    24
4    36
5    26
6    58
7    43
8    43
9    34
Name: age, dtype: int64

## Processor - Encoder only

In [8]:
# Configure only the encoder for `workclass` (`encoder_onehot`).
encoder_only_config = {"encoder": {"workclass": "encoder_onehot"}}

proc = Processor(metadata=meta, config=encoder_only_config)
# `sequence` restricts fitting to the "encoder" step.
proc.fit(data=train_data, sequence=["encoder"])

preproc_data = proc.transform(data=train_data)
print(f"preproc_data shape: {preproc_data.shape}")

# Show only the columns generated for `workclass` (prefixed "workclass_").
onehot_columns = [col for col in preproc_data.columns if col.startswith("workclass_")]
preproc_data.loc[:, onehot_columns].head(10)

preproc_data shape: (80, 20)


Unnamed: 0,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov
0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0
5,0.0,0.0,1.0,0.0,0.0,0.0
6,0.0,0.0,1.0,0.0,0.0,0.0
7,0.0,0.0,1.0,0.0,0.0,0.0
8,1.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,1.0,0.0,0.0,0.0


## Processor - Scaler only

In [9]:
# Configure only the scaler for `age` (`scaler_minmax`).
scaler_only_config = {"scaler": {"age": "scaler_minmax"}}

proc = Processor(metadata=meta, config=scaler_only_config)
# `sequence` restricts fitting to the "scaler" step.
proc.fit(data=train_data, sequence=["scaler"])

# The recorded `age` values after this step are floats between 0 and 1.
preproc_data = proc.transform(data=train_data)
print(f"preproc_data shape: {preproc_data.shape}")
preproc_data["age"].head(10)

preproc_data shape: (80, 15)


0    0.211538
1    0.519231
2    0.019231
3    0.326923
4    0.884615
5    0.134615
6    0.730769
7    0.923077
8    0.365385
9    0.173077
Name: age, dtype: float64

# Synthesizer

In [10]:
from petsard.synthesizer import Synthesizer

# Use the library's "default" synthesis method.
syn = Synthesizer(
    method="default",
)

syn.create(metadata=meta)
# Train on the output of the default preprocessing pipeline and draw synthetic rows.
# NOTE(review): no random seed is set in this notebook, so the recorded numbers
# may not reproduce exactly on re-run — confirm.
syn_data = syn.fit_sample(data=default_preproc_data)
print(f"syn_data shape: {syn_data.shape}")

syn_data shape: (53, 15)


# Processor (Post-proc)

In [11]:
# Map the synthetic data back with the same fitted processor (`default_proc`)
# that produced the synthesizer's training data.
postproc_data = default_proc.inverse_transform(data=syn_data)
print(f"postproc_data shape: {postproc_data.shape}")

postproc_data.head(6)

postproc_data shape: (53, 15)


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,54,Private,160109,Some-college,10,Never-married,Machine-op-inspct,Not-in-family,White,Female,0,0,46,United-States,<=50K
1,36,Private,272367,Assoc-voc,7,Married-civ-spouse,Prof-specialty,Own-child,White,Female,0,0,39,United-States,<=50K
2,45,Private,343417,Some-college,9,Married-civ-spouse,?,Husband,White,Female,0,0,44,United-States,>50K
3,33,Local-gov,182806,HS-grad,7,Divorced,Exec-managerial,Not-in-family,White,Male,0,0,36,United-States,<=50K
4,27,Private,220070,Assoc-voc,13,Married-civ-spouse,Prof-specialty,Husband,White,Female,0,0,46,United-States,<=50K
5,27,Private,316424,Some-college,7,Never-married,Adm-clerical,Not-in-family,Amer-Indian-Eskimo,Male,0,0,33,United-States,<=50K


# Constrainer

In [12]:
from petsard.constrainer import Constrainer

# Baseline: an empty config. In the recorded output all 53 rows pass.
config = {}
cnst = Constrainer(config=config)
cnst_data = cnst.apply(df=postproc_data)
print(f"cnst_data shape: {cnst_data.shape}")

# Settings for the resampling call, gathered in one place.
resample_options = {
    "target_rows": postproc_data.shape[0],
    "synthesizer": syn,
    "postprocessor": default_proc,
    "max_trials": 300,
    "sampling_ratio": 10.0,
    "verbose_step": 10,
}
satisfy_data = cnst.resample_until_satisfy(data=postproc_data, **resample_options)
print(f"satisfy_data shape: {satisfy_data.shape}")

satisfy_data.head(6)

cnst_data shape: (53, 15)
satisfy_data shape: (53, 15)


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,55,Self-emp-inc,290469,HS-grad,13,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,42,United-States,<=50K
1,51,Private,216929,HS-grad,13,Never-married,Exec-managerial,Not-in-family,White,Female,0,0,48,United-States,<=50K
2,54,Federal-gov,266995,Some-college,13,Separated,Craft-repair,Not-in-family,White,Male,0,0,39,United-States,<=50K
3,58,Private,146279,HS-grad,12,Never-married,Transport-moving,Not-in-family,White,Female,0,0,40,United-States,<=50K
4,20,Private,122081,Some-college,12,Married-civ-spouse,Handlers-cleaners,Husband,White,Female,0,0,37,United-States,<=50K
5,27,Private,316424,Some-college,7,Never-married,Adm-clerical,Not-in-family,Amer-Indian-Eskimo,Male,0,0,33,United-States,<=50K


## Constrainer - nan_groups

In [13]:
# Rules describing how records with missing values in a given field are handled.
config = {
    "nan_groups": {
        "gender": "delete",  # records with a missing gender are deleted outright
        "age": {  # for records with a missing age, set workclass, education and educational-num to missing
            "erase": ["workclass", "education", "educational-num"],
        },
        "marital-status": {
            "copy": "relationship",  # for records with a missing marital-status, copy over the value of the relationship field
        },
    }
}
cnst = Constrainer(config=config)
cnst_data = cnst.apply(df=postproc_data)
print(f"cnst_data shape: {cnst_data.shape}")

# Resample until `target_rows` constraint-satisfying rows are available.
# In the recorded output educational-num is displayed as float (e.g. 13.0)
# after this step.
satisfy_data = cnst.resample_until_satisfy(
    data=postproc_data,
    target_rows=postproc_data.shape[0],
    synthesizer=syn,
    postprocessor=default_proc,
    max_trials=300,
    sampling_ratio=10.0,
    verbose_step=10,
)
print(f"satisfy_data shape: {satisfy_data.shape}")

satisfy_data.head(6)

cnst_data shape: (53, 15)
satisfy_data shape: (53, 15)


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,55,Self-emp-inc,290469,HS-grad,13.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,42,United-States,<=50K
1,51,Private,216929,HS-grad,13.0,Never-married,Exec-managerial,Not-in-family,White,Female,0,0,48,United-States,<=50K
2,54,Federal-gov,266995,Some-college,13.0,Separated,Craft-repair,Not-in-family,White,Male,0,0,39,United-States,<=50K
3,58,Private,146279,HS-grad,12.0,Never-married,Transport-moving,Not-in-family,White,Female,0,0,40,United-States,<=50K
4,20,Private,122081,Some-college,12.0,Married-civ-spouse,Handlers-cleaners,Husband,White,Female,0,0,37,United-States,<=50K
5,27,Private,316424,Some-college,7.0,Never-married,Adm-clerical,Not-in-family,Amer-Indian-Eskimo,Male,0,0,33,United-States,<=50K


## Constrainer - field_constraints

In [14]:
# Row-level constraint: age must be between 20 and 70 inclusive.
config = {"field_constraints": ["age >= 20 & age <= 70"]}
cnst = Constrainer(config=config)
cnst_data = cnst.apply(df=postproc_data)
print(f"cnst_data shape: {cnst_data.shape}")

# Settings for the resampling call, gathered in one place.
resample_options = {
    "target_rows": postproc_data.shape[0],
    "synthesizer": syn,
    "postprocessor": default_proc,
    "max_trials": 300,
    "sampling_ratio": 10.0,
    "verbose_step": 10,
}
# NOTE(review): resampling starts from `satisfy_data` (the previous cell's
# result) rather than `postproc_data`, so this cell depends on the previous
# constrainer cell having run and is not idempotent — confirm the chaining
# is intended.
satisfy_data = cnst.resample_until_satisfy(data=satisfy_data, **resample_options)
print(f"satisfy_data shape: {satisfy_data.shape}")

satisfy_data.head(6)

cnst_data shape: (50, 15)
satisfy_data shape: (53, 15)


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,50,Federal-gov,72906,Bachelors,10.0,Never-married,Armed-Forces,Not-in-family,White,Male,0,0,43,United-States,<=50K
1,35,Private,92657,HS-grad,6.0,Separated,Machine-op-inspct,Not-in-family,White,Male,0,0,34,Mexico,<=50K
2,48,Private,221829,Some-college,13.0,Married-civ-spouse,?,Unmarried,White,Male,0,0,45,United-States,>50K
3,27,?,254709,Some-college,12.0,Married-civ-spouse,Adm-clerical,Unmarried,White,Male,0,0,34,United-States,<=50K
4,20,Private,150589,Some-college,8.0,Married-civ-spouse,Machine-op-inspct,Not-in-family,White,Male,0,0,43,United-States,<=50K
5,37,Private,222354,HS-grad,13.0,Never-married,Handlers-cleaners,Not-in-family,White,Male,0,0,37,United-States,<=50K


## Constrainer - field_combinations

In [15]:
# Restrict which `relationship` values may appear with each `marital-status`.
config = {
    "field_combinations": [
        (
            {
                "marital-status": "relationship"
            },  # declares the relationship between the 'marital-status' and 'relationship' fields
            {  # allowed 'relationship' values for each 'marital-status' value
                "Divorced": ["Not-in-family", "Own-child", "Unmarried"],
                "Married-civ-spouse": ["Husband", "Wife"],
                "Married-spouse-absent": ["Not-in-family"],
                "Separated": ["Not-in-family", "Unmarried"],
                "Never-married": ["Not-in-family", "Own-child", "Unmarried"],
                "Widowed": ["Not-in-family", "Unmarried"],
            },
        )
    ],
}
cnst = Constrainer(config=config)
cnst_data = cnst.apply(df=postproc_data)
print(f"cnst_data shape: {cnst_data.shape}")

# NOTE(review): resampling starts from `satisfy_data` (the previous cell's
# result) rather than `postproc_data`, so this cell depends on the previous
# constrainer cell having run and is not idempotent — confirm the chaining
# is intended.
satisfy_data = cnst.resample_until_satisfy(
    data=satisfy_data,
    target_rows=postproc_data.shape[0],
    synthesizer=syn,
    postprocessor=default_proc,
    max_trials=300,
    sampling_ratio=10.0,
    verbose_step=10,
)
print(f"satisfy_data shape: {satisfy_data.shape}")

satisfy_data.head(6)

cnst_data shape: (28, 15)
satisfy_data shape: (53, 15)


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,50,Federal-gov,72906,Bachelors,10.0,Never-married,Armed-Forces,Not-in-family,White,Male,0,0,43,United-States,<=50K
1,31,Private,263822,10th,7.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,41,United-States,>50K
2,36,Federal-gov,217338,Bachelors,12.0,Separated,Craft-repair,Not-in-family,White,Male,0,0,42,United-States,<=50K
3,55,Private,183171,Assoc-voc,9.0,Married-civ-spouse,Craft-repair,Husband,White,Female,0,0,33,United-States,>50K
4,58,Private,119388,Some-college,10.0,Never-married,Machine-op-inspct,Not-in-family,White,Female,0,0,32,United-States,<=50K
5,49,Private,102030,HS-grad,12.0,Married-civ-spouse,Transport-moving,Husband,White,Female,0,0,45,United-States,<=50K


In [16]:
from petsard.metadater import SchemaMetadater

# Align the constrained data to the schema held in `meta`.
# In the recorded output this turns educational-num back from float (e.g. 10.0)
# into integer (10).
satisfy_data = SchemaMetadater.align(schema=meta, data=satisfy_data)
satisfy_data.head(6)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,50,Federal-gov,72906,Bachelors,10,Never-married,Armed-Forces,Not-in-family,White,Male,0,0,43,United-States,<=50K
1,31,Private,263822,10th,7,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,41,United-States,>50K
2,36,Federal-gov,217338,Bachelors,12,Separated,Craft-repair,Not-in-family,White,Male,0,0,42,United-States,<=50K
3,55,Private,183171,Assoc-voc,9,Married-civ-spouse,Craft-repair,Husband,White,Female,0,0,33,United-States,>50K
4,58,Private,119388,Some-college,10,Never-married,Machine-op-inspct,Not-in-family,White,Female,0,0,32,United-States,<=50K
5,49,Private,102030,HS-grad,12,Married-civ-spouse,Transport-moving,Husband,White,Female,0,0,45,United-States,<=50K


# Evaluator

In [17]:
from petsard.evaluator import Evaluator

# Evaluate with the "default" method; the recorded output is a report with
# Column Shapes and Column Pair Trends scores.
evaluation = Evaluator(method="default")
evaluation.create()

# Align the original splits to the schema too, as was done for satisfy_data.
# These rebind `train_data` and `test_data` for all later cells.
train_data = SchemaMetadater.align(schema=meta, data=train_data)
test_data = SchemaMetadater.align(schema=meta, data=test_data)

# "ori" is the training split, "control" the held-out split and "syn" the
# synthetic data.
eval_result = evaluation.eval(
    data={
        "ori": train_data,
        "control": test_data,
        "syn": satisfy_data,
    }
)

# Print only the first row of each result granularity.
print(eval_result["global"].head(1))
print(eval_result["columnwise"].head(1))
print(eval_result["pairwise"].head(1))

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 15/15 [00:00<00:00, 3461.98it/s]|
Column Shapes Score: 87.12%

(2/2) Evaluating Column Pair Trends: |          | 0/105 [00:00<?, ?it/s]|

(2/2) Evaluating Column Pair Trends: |██████████| 105/105 [00:00<00:00, 697.84it/s]|
Column Pair Trends Score: 61.06%

Overall Score (Average): 74.09%

        Score  Column Shapes  Column Pair Trends
result   0.74           0.87                0.61
          Property        Metric     Score
age  Column Shapes  KSComplement  0.828774
                         Property                 Metric     Score  \
age workclass  Column Pair Trends  ContingencySimilarity  0.652123   

               Real Correlation  Synthetic Correlation Error  
age workclass               NaN                    NaN  None  


## Evaluator - Singling-out

In [18]:
# Singling-out privacy-risk evaluation with 3 attacks.
# NOTE(review): the recorded run reports "Found 3 failed queries out of 3",
# so the risk figures below carry little information; presumably the data or
# n_attacks is too small for this demo — confirm.
evaluation = Evaluator(method="anonymeter-singlingout", n_attacks=3)
evaluation.create()

# "ori" is the training split, "control" the held-out split and "syn" the
# synthetic data.
eval_inputs = {"ori": train_data, "control": test_data, "syn": satisfy_data}
eval_result = evaluation.eval(data=eval_inputs)

print(eval_result["global"].head(1))
print(eval_result["details"])

Found 3 failed queries out of 3. Check DEBUG messages for more details.


        risk  risk_CI_btm  risk_CI_top  attack_rate  attack_rate_err  \
result   0.0          0.0         0.55         0.28             0.28   

        baseline_rate  baseline_rate_err  control_rate  control_rate_err  
result           0.28               0.28          0.28              0.28  
{'attack_queries': [], 'baseline_queries': []}


  self._sanity_check()


## Evaluator - Linkability

In [19]:
# Linkability privacy-risk evaluation with 3 attacks.
# `aux_cols` holds two groups of column names.
# NOTE(review): presumably these are the two column sets the attack tries to
# link — confirm.
linkability_aux_cols = [
    ["age", "marital-status", "relationship", "race", "gender"],
    ["workclass", "education", "educational-num", "occupation"],
]
evaluation = Evaluator(
    method="anonymeter-linkability",
    n_attacks=3,
    aux_cols=linkability_aux_cols,
)
evaluation.create()

# "ori" is the training split, "control" the held-out split and "syn" the
# synthetic data.
eval_inputs = {"ori": train_data, "control": test_data, "syn": satisfy_data}
eval_result = evaluation.eval(data=eval_inputs)

print(eval_result["global"].head(1))
print(eval_result["details"])

        risk  risk_CI_btm  risk_CI_top  attack_rate  attack_rate_err  \
result   0.0          0.0         0.55         0.28             0.28   

        baseline_rate  baseline_rate_err  control_rate  control_rate_err  
result           0.28               0.28          0.28              0.28  
{'attack_links': {}, 'baseline_links': {}, 'control_links': {}}


  self._sanity_check()


## Evaluator - Inference

In [20]:
# Inference privacy-risk evaluation with 3 attacks; `income` is passed as the
# secret column.
evaluation = Evaluator(method="anonymeter-inference", n_attacks=3, secret="income")
evaluation.create()

# "ori" is the training split, "control" the held-out split and "syn" the
# synthetic data.
eval_inputs = {"ori": train_data, "control": test_data, "syn": satisfy_data}
eval_result = evaluation.eval(data=eval_inputs)

print(eval_result["global"].head(1))
# In the recorded output "details" is None for this method.
print(eval_result["details"])

        risk  risk_CI_btm  risk_CI_top  attack_rate  attack_rate_err  \
result   0.0          0.0          1.0         0.72             0.28   

        baseline_rate  baseline_rate_err  control_rate  control_rate_err  
result           0.57               0.37          0.72              0.28  
None


## Evaluator - Diagnostic

In [21]:
# Display the whole result dict still bound from the previous cell.
# NOTE(review): this cell sits under the "Evaluator - Diagnostic" heading but
# shows the inference result from the cell above — confirm the placement.
eval_result

{'global':         risk  risk_CI_btm  risk_CI_top  attack_rate  attack_rate_err  \
 result   0.0          0.0          1.0         0.72             0.28   
 
         baseline_rate  baseline_rate_err  control_rate  control_rate_err  
 result           0.57               0.37          0.72              0.28  ,
 'details': None}

In [22]:
# Diagnostic report; the recorded output has Data Validity and Data Structure
# scores.
evaluation = Evaluator(method="sdmetrics-diagnosticreport")
evaluation.create()

# "ori" is the training split, "control" the held-out split and "syn" the
# synthetic data.
eval_result = evaluation.eval(
    data={
        "ori": train_data,
        "control": test_data,
        "syn": satisfy_data,
    }
)

print(eval_result["global"].head(1))
print(eval_result["columnwise"].head(1))
# The diagnostic report has no "pairwise" result, so none is printed here.

Generating report ...

(1/2) Evaluating Data Validity: |██████████| 15/15 [00:00<00:00, 6456.09it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 1282.27it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%

        Score  Data Validity  Data Structure
result    1.0            1.0             1.0
          Property             Metric  Score
age  Data Validity  BoundaryAdherence    1.0


## Evaluator - Quality

In [23]:
# Quality report; the recorded output has Column Shapes and Column Pair Trends
# scores.
evaluation = Evaluator(method="sdmetrics-qualityreport")
evaluation.create()

# "ori" is the training split, "control" the held-out split and "syn" the
# synthetic data.
eval_inputs = {"ori": train_data, "control": test_data, "syn": satisfy_data}
eval_result = evaluation.eval(data=eval_inputs)

# Print only the first row of each result granularity.
print(eval_result["global"].head(1))
print(eval_result["columnwise"].head(1))
print(eval_result["pairwise"].head(1))

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 15/15 [00:00<00:00, 2867.97it/s]|
Column Shapes Score: 87.12%

(2/2) Evaluating Column Pair Trends: |██████████| 105/105 [00:00<00:00, 958.20it/s]|
Column Pair Trends Score: 61.06%

Overall Score (Average): 74.09%

        Score  Column Shapes  Column Pair Trends
result   0.74           0.87                0.61
          Property        Metric     Score
age  Column Shapes  KSComplement  0.828774
                         Property                 Metric     Score  \
age workclass  Column Pair Trends  ContingencySimilarity  0.652123   

               Real Correlation  Synthetic Correlation Error  
age workclass               NaN                    NaN  None  


In [24]:
# ML-utility evaluation for a classification task with `income` as the target.
# The "Evaluator - Classification" heading that follows this cell refers to it.
evaluation = Evaluator(method="mlutility-classification", target="income")
evaluation.create()

# "ori" is the training split, "control" the held-out split and "syn" the
# synthetic data.
eval_inputs = {"ori": train_data, "control": test_data, "syn": satisfy_data}
eval_result = evaluation.eval(data=eval_inputs)

# The recorded output compares scores for the original and synthetic data
# (ori_mean / syn_mean / diff).
print(eval_result["global"].head(1))

The cardinality of the column workclass is too high. Ori: Over row numbers 80, column cardinality 7. Syn: Over row numbers 53, column cardinality 6. The column workclass is removed.
The cardinality of the column education is too high. Ori: Over row numbers 80, column cardinality 13. Syn: Over row numbers 53, column cardinality 7. The column education is removed.
The cardinality of the column occupation is too high. Ori: Over row numbers 80, column cardinality 15. Syn: Over row numbers 53, column cardinality 12. The column occupation is removed.
The cardinality of the column relationship is too high. Ori: Over row numbers 80, column cardinality 5. Syn: Over row numbers 53, column cardinality 5. The column relationship is removed.


   ori_mean  ori_std  syn_mean  syn_std  diff
0      0.95     0.04      0.81     0.06 -0.14


## Evaluator - Classification

# Describer

In [25]:
from petsard.evaluator import Describer


# Describe the final synthetic data with the "default" method.
desc = Describer(method="default")
desc.create()

# Unlike the Evaluator cells, only one dataset is passed, under the "data" key.
desc_result = desc.eval(
    data={
        "data": satisfy_data,
    }
)

# Print only the first row of each result granularity.
print(desc_result["global"].head(1))
print(desc_result["columnwise"].head(1))
print(desc_result["pairwise"].head(1))

   row_count  col_count  na_count
0         53         15         0
      mean  median   std   min   max  kurtosis  skew    q1    q3  na_count  \
age  41.62    41.0  15.2  17.0  69.0     -1.21  0.14  30.0  55.0         0   

    nunique  
age    <NA>  
  column1 column2  corr
0     age     age   1.0


# Reporter

## Reporter - Save Data

In [26]:
from petsard.reporter import Reporter

# Reporter in "save_data" mode, with "Postprocessor" as the source.
rpt = Reporter(
    method="save_data",
    source="Postprocessor",
)
# The constrained and schema-aligned data is registered under the key
# ("Postprocessor", "exp1").
# NOTE(review): presumably the key is (module name, experiment name) and must
# match `source` — confirm.
rpt.create({("Postprocessor", "exp1"): satisfy_data})
rpt.report()

{}

## Reporter - Save Report

In [27]:
# Reporter in "save_report" mode at the "global" granularity.
rpt = Reporter(
    method="save_report",
    granularity="global",
)
# NOTE(review): the key names the "Evaluator" module, but the value is the
# Describer's global result (`desc_result`), not `eval_result` — confirm this
# is intended.
rpt.create(
    {
        ("Evaluator", "eval1_[global]"): desc_result["global"],
    }
)
rpt.report()

{}