## Loader - Benchmark

In [1]:
from petsard.loader import Loader

# Load the "adult-income" dataset through the benchmark:// scheme.
load = Loader(filepath="benchmark://adult-income")
# load() returns the data together with its metadata (types are printed below).
data, meta = load.load()
print(f"data: {type(data)}, meta: {type(meta)}")

data: <class 'pandas.core.frame.DataFrame'>, meta: <class 'petsard.metadater.metadata.Schema'>


## Loader - Load from CSV

In [2]:
# Load adult-income from a relative CSV path; this rebinds `load`, `data` and `meta`.
# NOTE(review): presumably this is the file left on disk by the benchmark:// load — confirm.
load = Loader(filepath="benchmark/adult-income.csv")
data, meta = load.load()
print(f"data: {type(data)}, meta: {type(meta)}")

data: <class 'pandas.core.frame.DataFrame'>, meta: <class 'petsard.metadater.metadata.Schema'>


# Simplify: keep only the first 100 rows

In [3]:
# Keep index labels 0 through 99 (.loc slicing includes both end labels) and all columns.
# NOTE: this rebinds `data`, so the full frame is no longer reachable under that name.
data = data.loc[0:99, :]
data.shape

(100, 15)

# Splitter

In [4]:
from petsard.loader import Splitter

# Request 5 split samples with a train ratio of 0.8 on the 100-row frame.
split = Splitter(num_samples=5, train_split_ratio=0.8)
split_data, split_meta, exist_index = split.split(data=data, metadata=meta)

# Split results are looked up by sample number; key 1 is the first split.
train_data = split_data[1]["train"]  # training set of the first split
test_data = split_data[1]["validation"]  # test set of the first split (stored under "validation")

print(f"train_data: {type(train_data)}, test_data: {type(test_data)}")
print(f"train_data shape: {train_data.shape}, test_data shape: {test_data.shape}")

train_data: <class 'pandas.core.frame.DataFrame'>, test_data: <class 'pandas.core.frame.DataFrame'>
train_data shape: (80, 15), test_data shape: (20, 15)


## Splitter - custom_data

In [5]:
# "custom_data" method: the two partitions are supplied as separate sources
# ("ori" and "control") instead of being split from a single frame.
split = Splitter(
    method="custom_data",
    filepath={
        "ori": "benchmark://adult-income_ori",
        "control": "benchmark://adult-income_control",
    },
)
# No data/metadata arguments are passed here; the inputs come from `filepath` above.
split_data, split_meta, exist_index = split.split()

# NOTE: this rebinds train_data/test_data; the following cells use these full-size frames.
train_data = split_data[1]["train"]  # training set of the first split
test_data = split_data[1]["validation"]  # test set of the first split (stored under "validation")

print(f"train_data: {type(train_data)}, test_data: {type(test_data)}")
print(f"train_data shape: {train_data.shape}, test_data shape: {test_data.shape}")

train_data: <class 'pandas.core.frame.DataFrame'>, test_data: <class 'pandas.core.frame.DataFrame'>
train_data shape: (39073, 15), test_data shape: (9769, 15)


# Processor (Pre-proc)

In [6]:
from petsard.processor import Processor

# Default pre-processing: only the metadata is supplied, no custom config.
default_proc = Processor(metadata=meta)
default_proc.fit(data=train_data)

# Transform the same frame the processor was fitted on.
# The recorded output has fewer rows than train_data (22795 vs 39073).
default_preproc_data = default_proc.transform(data=train_data)
print(f"default_preproc_data shape: {default_preproc_data.shape}")

default_preproc_data shape: (22795, 15)


## Processor - Missing only

In [7]:
# Configure the "missing" step so that `age` is handled by "missing_mean".
proc = Processor(metadata=meta, config={"missing": {"age": "missing_mean"}})

# Fit with the sequence restricted to the "missing" step only.
proc.fit(data=train_data, sequence=["missing"])

preproc_data = proc.transform(data=train_data)
print(f"preproc_data shape: {preproc_data.shape}")
preproc_data["age"].head(10)

preproc_data shape: (39073, 15)


0    25
1    38
2    28
3    44
4    18
5    34
6    29
7    24
8    55
9    36
Name: age, dtype: int64

## Processor - Outlier only

In [8]:
# Configure the "outlier" step so that `age` is handled by "outlier_zscore".
proc = Processor(metadata=meta, config={"outlier": {"age": "outlier_zscore"}})

# Fit with the sequence restricted to the "outlier" step only.
proc.fit(data=train_data, sequence=["outlier"])

# The recorded output has fewer rows than train_data (22799 vs 39073).
preproc_data = proc.transform(data=train_data)
print(f"preproc_data shape: {preproc_data.shape}")
preproc_data["age"].head(10)

preproc_data shape: (22799, 15)


0    25
1    38
2    28
3    29
4    24
5    36
6    26
7    58
8    43
9    40
Name: age, dtype: int64

## Processor - Encoder only

In [9]:
# Configure the "encoder" step so that `workclass` is handled by "encoder_onehot".
proc = Processor(metadata=meta, config={"encoder": {"workclass": "encoder_onehot"}})

# Fit with the sequence restricted to the "encoder" step only.
proc.fit(data=train_data, sequence=["encoder"])

preproc_data = proc.transform(data=train_data)
print(f"preproc_data shape: {preproc_data.shape}")

# Show only the columns produced for `workclass` (names prefixed with "workclass_").
onehot_columns = [
    name for name in preproc_data.columns if name.startswith("workclass_")
]
preproc_data[onehot_columns].head(10)

preproc_data shape: (39073, 22)


Unnamed: 0,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Processor - Scaler only

In [10]:
# Configure the "scaler" step so that `age` is handled by "scaler_minmax".
proc = Processor(metadata=meta, config={"scaler": {"age": "scaler_minmax"}})

# Fit with the sequence restricted to the "scaler" step only.
proc.fit(data=train_data, sequence=["scaler"])

preproc_data = proc.transform(data=train_data)
print(f"preproc_data shape: {preproc_data.shape}")
preproc_data["age"].head(10)

preproc_data shape: (39073, 15)


0    0.109589
1    0.287671
2    0.150685
3    0.369863
4    0.013699
5    0.232877
6    0.164384
7    0.095890
8    0.520548
9    0.260274
Name: age, dtype: float64

# Synthesizer

In [11]:
from petsard.synthesizer import Synthesizer

# Build the synthesizer with the "default" method and hand it the metadata.
syn = Synthesizer(method="default")
syn.create(metadata=meta)

# Fit on the default pre-processed training data and draw synthetic rows in one call.
syn_data = syn.fit_sample(data=default_preproc_data)
print(f"syn_data shape: {syn_data.shape}")

syn_data shape: (22795, 15)


# Processor (Post-proc)

In [12]:
# Run the inverse transform with the same processor instance that was fitted for
# pre-processing, so the synthetic rows are mapped back by the matching pipeline.
postproc_data = default_proc.inverse_transform(data=syn_data)
print(f"postproc_data shape: {postproc_data.shape}")

postproc_data.head(6)

postproc_data shape: (22795, 15)


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,46,Local-gov,105948,Some-college,11,Never-married,Prof-specialty,Husband,White,Male,0,0,39,United-States,<=50K
1,30,Private,141570,10th,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,37,United-States,<=50K
2,31,Private,73445,HS-grad,12,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,37,United-States,<=50K
3,35,Federal-gov,210931,HS-grad,8,Separated,?,Husband,White,Male,0,0,38,United-States,<=50K
4,29,Local-gov,95785,Bachelors,12,Divorced,Machine-op-inspct,Not-in-family,White,Female,0,0,52,United-States,<=50K
5,22,Private,154947,HS-grad,6,Never-married,Exec-managerial,Not-in-family,White,Male,0,0,38,United-States,<=50K


# Constrainer

In [13]:
from petsard.constrainer import Constrainer

# Baseline run: an empty config, i.e. no constraints are declared.
config = {}
cnst = Constrainer(config=config)

# One-shot application of the constraints to the post-processed data.
cnst_data = cnst.apply(df=postproc_data)
print(f"cnst_data shape: {cnst_data.shape}")

# Options for the resampling call, collected in one place.
resample_options = {
    "target_rows": postproc_data.shape[0],
    "synthesizer": syn,
    "postprocessor": default_proc,
    "max_trials": 300,
    "sampling_ratio": 10.0,
    "verbose_step": 10,
}
satisfy_data = cnst.resample_until_satisfy(data=postproc_data, **resample_options)
print(f"satisfy_data shape: {satisfy_data.shape}")

satisfy_data.head(6)

cnst_data shape: (22795, 15)
satisfy_data shape: (22795, 15)


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,30,Local-gov,182033,HS-grad,10,Divorced,Tech-support,Own-child,Asian-Pac-Islander,Female,0,0,41,United-States,<=50K
1,28,?,59646,Bachelors,12,Divorced,?,Not-in-family,Black,Male,0,0,35,United-States,<=50K
2,22,Private,88650,HS-grad,15,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,34,United-States,>50K
3,41,Private,214047,11th,9,Never-married,Other-service,Husband,White,Male,0,0,39,United-States,<=50K
4,45,Private,209458,HS-grad,9,Divorced,Exec-managerial,Husband,Black,Male,0,0,45,United-States,>50K
5,28,Self-emp-not-inc,316364,Some-college,11,Married-civ-spouse,?,Own-child,White,Female,0,0,48,United-States,<=50K


## Constrainer - nan_groups

In [14]:
# nan_groups: per-column rules that apply to records where that column is missing.
config = {
    "nan_groups": {
        "gender": "delete",  # drop records whose gender is missing
        "age": {  # for records whose age is missing, set workclass, education and educational-num to missing
            "erase": ["workclass", "education", "educational-num"],
        },
        "marital-status": {
            "copy": "relationship",  # for records whose marital-status is missing, copy the value from the relationship column
        },
    }
}
cnst = Constrainer(config=config)
# One-shot application of the constraints to the post-processed data.
cnst_data = cnst.apply(df=postproc_data)
print(f"cnst_data shape: {cnst_data.shape}")

# Resampling variant: the synthesizer and post-processor are passed in so more rows
# can be produced; target_rows is set to the row count of postproc_data.
satisfy_data = cnst.resample_until_satisfy(
    data=postproc_data,
    target_rows=postproc_data.shape[0],
    synthesizer=syn,
    postprocessor=default_proc,
    max_trials=300,
    sampling_ratio=10.0,
    verbose_step=10,
)
print(f"satisfy_data shape: {satisfy_data.shape}")

satisfy_data.head(6)

cnst_data shape: (22795, 15)
satisfy_data shape: (22795, 15)


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,30,Local-gov,182033,HS-grad,10.0,Divorced,Tech-support,Own-child,Asian-Pac-Islander,Female,0,0,41,United-States,<=50K
1,28,?,59646,Bachelors,12.0,Divorced,?,Not-in-family,Black,Male,0,0,35,United-States,<=50K
2,22,Private,88650,HS-grad,15.0,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,34,United-States,>50K
3,41,Private,214047,11th,9.0,Never-married,Other-service,Husband,White,Male,0,0,39,United-States,<=50K
4,45,Private,209458,HS-grad,9.0,Divorced,Exec-managerial,Husband,Black,Male,0,0,45,United-States,>50K
5,28,Self-emp-not-inc,316364,Some-college,11.0,Married-civ-spouse,?,Own-child,White,Female,0,0,48,United-States,<=50K


## Constrainer - field_constraints

In [15]:
# field_constraints: row-level expressions every record has to satisfy.
config = {
    "field_constraints": [
        "age >= 20 & age <= 70",  # age must be between 20 and 70 (inclusive)
    ],
}
cnst = Constrainer(config=config)
# One-shot application of the constraints to the post-processed data.
cnst_data = cnst.apply(df=postproc_data)
print(f"cnst_data shape: {cnst_data.shape}")

# Start from postproc_data, like the other Constrainer cells and like the apply()
# call above. Feeding the previous cell's satisfy_data back in made this cell depend
# on its own output (satisfy_data = f(satisfy_data)), so re-running it changed the input.
satisfy_data = cnst.resample_until_satisfy(
    data=postproc_data,
    target_rows=postproc_data.shape[0],
    synthesizer=syn,
    postprocessor=default_proc,
    max_trials=300,
    sampling_ratio=10.0,
    verbose_step=10,
)
print(f"satisfy_data shape: {satisfy_data.shape}")

satisfy_data.head(6)

cnst_data shape: (21968, 15)
satisfy_data shape: (22795, 15)


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,34,Private,162906,HS-grad,6.0,Married-civ-spouse,Transport-moving,Not-in-family,White,Female,0,0,36,United-States,<=50K
1,34,Private,188970,HS-grad,13.0,Married-spouse-absent,Machine-op-inspct,Unmarried,White,Female,0,0,37,United-States,<=50K
2,33,Private,121078,HS-grad,11.0,Married-civ-spouse,Machine-op-inspct,Unmarried,Asian-Pac-Islander,Female,0,0,34,United-States,<=50K
3,31,Private,97305,Masters,8.0,Married-civ-spouse,Exec-managerial,Not-in-family,White,Female,0,0,40,United-States,<=50K
4,28,Private,124722,HS-grad,10.0,Married-civ-spouse,Craft-repair,Not-in-family,White,Male,0,0,42,United-States,<=50K
5,28,?,48057,Masters,10.0,Never-married,Handlers-cleaners,Husband,Black,Female,0,0,36,United-States,<=50K


## Constrainer - field_combinations

In [16]:
# field_combinations: allowed value pairs between two columns.
config = {
    "field_combinations": [
        (
            {
                "marital-status": "relationship"
            },  # declares the relation between the 'marital-status' and 'relationship' columns
            {  # allowed 'relationship' values for each 'marital-status' value
                "Divorced": ["Not-in-family", "Own-child", "Unmarried"],
                "Married-civ-spouse": ["Husband", "Wife"],
                "Married-spouse-absent": ["Not-in-family"],
                "Separated": ["Not-in-family", "Unmarried"],
                "Never-married": ["Not-in-family", "Own-child", "Unmarried"],
                "Widowed": ["Not-in-family", "Unmarried"],
            },
        )
    ],
}
cnst = Constrainer(config=config)
# One-shot application of the constraints to the post-processed data.
cnst_data = cnst.apply(df=postproc_data)
print(f"cnst_data shape: {cnst_data.shape}")

# Start from postproc_data, like the other Constrainer cells and like the apply()
# call above. Feeding the previous cell's satisfy_data back in made this cell depend
# on its own output (satisfy_data = f(satisfy_data)), so re-running it changed the input.
satisfy_data = cnst.resample_until_satisfy(
    data=postproc_data,
    target_rows=postproc_data.shape[0],
    synthesizer=syn,
    postprocessor=default_proc,
    max_trials=300,
    sampling_ratio=10.0,
    verbose_step=10,
)
print(f"satisfy_data shape: {satisfy_data.shape}")

satisfy_data.head(6)

cnst_data shape: (11301, 15)
satisfy_data shape: (22795, 15)


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,34,Private,157882,HS-grad,7.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,<=50K
1,44,Local-gov,161486,Some-college,11.0,Married-civ-spouse,?,Husband,Black,Male,0,0,41,United-States,<=50K
2,41,Private,346758,Some-college,9.0,Never-married,Handlers-cleaners,Own-child,Black,Male,0,0,52,South,<=50K
3,43,Private,306048,Some-college,9.0,Divorced,Craft-repair,Own-child,White,Male,0,0,40,United-States,<=50K
4,33,Private,210268,Some-college,6.0,Widowed,Craft-repair,Not-in-family,White,Male,0,0,36,United-States,<=50K
5,27,Local-gov,416059,Bachelors,8.0,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,<=50K


In [17]:
from petsard.metadater import SchemaMetadater

# Align the constrained data to the loaded schema. In the recorded outputs,
# educational-num is shown as 7.0 before this cell and as 7 after it.
# NOTE: this rebinds satisfy_data in place of the unaligned frame.
satisfy_data = SchemaMetadater.align(schema=meta, data=satisfy_data)
satisfy_data.head(6)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,34,Private,157882,HS-grad,7,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,<=50K
1,44,Local-gov,161486,Some-college,11,Married-civ-spouse,?,Husband,Black,Male,0,0,41,United-States,<=50K
2,41,Private,346758,Some-college,9,Never-married,Handlers-cleaners,Own-child,Black,Male,0,0,52,South,<=50K
3,43,Private,306048,Some-college,9,Divorced,Craft-repair,Own-child,White,Male,0,0,40,United-States,<=50K
4,33,Private,210268,Some-college,6,Widowed,Craft-repair,Not-in-family,White,Male,0,0,36,United-States,<=50K
5,27,Local-gov,416059,Bachelors,8,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,<=50K


# Evaluator

In [19]:
from petsard.evaluator import Evaluator

evaluation = Evaluator(method="default")
evaluation.create()

# Align both real partitions to the schema before evaluation (SchemaMetadater is
# imported in an earlier cell). These rebind train_data and test_data.
train_data = SchemaMetadater.align(schema=meta, data=train_data)
test_data = SchemaMetadater.align(schema=meta, data=test_data)

# The evaluator takes the original, control and synthetic frames keyed by role.
eval_inputs = {
    "ori": train_data,
    "control": test_data,
    "syn": satisfy_data,
}
eval_result = evaluation.eval(data=eval_inputs)

# Print the first row of each result table.
for granularity in ("global", "columnwise", "pairwise"):
    print(eval_result[granularity].head(1))

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 15/15 [00:00<00:00, 90.81it/s]|
Column Shapes Score: 93.63%

(2/2) Evaluating Column Pair Trends: |██████████| 105/105 [00:00<00:00, 228.30it/s]|
Column Pair Trends Score: 60.19%

Overall Score (Average): 76.91%

        Score  Column Shapes  Column Pair Trends
result   0.77           0.94                 0.6
          Property        Metric     Score
age  Column Shapes  KSComplement  0.958908
                         Property                 Metric     Score  \
age workclass  Column Pair Trends  ContingencySimilarity  0.859326   

               Real Correlation  Synthetic Correlation Error  
age workclass               NaN                    NaN  None  


## Evaluator - Singling-out

In [20]:
# Singling-out evaluation via the "anonymeter-singlingout" method, with 3 attacks.
evaluation = Evaluator(method="anonymeter-singlingout", n_attacks=3)
evaluation.create()

# The evaluator takes the original, control and synthetic frames keyed by role.
eval_inputs = {
    "ori": train_data,
    "control": test_data,
    "syn": satisfy_data,
}
eval_result = evaluation.eval(data=eval_inputs)

print(eval_result["global"].head(1))
print(eval_result["details"])

Found 1 failed queries out of 3. Check DEBUG messages for more details.


        risk  risk_CI_btm  risk_CI_top  attack_rate  attack_rate_err  \
result   0.2          0.0          0.8         0.43             0.37   

        baseline_rate  baseline_rate_err  control_rate  control_rate_err  
result           0.28               0.28          0.28              0.28  
{'attack_queries': ["race== 'Amer-Indian-Eskimo' & age>= 73 & gender== 'Female'"], 'baseline_queries': []}


## Evaluator - Linkability

In [21]:
# Linkability evaluation: aux_cols lists two groups of columns.
linkability_aux_cols = [
    ["age", "marital-status", "relationship", "race", "gender"],
    ["workclass", "education", "educational-num", "occupation"],
]
evaluation = Evaluator(
    method="anonymeter-linkability",
    n_attacks=3,
    aux_cols=linkability_aux_cols,
)
evaluation.create()

# The evaluator takes the original, control and synthetic frames keyed by role.
eval_inputs = {
    "ori": train_data,
    "control": test_data,
    "syn": satisfy_data,
}
eval_result = evaluation.eval(data=eval_inputs)

print(eval_result["global"].head(1))
print(eval_result["details"])

        risk  risk_CI_btm  risk_CI_top  attack_rate  attack_rate_err  \
result   0.0          0.0         0.55         0.28             0.28   

        baseline_rate  baseline_rate_err  control_rate  control_rate_err  
result           0.28               0.28          0.28              0.28  
{'attack_links': {}, 'baseline_links': {}, 'control_links': {}}


  self._sanity_check()


## Evaluator - Inference

In [None]:
# Inference evaluation: `income` is passed as the secret column.
evaluation = Evaluator(method="anonymeter-inference", n_attacks=3, secret="income")
evaluation.create()

# The evaluator takes the original, control and synthetic frames keyed by role.
eval_inputs = {
    "ori": train_data,
    "control": test_data,
    "syn": satisfy_data,
}
eval_result = evaluation.eval(data=eval_inputs)

print(eval_result["global"].head(1))
print(eval_result["details"])

        risk  risk_CI_btm  risk_CI_top  attack_rate  attack_rate_err  \
result   0.0          0.0          0.9         0.28             0.28   

        baseline_rate  baseline_rate_err  control_rate  control_rate_err  
result           0.43               0.37          0.57              0.37  
None


  self._sanity_check()


## Evaluator - Diagnostic

In [None]:
# Diagnostic report via the "sdmetrics-diagnosticreport" method.
evaluation = Evaluator(method="sdmetrics-diagnosticreport")
evaluation.create()

# The evaluator takes the original, control and synthetic frames keyed by role.
eval_inputs = {
    "ori": train_data,
    "control": test_data,
    "syn": satisfy_data,
}
eval_result = evaluation.eval(data=eval_inputs)

# Print the first row of each result table.
for granularity in ("global", "columnwise", "pairwise"):
    print(eval_result[granularity].head(1))

Generating report ...

(1/2) Evaluating Data Validity: |██████████| 15/15 [00:00<00:00, 1134.99it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 1388.38it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%

        Score  Data Validity  Data Structure
result    1.0            NaN             NaN
          Property             Metric  Score
age  Data Validity  BoundaryAdherence    1.0
          Property             Metric  Score
age  Data Validity  BoundaryAdherence    1.0


## Evaluator - Quality

In [None]:
# Quality report via the "sdmetrics-qualityreport" method.
evaluation = Evaluator(method="sdmetrics-qualityreport")
evaluation.create()

# The evaluator takes the original, control and synthetic frames keyed by role.
eval_inputs = {
    "ori": train_data,
    "control": test_data,
    "syn": satisfy_data,
}
eval_result = evaluation.eval(data=eval_inputs)

# Print the first row of each result table.
for granularity in ("global", "columnwise", "pairwise"):
    print(eval_result[granularity].head(1))

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 15/15 [00:00<00:00, 145.69it/s]|
Column Shapes Score: 94.23%

(2/2) Evaluating Column Pair Trends: |██████████| 105/105 [00:00<00:00, 409.22it/s]|
Column Pair Trends Score: 60.21%

Overall Score (Average): 77.22%

        Score  Column Shapes  Column Pair Trends
result   0.77            NaN                 NaN
          Property        Metric     Score
age  Column Shapes  KSComplement  0.961759
                         Property                 Metric     Score  \
age workclass  Column Pair Trends  ContingencySimilarity  0.859197   

               Real Correlation  Synthetic Correlation Error  
age workclass               NaN                    NaN  None  


In [None]:
# Evaluator - Classification: ML-utility evaluation with `income` as the target.
evaluation = Evaluator(method="mlutility-classification", target="income")
evaluation.create()

# The evaluator takes the original, control and synthetic frames keyed by role.
eval_inputs = {
    "ori": train_data,
    "control": test_data,
    "syn": satisfy_data,
}
eval_result = evaluation.eval(data=eval_inputs)

print(eval_result["global"].head(1))

   ori_mean  ori_std  syn_mean  syn_std  diff
0      0.86     0.01      0.77      0.0 -0.09


## Evaluator - Classification

# Describer

In [None]:
from petsard.evaluator import Describer

# Describe a single dataset: the final synthetic frame.
desc = Describer(method="default")
desc.create()

desc_result = desc.eval(data={"data": satisfy_data})

# Print the first row of each result table.
for granularity in ("global", "columnwise", "pairwise"):
    print(desc_result[granularity].head(1))

   row_count  col_count  na_count
0      22795         15         0
     mean  median    std   min   max  kurtosis  skew     q1     q3  na_count  \
age  38.2   36.86  12.11  17.0  78.0     -0.44  0.47  28.52  46.49       0.0   

    nunique  
age    <NA>  
  column1 column2  corr
0     age     age   1.0


# Reporter

## Reporter - Save Data

In [None]:
from petsard.reporter import Reporter

# "save_data" reporter, selecting entries whose source is "Postprocessor".
rpt = Reporter(
    method="save_data",
    source="Postprocessor",
)
# Entries are keyed by a (module name, experiment name) tuple.
rpt.create({("Postprocessor", "exp1"): satisfy_data})
# Per the recorded log line, report() saves the entry to CSV.
rpt.report()

Now is petsard_Postprocessor[exp1] save to csv...


## Reporter - Save Report

In [None]:
# "save_report" reporter at "global" granularity.
rpt = Reporter(
    method="save_report",
    granularity="global",
)
# NOTE(review): the key names the "Evaluator" module, but the value is the Describer's
# global table (desc_result), not eval_result — confirm this is intended.
rpt.create(
    {
        ("Evaluator", "eval1_[global]"): desc_result["global"],
    }
)
# Per the recorded log line, report() saves the report to CSV.
rpt.report()

Now is petsard[Report]_[global] save to csv...
