# Environment setting
環境設定

In [1]:
import os
import requests
import sys
from pathlib import Path
from pprint import pprint
from typing import Optional

import pandas as pd


# Git branch of nics-tw/petsard that demo assets are fetched from; default is 'main'.
branch: str = '746-docs-split-high-cardinality-and-multi-table' # 'main'

# Demo subfolder this notebook lives in; default is None (petsard/demo/).
subfolder: Optional[str]  = 'best-practices'


# Check if running in Google Colab; if so, download the utils.py file from GitHub.
is_colab: bool = 'COLAB_GPU' in os.environ
if is_colab:
    utils_url = f"https://raw.githubusercontent.com/nics-tw/petsard/{branch}/demo/utils.py"
    # A timeout keeps this cell from hanging indefinitely if GitHub is unreachable.
    response = requests.get(utils_url, timeout=30)

    if response.status_code == 200:
        # Write as UTF-8 explicitly so the file does not depend on the platform's default encoding.
        with open('utils.py', 'w', encoding='utf-8') as f:
            f.write(response.text)

        Path('__init__.py').touch()
    else:
        raise RuntimeError(f"Failed to download utils.py. Status code: {response.status_code}")

else:
    # Not in Colab: when the notebook lives in a subfolder, add the parent of the
    # current working directory to the import path so that utils.py can be imported.
    if subfolder:
        parent_dir = os.path.dirname(os.getcwd())
        # Skip the append when the entry already exists, so re-running the cell
        # does not keep growing sys.path.
        if parent_dir not in sys.path:
            sys.path.append(parent_dir)


In [2]:
import importlib

# Import the utils module
from utils import (
    get_yaml_path,
    setup_environment,
)


# Set up the environment for this notebook. The subfolder is taken from the
# `subfolder` configuration variable defined in the first cell, so changing
# it there takes effect here as well instead of being silently ignored.
setup_environment(
    is_colab,
    branch,
    benchmark_data=[
        'best-practices_categorical_high-cardinality',
    ],
    subfolder=subfolder,
)

# Load the helper module that lives in the best-practices folder. The folder name
# contains a hyphen, so it cannot be named in a regular `import` statement;
# importlib.import_module accepts it as a string instead.
best_practices_utils = importlib.import_module("best-practices.best_practices_utils")

Looking in links: /var/folders/lb/lwjvbr314wj7bt1k63plw4bw0000gn/T/tmp4uw9j6v6
Obtaining file:///Users/justyn.chen/Library/CloudStorage/Dropbox/5_Career%20%E5%B7%A5%E4%BD%9C/20231016_NICS%20%E8%B3%87%E5%AE%89%E9%99%A2/3_%E5%B7%A5%E8%97%9D%EF%BC%9APETsARD/petsard
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Checking if build backend supports build_editable: started
  Checking if build backend supports build_editable: finished with status 'done'
  Getting requirements to build editable: started
  Getting requirements to build editable: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
  Preparing editable metadata (pyproject.toml): started
  Preparing editable metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: petsard
  Building editable for petsard (pyproject.toml): started
  Building editable for petsa

In [3]:
# Import PETsARD
from petsard import Executor

# Best practices: High-cardinality variables - Constraints
最佳實踐：高基數變項 - 約束條件

In [4]:
# Load the benchmark CSV used throughout this notebook.
students = pd.read_csv('benchmark/best-practices_categorical_high-cardinality.csv')


# Per-column profile: number of distinct values and the pandas dtype (as a string).
students_columns = {
    name: {
        'cardinality': series.nunique(),
        'dtype': str(series.dtype),
    }
    for name, series in students.items()
}

print("columns: ")
pprint(students_columns)

columns: 
{'admission_type': {'cardinality': 11, 'dtype': 'object'},
 'admission_type_code': {'cardinality': 11, 'dtype': 'object'},
 'birth_day': {'cardinality': 31, 'dtype': 'int64'},
 'birth_month': {'cardinality': 12, 'dtype': 'int64'},
 'birth_year': {'cardinality': 7, 'dtype': 'int64'},
 'college': {'cardinality': 4, 'dtype': 'object'},
 'college_code': {'cardinality': 6, 'dtype': 'object'},
 'department_code': {'cardinality': 24, 'dtype': 'int64'},
 'department_name': {'cardinality': 20, 'dtype': 'object'},
 'disabled_code': {'cardinality': 2, 'dtype': 'int64'},
 'disabled_type': {'cardinality': 2, 'dtype': 'object'},
 'identity': {'cardinality': 14, 'dtype': 'object'},
 'identity_code': {'cardinality': 14, 'dtype': 'int64'},
 'nationality': {'cardinality': 63, 'dtype': 'object'},
 'nationality_code': {'cardinality': 62, 'dtype': 'object'},
 'sex': {'cardinality': 2, 'dtype': 'object'},
 'university': {'cardinality': 2, 'dtype': 'object'},
 'university_code': {'cardinality': 2, 

In [5]:
# Preview the first five records, transposed so that each column is shown as a row.
students.head(5).T

Unnamed: 0,0,1,2,3,4
birth_year,2005,2003,2002,2002,2000
birth_month,4,1,11,12,10
birth_day,9,16,7,24,7
zodiac,牡羊座,摩羯座,天蠍座,摩羯座,天秤座
university_code,2,1,1,1,1
university,國立政治大學,國立臺灣大學,國立臺灣大學,國立臺灣大學,國立臺灣大學
college_code,700,2000,1000,9000,1000
college,理學院,理學院,文學院,電機資訊學院,文學院
department_code,702,2080,1070,9020,1040
department_name,心理學系,地理環境資源學系,日本語文學系,資訊工程學系,哲學系


# YAML Configuration of Categorical variables for PETsARD
PETsARD 的類別變項 YAML 設定

In [6]:
# YAML configuration file for this case.
yaml_file_case: str = 'categorical.yaml'

# Resolve the path of the YAML file. The subfolder comes from the `subfolder`
# configuration variable defined in the first cell rather than a repeated literal,
# so the notebook has a single place to change it.
yaml_path_case: str = get_yaml_path(
    is_colab=is_colab,
    yaml_file=yaml_file_case,
    branch=branch,
    subfolder=subfolder,
)

Configuration content:
---
Loader:
  data:
    filepath: 'benchmark/best-practices_categorical_high-cardinality.csv'
    column_types:
      category:
        - university_code
        - college_code
        - department_code
Preprocessor:
  encoding_uniform:
    sequence:
      - 'encoder'
    encoder:
      birth_year: 'encoding_uniform'
      birth_month: 'encoding_uniform'
      birth_day: 'encoding_uniform'
      zodiac: 'encoding_uniform'
      university_code: 'encoding_uniform'
      university: 'encoding_uniform'
      college_code: 'encoding_uniform'
      college: 'encoding_uniform'
      deparment_code: 'encoding_uniform'
      department_name: 'encoding_uniform'
      admission_type_code: 'encoding_uniform'
      admission_type: 'encoding_uniform'
      disabled_code: 'encoding_uniform'
      disabled_type: 'encoding_uniform'
      nationality_code: 'encoding_uniform'
      nationality: 'encoding_uniform'
      identity_code: 'encoding_uniform'
      identity: 'encoding_un

### Execution and Result
執行與結果

In [7]:
# Initialize the executor with the YAML config path resolved in the previous cell
# and run it.
exec_case = Executor(config=yaml_path_case)
exec_case.run()

Now is petsard_Loader[data]_Preprocessor[encoding_uniform]_Synthesizer[demo]_Postprocessor[demo] save to csv...


In [8]:
# Keys into the executor result: first the Reporter entry, then the experiment inside it.
reporter_key = 'Loader[data]_Preprocessor[encoding_uniform]_Synthesizer[demo]_Postprocessor[demo]_Reporter[output]'
experiment_key = 'Loader[data]_Preprocessor[encoding_uniform]_Synthesizer[demo]_Postprocessor[demo]'

# Take a copy so later cells work on their own object rather than the executor's result.
reporter_results = exec_case.get_result()[reporter_key]
output = reporter_results[experiment_key].copy()

# Preview the first five synthetic records, transposed for readability.
output.head(5).T

Unnamed: 0,0,1,2,3,4
birth_year,2003,2001,2005,2003,2003
birth_month,4,12,3,5,4
birth_day,10,7,23,29,19
zodiac,摩羯座,天蠍座,處女座,摩羯座,牡羊座
university_code,002,002,002,002,002
university,國立臺灣大學,國立政治大學,國立政治大學,國立臺灣大學,國立政治大學
college_code,2000,1000,2000,2000,1000
college,文學院,文學院,文學院,理學院,資訊學院
department_code,103,2020,702,1030,9010
department_name,戲劇學系,戲劇學系,資訊科學系,物理學系,歷史學系


# Constraint Violations
約束條件違反

## Birthday and Zodiac Sign: Field Constraints (`field_constraints`)
生日與星座：欄位約束 (`field_constraints`)

In [9]:
# Run the 'birthday' check on the synthetic data generated without constraints;
# the recorded output lists year/month/day combinations that are not real dates.
best_practices_utils.check_invalid(output, ['birthday'])

# 154 invalid birthday dates found

   index                                          reason  year  month  day
0    240  該年2月沒有29日 Day 29 does not exist in 2 that year  2003      2   29
1    446  該年9月沒有31日 Day 31 does not exist in 9 that year  2005      9   31
2    481  該年9月沒有31日 Day 31 does not exist in 9 that year  2003      9   31
3    510  該年2月沒有29日 Day 29 does not exist in 2 that year  2003      2   29
4    573  該年2月沒有29日 Day 29 does not exist in 2 that year  2005      2   29

Counts by reason:
reason
該年11月沒有31日 Day 31 does not exist in 11 that year    17
該年2月沒有29日 Day 29 does not exist in 2 that year      27
該年2月沒有30日 Day 30 does not exist in 2 that year      33
該年2月沒有31日 Day 31 does not exist in 2 that year      21
該年4月沒有31日 Day 31 does not exist in 4 that year      22
該年6月沒有31日 Day 31 does not exist in 6 that year      18
該年9月沒有31日 Day 31 does not exist in 9 that year      16
Name: index, dtype: int64

Total invalid records: 154
Unique invalid records: 154


In [10]:
# Run the 'zodiac' check on the synthetic data generated without constraints;
# the recorded output lists rows whose zodiac sign does not match the birth month/day.
best_practices_utils.check_invalid(output, ['zodiac'])

# 9156 invalid zodiac signs found

   index                      reason  month  day zodiac expected_zodiac
0      0  星座與出生日期不匹配 Zodiac mismatch      4   10    摩羯座             牡羊座
1      1  星座與出生日期不匹配 Zodiac mismatch     12    7    天蠍座             射手座
2      2  星座與出生日期不匹配 Zodiac mismatch      3   23    處女座             牡羊座
3      3  星座與出生日期不匹配 Zodiac mismatch      5   29    摩羯座             雙子座
4      5  星座與出生日期不匹配 Zodiac mismatch      3    9    射手座             雙魚座

Counts by reason:
reason
星座與出生日期不匹配 Zodiac mismatch    9156
Name: index, dtype: int64

Total invalid records: 9156
Unique invalid records: 9156


## University, College, and Department: Field Combination Constraints (`field_combinations`)
大學、學院、與系所：欄位組合約束 (`field_combinations`)

In [11]:
# Run the 'university' check on the synthetic data generated without constraints;
# the recorded output lists rows whose university code and name do not match.
best_practices_utils.check_invalid(output, ['university'])

# 3163 invalid universities found

   index university_code university  \
0      0             002     國立臺灣大學   
1      3             002     國立臺灣大學   
2      5             002     國立臺灣大學   
3      6             002     國立臺灣大學   
4     11             002     國立臺灣大學   

                                              reason  
0  大學代碼與名稱不符，大學代碼應對應 國立政治大學 (University code and ...  
1  大學代碼與名稱不符，大學代碼應對應 國立政治大學 (University code and ...  
2  大學代碼與名稱不符，大學代碼應對應 國立政治大學 (University code and ...  
3  大學代碼與名稱不符，大學代碼應對應 國立政治大學 (University code and ...  
4  大學代碼與名稱不符，大學代碼應對應 國立政治大學 (University code and ...  

Counts by reason:
reason
大學代碼與名稱不符，大學代碼應對應 國立政治大學 (University code and name mismatch)    3163
Name: index, dtype: int64

Total invalid records: 3163
Unique invalid records: 3163


In [12]:
# Run the 'college' check on the synthetic data generated without constraints;
# the recorded output lists colleges that do not belong to the row's university.
best_practices_utils.check_invalid(output, ['college'])

# 8213 invalid university-college relationships found

   index university_code college_code  \
0      0             002         2000   
1      1             002         1000   
2      2             002         2000   
3      3             002         2000   
4      4             002         1000   

                                              reason  
0  學院 2000 不屬於大學 002 (College does not belong to ...  
1  學院 1000 不屬於大學 002 (College does not belong to ...  
2  學院 2000 不屬於大學 002 (College does not belong to ...  
3  學院 2000 不屬於大學 002 (College does not belong to ...  
4  學院 1000 不屬於大學 002 (College does not belong to ...  

Counts by reason:
reason
學院 100 不屬於大學 001 (College does not belong to university)       90
學院 1000 不屬於大學 002 (College does not belong to university)     670
學院 2000 不屬於大學 002 (College does not belong to university)    7041
學院 9000 不屬於大學 002 (College does not belong to university)     396
學院 ZA0 不屬於大學 001 (College does not belong to university)       16
Name: index,

In [13]:
# Run the 'department' check on the synthetic data generated without constraints;
# the recorded output lists departments that do not belong to the row's college.
best_practices_utils.check_invalid(output, ['department'])

# 8314 invalid college-department relationships found

   index college_code department_code  \
0      0         2000             103   
1      1         1000            2020   
2      2         2000             702   
3      3         2000            1030   
4      4         1000            9010   

                                              reason  
0  系所 103 不屬於該學院 (Department does not belong to c...  
1  系所 2020 不屬於該學院 (Department does not belong to ...  
2  系所 702 不屬於該學院 (Department does not belong to c...  
3  系所 1030 不屬於該學院 (Department does not belong to ...  
4  系所 9010 不屬於該學院 (Department does not belong to ...  

Counts by reason:
reason
系所 101 不屬於該學院 (Department does not belong to college)      360
系所 1010 不屬於該學院 (Department does not belong to college)     165
系所 102 不屬於該學院 (Department does not belong to college)      374
系所 1020 不屬於該學院 (Department does not belong to college)     188
系所 103 不屬於該學院 (Department does not belong to college)      355
系所 1030 不屬於該學院 (Department 

## Nationality and Nationality Code: Missing Value Group Constraints (`nan_group`)
國籍與國籍代碼：遺失值群組約束 (`nan_group`)

In [14]:
# Run the 'nationality' check on the synthetic data generated without constraints;
# the recorded output lists ROC-nationality rows that still carry a nationality code.
best_practices_utils.check_invalid(output, ['nationality'])

# 1332 invalid nationalities found

   index nationality_code nationality  \
0      7              403        中華民國   
1     12              503        中華民國   
2     22              113        中華民國   
3     23              113        中華民國   
4     35              113        中華民國   

                                              reason  
0  中華民國國籍不應有國籍代碼 (ROC nationality should not have...  
1  中華民國國籍不應有國籍代碼 (ROC nationality should not have...  
2  中華民國國籍不應有國籍代碼 (ROC nationality should not have...  
3  中華民國國籍不應有國籍代碼 (ROC nationality should not have...  
4  中華民國國籍不應有國籍代碼 (ROC nationality should not have...  

Counts by reason:
reason
中華民國國籍不應有國籍代碼 (ROC nationality should not have nationality code)    1332
Name: index, dtype: int64

Total invalid records: 1332
Unique invalid records: 1332


# YAML Configuration for PETsARD
PETsARD 的 YAML 設定

In [18]:
# YAML configuration file for this case.
# NOTE: this rebinds `yaml_file_case` / `yaml_path_case` from the earlier
# categorical.yaml cell, so cells must be run top to bottom.
yaml_file_case: str = 'high-cardinality.yaml'

# Resolve the path of the YAML file. The subfolder comes from the `subfolder`
# configuration variable defined in the first cell rather than a repeated literal,
# so the notebook has a single place to change it.
yaml_path_case: str = get_yaml_path(
    is_colab=is_colab,
    yaml_file=yaml_file_case,
    branch=branch,
    subfolder=subfolder,
)

Configuration content:
---
Loader:
  data:
    filepath: 'benchmark/best-practices_categorical_high-cardinality.csv'
    column_types:
      category:
        - university_code
        - college_code
        - department_code
Preprocessor:
  encoding_uniform:
    sequence:
      - 'encoder'
    encoder:
      birth_year: 'encoding_uniform'
      birth_month: 'encoding_uniform'
      birth_day: 'encoding_uniform'
      zodiac: 'encoding_uniform'
      university_code: 'encoding_uniform'
      university: 'encoding_uniform'
      college_code: 'encoding_uniform'
      college: 'encoding_uniform'
      deparment_code: 'encoding_uniform'
      department_name: 'encoding_uniform'
      admission_type_code: 'encoding_uniform'
      admission_type: 'encoding_uniform'
      disabled_code: 'encoding_uniform'
      disabled_type: 'encoding_uniform'
      nationality_code: 'encoding_uniform'
      nationality: 'encoding_uniform'
      identity_code: 'encoding_uniform'
      identity: 'encoding_un

### Execution and Result
執行與結果

In [19]:
# Initialize the executor with the YAML config path resolved in the previous cell
# and run it. This rebinds `exec_case`, replacing the executor created earlier.
exec_case = Executor(config=yaml_path_case)
exec_case.run()

Trial 10: Got 20 rows, need 9980 more
Trial 20: Got 31 rows, need 9969 more
Trial 30: Got 43 rows, need 9957 more
Trial 40: Got 49 rows, need 9951 more
Trial 50: Got 59 rows, need 9941 more
Trial 60: Got 64 rows, need 9936 more
Trial 70: Got 73 rows, need 9927 more
Trial 80: Got 83 rows, need 9917 more
Trial 90: Got 88 rows, need 9912 more
Trial 100: Got 97 rows, need 9903 more
Trial 110: Got 107 rows, need 9893 more
Trial 120: Got 117 rows, need 9883 more
Trial 130: Got 126 rows, need 9874 more
Trial 140: Got 134 rows, need 9866 more
Trial 150: Got 144 rows, need 9856 more
Trial 160: Got 152 rows, need 9848 more
Trial 170: Got 161 rows, need 9839 more
Trial 180: Got 166 rows, need 9834 more
Trial 190: Got 174 rows, need 9826 more
Trial 200: Got 186 rows, need 9814 more
Trial 210: Got 199 rows, need 9801 more
Trial 220: Got 213 rows, need 9787 more
Trial 230: Got 225 rows, need 9775 more
Trial 240: Got 235 rows, need 9765 more
Trial 250: Got 242 rows, need 9758 more
Trial 260: Got 257 



In [20]:
# Keys into the executor result: first the Reporter entry, then the experiment inside it.
reporter_key_w_const = 'Loader[data]_Preprocessor[encoding_uniform]_Synthesizer[demo]_Postprocessor[demo]_Constrainer[demo]_Reporter[output]'
experiment_key_w_const = 'Loader[data]_Preprocessor[encoding_uniform]_Synthesizer[demo]_Postprocessor[demo]_Constrainer[demo]'

# Take a copy so later cells work on their own object rather than the executor's result.
reporter_results_w_const = exec_case.get_result()[reporter_key_w_const]
output_w_const = reporter_results_w_const[experiment_key_w_const].copy()

# Preview the first five constrained synthetic records, transposed for readability.
output_w_const.head(5).T

Unnamed: 0,0,1,2,3,4
birth_year,2001,2002,2003,2004,2003
birth_month,8,4,5,4,9
birth_day,6,7,24,7,8
zodiac,獅子座,牡羊座,雙子座,牡羊座,處女座
university_code,002,002,002,002,002
university,國立政治大學,國立政治大學,國立政治大學,國立政治大學,國立政治大學
college_code,ZA0,ZA0,ZA0,ZA0,ZA0
college,資訊學院,資訊學院,資訊學院,資訊學院,資訊學院
department_code,703,703,703,703,703
department_name,資訊科學系,資訊科學系,資訊科學系,資訊科學系,資訊科學系


# Constraint Violations: After Setting Constraints
約束條件違反：在設定約束條件之後

In [None]:
# Re-run the validity checks on the constrained synthetic data. No check list is
# passed here; the recorded output shows every check being reported.
best_practices_utils.check_invalid(output_w_const)

# No invalid departments found

# No invalid birthday dates found

# No invalid zodiac signs found

# No invalid university-college relationships found

# No invalid nationalities found

# No invalid universities found

# 704 invalid college-department relationships found

   index college_code department_code  \
0      1          ZA0             701   
1      2         2000            1010   
2      5         2000            1010   
3      8         2000             104   
4     12         1000            2040   

                                              reason  
0  系所 701 不屬於該學院 (Department does not belong to c...  
1  系所 1010 不屬於該學院 (Department does not belong to ...  
2  系所 1010 不屬於該學院 (Department does not belong to ...  
3  系所 104 不屬於該學院 (Department does not belong to c...  
4  系所 2040 不屬於該學院 (Department does not belong to ...  

Counts by reason:
reason
系所 101 不屬於該學院 (Department does not belong to college)      27
系所 1010 不屬於該學院 (Department does not belong to college)    173