In [8]:
import pandas as pd
import re
import json

In [9]:
# Load the JSON Lines export: every non-empty line is one JSON object and
# becomes one row of the DataFrame.
with open('output_file_2.txt', 'r', encoding='utf-8') as file:
    # Blank lines (for example a trailing empty line at the end of the file)
    # are skipped, because json.loads raises JSONDecodeError on an empty string.
    data = [json.loads(line) for line in file if line.strip()]
df = pd.DataFrame(data)

In [10]:
def create_chapter_df(df):
    """Return the distinct chapter-level rows of ``df``.

    Only the 'ensection', 'enchapter', 'enchapter_notes' and
    'kochapter_notes' columns are kept (in that order), and rows that are
    identical across all four of them are collapsed to their first
    occurrence. The surviving rows keep their original index labels.
    """
    chapter_columns = ['ensection', 'enchapter', 'enchapter_notes', 'kochapter_notes']
    return df.loc[:, chapter_columns].drop_duplicates()

# Build the de-duplicated chapter-level frame from a copy of the loaded data.
chapter_df = create_chapter_df(df.copy())

In [11]:
from bs4 import BeautifulSoup

def fix_atag(html):
    """Clean the ``<a>`` tags of an HTML fragment and absolutize ``/clip/`` URLs.

    * An ``<a>`` that contains at least one ``<img>`` is kept, but its
      ``rel``, ``href`` and ``title`` attributes are removed, and the ``alt``
      attribute is removed from every ``<img>`` nested inside it.
    * Any other ``<a>`` is replaced by its text content.
    * In the serialized result, every occurrence of ``"/clip/`` (a double
      quote immediately followed by the path) is rewritten to
      ``"https://unipass.customs.go.kr/clip/``.

    Args:
        html: HTML fragment as a string.

    Returns:
        The processed markup, serialized back to a string by BeautifulSoup.
    """
    # Parse the HTML fragment.
    soup = BeautifulSoup(html, 'html.parser')

    # Walk over every <a> tag found in the fragment.
    for a_tag in soup.find_all('a'):
        images = a_tag.find_all('img')
        if images:
            # Anchor wraps image(s): keep the tag, drop its link metadata.
            for attr in ('rel', 'href', 'title'):
                if attr in a_tag.attrs:
                    del a_tag[attr]
            # Strip 'alt' once per image, for every nested image rather than
            # re-checking only the first image on each attribute iteration.
            for img in images:
                if 'alt' in img.attrs:
                    del img['alt']
        else:
            # Plain anchor: replace the tag with the text inside it.
            a_tag.replace_with(a_tag.text)

    # Serialize and turn site-relative /clip/ references into absolute URLs.
    return str(soup).replace('"/clip/', '"https://unipass.customs.go.kr/clip/')


In [12]:
def process_chapter_notes(df, en_col, ko_col):
    """Split raw chapter text into name, description, notes and general parts.

    Each cell of ``df[en_col]`` / ``df[ko_col]`` is broken into lines. The
    first line matching a "Note." / "Notes." / "주:" heading and the first
    line matching a "GENERAL" / "총설" heading act as section boundaries.

    ``df`` is modified in place (columns are added to it) and is also
    returned.

    Args:
        df: DataFrame holding 'ensection' and 'enchapter' columns in addition
            to ``en_col`` and ``ko_col``. 'ensection' values must be
            convertible with ``int()``.
        en_col: Name of the column with the English chapter text.
        ko_col: Name of the column with the Korean chapter text.

    Returns:
        The same ``df`` with these columns added:

        * ``chapter_name``: first line of the English text (``None`` when the
          cell is not a string).
        * ``hs``: copy of 'enchapter', placed right after ``chapter_name``.
        * ``en_description`` / ``ko_description``: lines after the first one,
          up to the first Notes or GENERAL heading.
        * ``en_notes`` / ``ko_notes``: lines from the Notes heading up to the
          GENERAL heading (or the end); ``[]`` when there is no Notes heading.
        * ``en_general`` / ``ko_general``: lines from the GENERAL heading to
          the end; ``[]`` when there is no GENERAL heading.
        * ``_nest_parent_``: 'ensection' rendered as a Roman numeral.
    """
    note_patterns = (r'^Notes?\.$', r'^주:\s*')
    general_patterns = (r'^GENERAL\.*$', r'^총설\s*$')

    def split_text(text):
        """Return the lines of ``text``; non-string input yields ``[]``."""
        if not isinstance(text, str):
            return []
        text = text.replace('\n \n', '\n\n')
        text = text.replace('_x000D__x000D_', '\n')
        # Collapse every run of two or more newlines into a single newline so
        # that splitting does not produce empty lines between paragraphs.
        text = re.sub(r'\n{2,}', '\n', text)
        return text.split('\n')

    def find_heading(parts, patterns):
        """Index of the first line fully matching any pattern, else ``None``."""
        for i, item in enumerate(parts):
            candidate = item.strip()
            if any(re.fullmatch(pattern, candidate) for pattern in patterns):
                return i
        return None

    def get_note_index(parts):
        return find_heading(parts, note_patterns)

    def get_general_index(parts):
        return find_heading(parts, general_patterns)

    def first_line(x):
        # Guard against non-string cells (e.g. NaN), for which split_text
        # returns an empty list and indexing [0] would raise IndexError.
        parts = split_text(x)
        return parts[0] if parts else None

    def process_notes(x):
        parts = split_text(x)
        note_idx = get_note_index(parts)
        if note_idx is None:
            return []
        # A missing GENERAL heading gives None, i.e. slice to the end.
        return parts[note_idx:get_general_index(parts)]

    def process_general(x):
        parts = split_text(x)
        general_idx = get_general_index(parts)
        return [] if general_idx is None else parts[general_idx:]

    def process_des(x):
        parts = split_text(x)
        # The description ends at whichever section heading comes first, so a
        # text with a GENERAL section but no Notes heading does not pull the
        # whole general section into the description.
        boundaries = [
            idx
            for idx in (get_note_index(parts), get_general_index(parts))
            if idx is not None
        ]
        return parts[1:min(boundaries)] if boundaries else parts[1:]

    def int_to_roman(num):
        """Convert an int-like value to its Roman numeral string."""
        num = int(num)
        roman_numerals = [
            ("M", 1000), ("CM", 900), ("D", 500), ("CD", 400),
            ("C", 100), ("XC", 90), ("L", 50), ("XL", 40),
            ("X", 10), ("IX", 9), ("V", 5), ("IV", 4),
            ("I", 1)
        ]

        result = ""
        for roman, value in roman_numerals:
            # Greedily subtract the largest symbol value that still fits.
            while num >= value:
                result += roman
                num -= value
        return result

    # Apply the helpers column by column.
    df['chapter_name'] = df[en_col].apply(first_line)
    df['en_description'] = df[en_col].apply(process_des)
    df['ko_description'] = df[ko_col].apply(process_des)

    df['en_notes'] = df[en_col].apply(process_notes)
    df['ko_notes'] = df[ko_col].apply(process_notes)

    df['en_general'] = df[en_col].apply(process_general)
    df['ko_general'] = df[ko_col].apply(process_general)
    df['_nest_parent_'] = df['ensection'].apply(int_to_roman)

    # Place 'hs' directly after 'chapter_name'. For the four-column chapter
    # frame this is position 5, and it stays correct for wider inputs too.
    df.insert(df.columns.get_loc('chapter_name') + 1, "hs", df["enchapter"])

    return df


In [13]:
# Call the processing function: adds the chapter name, description, notes,
# general, section-numeral and hs columns to chapter_df.
chapter_df = process_chapter_notes(chapter_df, 'enchapter_notes', 'kochapter_notes')

In [14]:
# Drop the section column and the two raw note-text columns.
chapter_df = chapter_df.drop(columns=['ensection','enchapter_notes', 'kochapter_notes'])

In [15]:
# Run fix_atag over every HTML fragment stored in the notes / general list
# columns, one column at a time.
for html_col in ('ko_notes', 'ko_general', 'en_notes', 'en_general'):
    chapter_df[html_col] = chapter_df[html_col].apply(
        lambda fragments: [fix_atag(fragment) for fragment in fragments]
    )

  soup = BeautifulSoup(html, 'html.parser')


In [17]:
# Preview the first two rows of the processed chapter frame.
chapter_df.head(2)

Unnamed: 0,enchapter,chapter_name,hs,en_description,ko_description,en_notes,ko_notes,en_general,ko_general,_nest_parent_
0,1,Chapter 1,1,[Live animals],[살아 있는 동물],"[Note. , 1.- This Chapter covers all live anim...","[주: , 1. 이 류에는 다음 각 목의 것을 제외한 모든 살아 있는 동물이 포함된...","[GENERAL, This Chapter covers all living creat...","[총설, 이 류에는 다음의 것을 제외한 모든 살아 있는 동물(식용이나 그 밖의 용도...",I
6,2,Chapter 2,2,[Meat and edible meat offal ],[육과 식용 설육(屑肉)],"[Note., 1.- This Chapter does not cover :, (a...","[주: , 1. 이 류에서 다음 각 목의 것은 제외한다., 가. 제0201호부터 제...","[GENERAL, This Chapter applies to meat in carc...","[총설, 이 류에는 식용에 적합한 모든 동물[제3류의 어류․갑각류․연체동물․그 밖의...",I


In [16]:
# Write the chapter frame to an Excel workbook, leaving out the index column.
chapter_df.to_excel("ko_en_chapter_notes_2024.xlsx",index = False)