In [1]:
import pandas as pd
import re
import json

In [2]:
# Read the input as JSON Lines: every non-blank line is parsed as one JSON
# document and becomes one record of the DataFrame.
with open('output_file_2.txt', 'r', encoding='utf-8') as file:
    # Blank lines (e.g. stray empty lines at the end of the file) are skipped;
    # json.loads('') would otherwise raise json.JSONDecodeError.
    data = [json.loads(line.strip()) for line in file if line.strip()]
df = pd.DataFrame(data)

In [3]:
def create_section_df(df):
    """Return the distinct heading-level rows of ``df``.

    Only the 'enchapter', 'enheading', 'enheading_notes' and
    'koheading_notes' columns are kept, and rows that repeat across all four
    of them are dropped (the first occurrence and its index label are kept).
    The frame passed in is not modified.
    """
    heading_columns = ['enchapter', 'enheading', 'enheading_notes', 'koheading_notes']
    return df.loc[:, heading_columns].drop_duplicates()

# NOTE(review): the auto-generated provenance comment that used to be here said `df` was loaded
# from 'ko_en_section_chapter_heading_des_notes.xlsx' (under a user-specific OneDrive path), but
# the visible notebook builds `df` from 'output_file_2.txt' — confirm which source is current.

heading_df = create_section_df(df.copy())

In [4]:
from bs4 import BeautifulSoup

def fix_atag(html):
    """Clean the ``<a>`` tags of an HTML fragment and make ``/clip/`` URLs absolute.

    * An anchor that contains at least one ``<img>`` is kept, but its ``rel``,
      ``href`` and ``title`` attributes are removed and the ``alt`` attribute
      is removed from every image inside it.
    * Any other anchor is replaced by its plain text content.
    * In the serialised result, every occurrence of ``"/clip/`` is rewritten
      to ``"https://unipass.customs.go.kr/clip/``.

    Args:
        html: HTML fragment to clean, as accepted by ``BeautifulSoup``.

    Returns:
        str: the cleaned fragment serialised back to a string.
    """
    # Parse the fragment with the built-in HTML parser.
    soup = BeautifulSoup(html, 'html.parser')

    # Visit every <a> tag in the fragment.
    for a_tag in soup.find_all('a'):
        images = a_tag.find_all('img')
        if images:
            # Image link: keep the tag itself but strip its link attributes.
            for attr in ('rel', 'href', 'title'):
                a_tag.attrs.pop(attr, None)
            # Strip `alt` once, from every image in the link (not only the
            # first one, and not once per anchor attribute).
            for img in images:
                img.attrs.pop('alt', None)
        else:
            # Plain link: replace the <a> tag with the text it contains.
            a_tag.replace_with(a_tag.text)

    # Serialise and turn site-relative /clip/ references into absolute URLs.
    return str(soup).replace('"/clip/', '"https://unipass.customs.go.kr/clip/')


In [5]:
def process_section_notes(df, en_col, ko_col):
    """Derive the HS code, descriptions and note lines from raw heading notes.

    The text in ``en_col`` / ``ko_col`` is split into lines. The first line
    supplies the description (its leading code and separator are cut off).
    The notes are the lines after the last line that starts with a
    ``dddd.dd`` code, or everything after the first line when no such line
    exists. Missing (non-string) notes give an empty description and an
    empty note list.

    Args:
        df: DataFrame with 'enchapter' and 'enheading' columns plus the two
            note columns named by ``en_col`` and ``ko_col``.
        en_col: name of the column holding the English notes text.
        ko_col: name of the column holding the Korean notes text.

    Returns:
        A new DataFrame (the input is not modified) with these columns added:
        'hs', 'en_description', 'ko_description' at positions 1-3, and
        'en_des_temp', 'ko_des_temp', 'en_notes', 'ko_notes', 'nest_parent'
        appended at the end.
    """
    # Standard library replacement for the third-party `regex` \p{N} class.
    import unicodedata

    code_line_re = re.compile(r"^\d{4}\.\d{2}\s*-*")

    def split_text(text):
        """Split raw notes into lines; non-string input yields an empty list."""
        if not isinstance(text, str):
            return []
        text = text.replace('\n \n', '\n\n')
        text = text.replace('_x000D__x000D_', '\n')
        # Collapse every run of 2 or more newlines into a single newline so
        # that splitting does not produce empty lines.
        text = re.sub(r'\n{2,}', '\n', text)
        return text.split('\n')

    def first_line(text):
        """Return the first line of the notes, or '' when there is no text."""
        parts = split_text(text)
        return parts[0] if parts else ''

    def get_heading_note_index(parts):
        """Index of the last line starting with a dddd.dd code, else None."""
        matches = [i for i, part in enumerate(parts) if code_line_re.match(part)]
        # None (not 1) marks "not found", so a genuine match at index 1 is
        # not mistaken for the fallback.
        return max(matches) if matches else None

    def process_notes(text):
        """Return the note lines that follow the description / code lines."""
        parts = split_text(text)
        last_code_idx = get_heading_note_index(parts)
        if last_code_idx is None:
            # No code line: only the first (description) line is skipped.
            return parts[1:]
        return parts[last_code_idx + 1:]

    def derive_hs_code(enheading):
        """Format a 3- or 4-digit heading as 'dd.dd'; None for anything else."""
        if not isinstance(enheading, str):
            # Missing values (e.g. pd.NA in a string column) have no code.
            return None
        # Count characters in a Unicode number category (N*), like \p{N}.
        digit_count = sum(
            1 for ch in enheading if unicodedata.category(ch).startswith('N')
        )
        if digit_count == 4:
            return enheading[:2] + "." + enheading[2:]
        if digit_count == 3:
            # Zero-pad the single chapter digit: '910' -> '09.10'.
            return "0" + enheading[:1] + "." + enheading[1:]
        return None

    def strip_code_prefix(line):
        """Cut everything up to the first space plus the next two characters."""
        # NOTE(review): presumably the line looks like '01.01 - text', so the
        # +3 skips the ' - ' separator; confirm against the source data.
        line = line.strip()
        return line[line.find(" ") + 3:]

    df = df.astype({'enheading': 'string'})

    df['en_des_temp'] = df[en_col].apply(first_line)
    df['ko_des_temp'] = df[ko_col].apply(first_line)

    # Plain lists are inserted positionally, so an empty frame works too.
    df.insert(1, "hs", [derive_hs_code(code) for code in df["enheading"]])
    df.insert(2, "en_description", [strip_code_prefix(line) for line in df["en_des_temp"]])
    df.insert(3, "ko_description", [strip_code_prefix(line) for line in df["ko_des_temp"]])

    df['en_notes'] = df[en_col].apply(process_notes)
    df['ko_notes'] = df[ko_col].apply(process_notes)
    df['nest_parent'] = df['enchapter']

    return df


In [7]:
# Run the note-processing step on the heading frame.
heading_df_2 = process_section_notes(
    df=heading_df,
    en_col='enheading_notes',
    ko_col='koheading_notes',
)


In [9]:
# Remove the raw input and temporary columns, keeping only the derived ones.
# errors='ignore' keeps this cell safe to re-run: heading_df_2 is reassigned
# here, so a second execution would otherwise raise KeyError because the
# columns are already gone.
heading_df_2 = heading_df_2.drop(
    columns=['enchapter', 'enheading', 'enheading_notes', 'koheading_notes', 'en_des_temp', 'ko_des_temp'],
    errors='ignore',
)


In [10]:
# Pass every note line through fix_atag (Korean notes first, then English).
heading_df_2['ko_notes'] = heading_df_2['ko_notes'].map(lambda notes: list(map(fix_atag, notes)))
heading_df_2['en_notes'] = heading_df_2['en_notes'].map(lambda notes: list(map(fix_atag, notes)))

  soup = BeautifulSoup(html, 'html.parser')


In [12]:
# Preview the first two rows of the processed frame.
heading_df_2.head(2)

Unnamed: 0,hs,en_description,ko_description,en_notes,ko_notes,nest_parent
0,1.01,"Live horses, asses, mules and hinnies (+).",살아 있는 말ㆍ당나귀ㆍ노새ㆍ버새(+),"[This heading covers horses (mares, stallions,...",[이 호에는 말(암컷의 말․번식용 말․거세한 말․새끼말․조랑말)․당나귀․노새와 버새...,1
1,1.02,Live bovine animals (+).,살아 있는 소(+),"[(1) Cattle :, This category covers bovine ...","[(1) 축우(畜牛 : cattle), 이 범주에는 보스(Bos)속의 소과 동...",1


In [11]:
# Write the processed frame to an Excel workbook, leaving out the index column.
heading_df_2.to_excel(
    "ko_en_heading_notes_2024.xlsx",
    index=False,
)