In [1]:
from bs4 import BeautifulSoup, Tag
from urllib import request
import json
from pathlib import Path
import os
import requests
import numpy as np

In [None]:
def get_content_url(url):
    page = request.urlopen(url)
    soup = BeautifulSoup(page, "html.parser", from_encoding="utf-8")
    return soup

In [99]:
def get_content_panes(soup):
    panes = soup.find_all("div", class_="panes")
    return panes

In [100]:
url = "https://www.gdt.gov.vn/wps/portal/!ut/p/z1/tZJNc4IwEIb_Sj14ZLIhGOCI1BEUdZxOqeTChE9pIWCboR-_3qC9Wqdjm0N2dnbz7LuZFzG0Q0zwviq5rFrBa5VHjMbudumGyxAA7IkO_moW3gcOhQ0GFCKGWCpkJ_coKjN5l7ZC5kKO4Y3HKj_FnouEizH0yT4737FMVanOS17HWHGxHWNdN4k94Lq0ylBETZoWVKdakVLQDD7JtCTJbY1gkhkWBt1MdPR0TR9TZbhwHFDv2anFnTueYQYAljEH8I3pZu25Www--W74gREpDeZFDZ4S2Vf5O3oU7WujvvThlyt6VyfgGydcwZN_xdtwI36BWFm3ydms1fPhwBzlyMGFHxLt_sKSXdNY5FN7KVYzYkSL_mu61oYQOKPREevg9Pw!/dz/d5/L2dBISEvZ0FBIS9nQSEh/"
soup = get_content_url(url)
panes = get_content_panes(soup)

In [101]:
len(panes)

TypeError: 'numpy.ndarray' object is not callable

In [3]:
def get_file_and_metadata(url):
    """
    get file và tạo metadata cho file khi request url
    """
    prefix_url = "https://www.gdt.gov.vn"
    metadata_keys = ["documentName", "numberSymbol", "issueDate", "effectiveDate", "expirationDate", "signedBy", "replacedDocument", "issuingAuthority",
                     "documentStatus", "attachedFile"]
    page = request.urlopen(url)
    soup = BeautifulSoup(page, 'html.parser', from_encoding="utf-8")
    post_path = soup.find("a", class_="linkDown").get_attribute_list(key="href")[0]
    name = post_path.split("?MOD=")[0].split("/")[-1]
    response = requests.get(prefix_url+post_path)

    if response.status_code == 200:
        with open(f"thue_ttcn/{name}", "wb") as f:
            f.write(response.content)
            f.close()
        print("File downloaded successfully")
    else:
        print("Failed to download file")
        return
    
    table = soup.find("table", id="detail_legal")
    elements = table.find_all("td")
    metadata = {element1: element2.text.strip() for element1, element2 in zip(metadata_keys, elements[1::2])}
    
    with open(f"thue_ttcn/{name}.metadata.json", "w", encoding="utf-8") as f:
        json.dump(metadata, f, ensure_ascii=False, indent=4)
        f.close()
    print("Save metadata file successfully")
    print(f"Name: {name}\nMetadata: {json.dumps(metadata)}")

In [14]:
def print_descendant_texts(tag):
    """
    Get text and preprocessing đối với những url không có linkDown, hiển thị trực tiếp văn bản
    """
    for child in tag.children:
        if isinstance(child, str):
            if len(child.split()) > 0:
                print("Check: ", child.string.strip())
        elif isinstance(child, Tag):
            if child.name == "p" and child.children:
                for sub_child in child.children:
                    if len(sub_child.text.split()) == 0:
                        continue
                        # print("Check: ", sub_child.text, "end")
                    print("Check: "," ".join(sub_child.text.strip().split()))
            elif child.name == "style":
                continue
            else:
                print_descendant_texts(child)


In [78]:
def get_data_from_table_tag(table_tag):
    """
    Lấy dữ liệu từng cell trong table tag
    """
    # location = {}
    table_data = []
    # max_merged_cell = 0
    for row_id, row in enumerate(table_tag.find_all("tr")):
        row_data = []
        for col_id, data in enumerate(row.find_all("td")):
            if "colspan" in data.attrs.keys():
                merged_cell_num = data.attrs['colspan']
                # if int(merged_cell_num) > max_merged_cell:
                #     max_merged_cell = merged_cell_num
                #     location["row_index"] = row_id
                #     location["column_index"] = col_id
                for i in range(int(merged_cell_num)):
                    row_data.append(" ".join(data.text.replace("\n", "").split()))
            else:
                row_data.append(" ".join(data.text.replace("\n", "").split()))
        table_data.append(row_data)
    return np.array(table_data)

In [85]:
def convert_table_data_to_markdown_format(table_data):
    len_table_data = np.char.str_len(table_data)

    # max len cho từng column
    max_len_table_data = len_table_data.max(axis=0)

    # length lớn nhất của element đã được thêm khoảng trắng cho từng column
    max_spaces = max_len_table_data + 4
    
    content = """"""
    count = 1
    
    for row, len in zip(table_data, len_table_data):    
        content += "|"
        for element, len_element, max_len in zip(row, len, max_len_table_data):
            # Khoảng trắng cần thêm vào
            align_space = (max_len-len_element)/2

            if np.round(align_space) == align_space:
                left_space, right_space = int(align_space), int(align_space)
            else:
                left_space, right_space = int(align_space+1), int(align_space)
            
            content += " " * left_space + element + " " * right_space + "|"
        content += """\n"""
        
        if count == 1:
            content += "|"
            for max_len in max_len_table_data:
                content += ("-" * max_len) + "|"
            content += "\n"
        
        count += 1
    return content

In [86]:
table = soup.find("table", class_="MsoNormalTable")
table

<table border="1" cellpadding="0" cellspacing="0" class="MsoNormalTable" style="border-collapse: collapse; border: medium none; margin-left: 12.5pt">
<tr>
<td style="width: 46.6pt; border: 1pt solid blue; padding-left: 5.4pt; padding-right: 5.4pt; padding-top: 0in; padding-bottom: 0in" valign="top" width="62">
<p align="center" class="MsoNormal" style="TEXT-INDENT: 0in; TEXT-ALIGN: center">
<b><span style="font-family: Arial"><font size="2">Bậc</font></span></b></p></td>
<td colspan="2" style="width: 123.85pt; border-left: medium none; border-right: 1pt solid blue; border-top: 1pt solid blue; border-bottom: 1pt solid blue; padding-left: 5.4pt; padding-right: 5.4pt; padding-top: 0in; padding-bottom: 0in" valign="top" width="165">
<p align="center" class="MsoNormal" style="TEXT-INDENT: 0in; TEXT-ALIGN: center">
<b><span style="font-family: Arial"><font size="2">Thu nhập 
      bình quân</font></span></b></p>
<p align="center" class="MsoNormal" style="TEXT-INDENT: 0in; TEXT-ALIGN: center"

In [87]:
table_data = get_data_from_table_tag(table)
table_markdown_format = convert_table_data_to_markdown_format(table_data)

In [89]:
table_markdown_format

'|Bậc|Thu nhập bình quântháng/người|Thu nhập bình quântháng/người|Thuế suất| Số thuế phải nộp |\n|---|-----------------------------|-----------------------------|---------|------------------|\n| 1 |                             |          Đến 1.200          |    0    |         0        |\n| 2 |          Trên 1.200         |          đến 2.000          |   10%   | TNCT x 10% - 120 |\n| 3 |          Trên 2.000         |          đến 3.000          |   20%   | TNCT x 20% - 320 |\n| 4 |          Trên 3.000         |          đến 4.000          |   30%   | TNCT x 30% - 620 |\n| 5 |          Trên 4.000         |          đến 6.000          |   40%   |TNCT x 40% - 1.020|\n| 6 |          Trên 6.000         |          đến 8.000          |   50%   |TNCT x 50% - 1.620|\n| 7 |          Trên 8.000         |                             |   60%   |TNCT x 60% - 2.420|\n'

In [96]:
with open("preview.txt", "+a") as f:
    f.write(table_markdown_format+"!!!!")
    f.close()

In [97]:
url = "https://www.gdt.gov.vn/wps/portal?1dmy&mapping=home/documents/detail&urile=wcm%3apath%3a/GDT%20Content/sa_gdt/sa_vanban/vbhd/vbhd_tct/legal_100019_122739"
get_file_and_metadata(url)

AttributeError: 'NoneType' object has no attribute 'get_attribute_list'