In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import os
import re
import requests
from tqdm import tqdm

- Your code will go over each index html file in the BoPhapDienDienTu/demuc directory, and open and save the html documents linked from the index page, and save them to the BoPhapDienDienTu/vbpl directory in the form full_ItemID.html. 
	- For example for the document at "https://vbpl.vn/TW/Pages/vbpq-toanvan.aspx?ItemID=148955&Keyword=", your code will save it as full_148955.html
- Also save the property page for each document in the BoPhapDienDienTu/property directory in the form p_ItemID.html. 
	- For example, for the same document above, the property page is at "https://vbpl.vn/tw/Pages/vbpq-thuoctinh.aspx?dvid=13&ItemID=148955&Keyword=", your code will save it as p_148955.html
- Also save the history page for each document in BoPhapDienDienTu/history. 
	- For example, for the same document above, the history page is at "https://vbpl.vn/tw/Pages/vbpq-lichsu.aspx?dvid=13&ItemID=148955&Keyword=", your code will save it as h_148955.html
- Also save the related document page for each document in BoPhapDienDienTu/related. 
	- For example, for the same document above, the related document page is at https://vbpl.vn/TW/Pages/vbpq-vanbanlienquan.aspx?ItemID=148955&Keyword=, your code will save it as r_148955.html
- Also save the PDF for each document in BoPhapDienDienTu/pdf. 
	- For example, for the same document above, the PDF file is at https://vbpl.vn/tw/Pages/vbpq-van-ban-goc.aspx?ItemID=148955, your code will save it as pdf_148955.pdf

In [None]:
file_paths = []
folder_path = rf"D:\PhapDien_semantic_search\BoPhapDienDienTu\demuc"

for root, dirs, files in os.walk(folder_path):
    for file_name in files:
        # Construct full file path
        html_file_path = os.path.join(root, file_name)
        if html_file_path.endswith(".html"):
            file_paths.append(html_file_path)
print(len(file_paths))

all_links = []            
for html_file_path in tqdm(file_paths, desc="Processing HTML files"):
    with open(html_file_path, "r", encoding="utf-8") as file:
        html_content = file.read()

    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all <a> tags
    anchors = soup.find_all('a')

    # Extract the hyperlinks from each <a> tag
    http_hrefs = [
        anchor.get('href') 
        for anchor in anchors 
        if anchor.get('href') and anchor.get('href').startswith("http") and ("ItemID=" in anchor.get('href'))
    ]
    
    # Use regex to filter and deduplicate links
    deduped_links = list(set([
        re.sub(r"(&.*$)|(#.*)?$", "", link)
        for link in http_hrefs
    ]))
    all_links.extend(deduped_links)

In [None]:
all_links = list(set(all_links))
print(len(all_links))

# Save the links to a text file
with open("vbpl_links.txt", "w", encoding="utf-8") as file:
	for link in all_links:
		file.write(link + "\n")

### Retrieve HTML documents

In [2]:
driver = webdriver.Chrome()
driver.maximize_window()
options = Options()
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument("--headless")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

In [3]:
f_url = "https://vbpl.vn/TW/Pages/vbpq-toanvan.aspx?"
property_url = "https://vbpl.vn/tw/Pages/vbpq-thuoctinh.aspx?dvid=13&"
history_url = "https://vbpl.vn/tw/Pages/vbpq-lichsu.aspx?dvid=13&"
related_url = "https://vbpl.vn/TW/Pages/vbpq-vanbanlienquan.aspx?"
pdf_url = "https://vbpl.vn/tw/Pages/vbpq-van-ban-goc.aspx?"

In [5]:
# Read the links from the file and extract IDs
with open("vbpl_links.txt", "r", encoding="utf-8") as file:
    all_links = file.read().splitlines()
    all_ids = [link.split("ItemID=")[1] for link in all_links]

In [None]:
def retrieval_tool(url, option, id_list=all_ids, output_url="p"):
    timed_out_ids = []       # deal with network errors
    different_url_ids = []	 # deal with redirections
    for id in tqdm(id_list, desc=f"Retrieving {option} HTML"):
        full_url = f"{url}ItemID={id}"
        
        # Check for redirection using requests
        try:
            response = requests.get(full_url, allow_redirects=True, timeout=10)
        except Exception as e:
            print(f"Error fetching {full_url}: {e}")
            timed_out_ids.append(id)
            continue
        
        # If the final URL after redirection is different, skip this ID
        if response.url != full_url:
            different_url_ids.append(id)
            continue
        
        # If no redirect is detected, proceed with Selenium scraping
        driver.get(full_url)
        time.sleep(random.uniform(0.5, 1))
        html_content = driver.page_source
        
        # Save the HTML content to a file
        file_path = rf"D:\PhapDien_semantic_search\BoPhapDienDienTu\{option}\{output_url}_{id}.html"
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(html_content)
            
    return timed_out_ids, different_url_ids

# p_time_out_ids = retrieval_tool(url=property_url, option="property", output_url="p")[0]
# print(len(p_time_out_ids))
# h_time_out_ids =  retrieval_tool(url=history_url, option="history", output_url="h")[0]
# print(len(h_time_out_ids))
# r_time_out_ids = retrieval_tool(url=related_url, option="related", output_url="r")[0]
# print(len(r_time_out_ids))
# # retrieval_tool(url=f_url, option="full", output_url="full")[0]  # already loaded

Retrieving related HTML: 100%|██████████| 5956/5956 [5:46:54<00:00,  3.49s/it]   

0





After scraping around 700 pages, I realized that sometimes, the URLs to the full page redirects to a different page (the 'van-ban-goc' or PDF page instead). I deleted these instances, since the full page did not seem to exist. This also prompted me to add a check for hyperlink redirection.

In [5]:
wrong = []
with open("vbpl_links.txt", "r", encoding="utf-8") as file:
    all_links = file.read().splitlines()
    all_ids = [link.split("ItemID=")[1] for link in all_links]

# Loop through the IDs and process each
for id in tqdm(all_ids[:694], desc="Retrieving HTML documents"):
    full_url = f"{f_url}ItemID={id}"
    
    # Check for redirection using requests
    try:
        response = requests.get(full_url, allow_redirects=True, timeout=10)
    except Exception as e:
        print(f"Error fetching {full_url}: {e}")
        continue
    
    if response.url != full_url:
        wrong.append(rf"D:\PhapDien_semantic_search\BoPhapDienDienTu\vbpl\full_{id}.html")

print(len(wrong))
    
for file_path in wrong:
    try:
        if os.path.exists(file_path):
            os.remove(file_path)
            print(f"Deleted: {file_path}")
        else:
            print(f"File not found: {file_path}")
    except Exception as e:
        print(f"Error deleting {file_path}: {e}")


Retrieving HTML documents:  66%|██████▋   | 460/694 [19:32<19:10,  4.92s/it]

Error fetching https://vbpl.vn/TW/Pages/vbpq-toanvan.aspx?ItemID=106063: HTTPSConnectionPool(host='vbpl.vn', port=443): Read timed out. (read timeout=10)


Retrieving HTML documents:  66%|██████▋   | 461/694 [19:42<26:00,  6.70s/it]

Error fetching https://vbpl.vn/TW/Pages/vbpq-toanvan.aspx?ItemID=129642: HTTPSConnectionPool(host='vbpl.vn', port=443): Read timed out. (read timeout=10)


Retrieving HTML documents: 100%|██████████| 694/694 [29:20<00:00,  2.54s/it]

107
Deleted: D:\PhapDien_semantic_search\BoPhapDienDienTu\vbpl\full_144824.html
Deleted: D:\PhapDien_semantic_search\BoPhapDienDienTu\vbpl\full_134555.html
Deleted: D:\PhapDien_semantic_search\BoPhapDienDienTu\vbpl\full_139899.html
Deleted: D:\PhapDien_semantic_search\BoPhapDienDienTu\vbpl\full_119137.html
Deleted: D:\PhapDien_semantic_search\BoPhapDienDienTu\vbpl\full_134205.html
Deleted: D:\PhapDien_semantic_search\BoPhapDienDienTu\vbpl\full_151083.html
Deleted: D:\PhapDien_semantic_search\BoPhapDienDienTu\vbpl\full_151826.html
Deleted: D:\PhapDien_semantic_search\BoPhapDienDienTu\vbpl\full_136138.html
Deleted: D:\PhapDien_semantic_search\BoPhapDienDienTu\vbpl\full_120916.html
Deleted: D:\PhapDien_semantic_search\BoPhapDienDienTu\vbpl\full_133858.html
Deleted: D:\PhapDien_semantic_search\BoPhapDienDienTu\vbpl\full_18142.html
Deleted: D:\PhapDien_semantic_search\BoPhapDienDienTu\vbpl\full_110915.html
Deleted: D:\PhapDien_semantic_search\BoPhapDienDienTu\vbpl\full_146048.html
Deleted: 




In [61]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from tqdm import tqdm

# Define the output directory and ensure it exists.
output_dir = rf"D:\PhapDien_semantic_search\BoPhapDienDienTu\pdf"
os.makedirs(output_dir, exist_ok=True)

def extract_pdf_link(page_url, _id, timeout=20):
    """
    Fetches the HTML page at page_url, then searches for an embedded PDF link.
    Checks for attributes: "original-url", "original_url", "src", or "href".
    Returns the first found PDF URL or None if not found.
    """
    try:
        response = requests.get(page_url, timeout=timeout)
    except Exception as e:
        print(f"Error fetching {page_url}: {e}")
        return None

    if response.status_code != 200:
        print(f"Non-200 response from {page_url}: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, "html.parser")
    check = soup.find("li", {"class": "red"}).text
    if "Hết hiệu lực toàn bộ" in check:
        return None

    url_tail = str(soup.find("li", {"class": "green"}).text.strip().split(" ")[-1]).replace("/", "_")
    if "NĐ-CP" in url_tail:
        url_tail = url_tail.replace("NĐ-CP", "N%C4%90-CP")  # if possible, change to the URL-encoded form
    print(f"https://vbpl.vn/FileData/TW/Lists/vbpq/Attachments/{_id}/VanBanGoc_{url_tail}.pdf")
    return f"https://vbpl.vn/FileData/TW/Lists/vbpq/Attachments/{_id}/VanBanGoc_{url_tail}.pdf"

def download_pdf_by_id_list(_id, base_url, option="pdf", timeout=20):
    """
    For each ID, constructs a page URL using base_url and the ID,
    scrapes the page for the embedded PDF link, then downloads the PDF.
    
    Returns a tuple:
      (timed_out_ids, not_found_ids, fail_downloads)
    """
    page_url = f"{base_url}ItemID={_id}"
    print(f"\nProcessing ID {_id}: {page_url}")
    
    # Scrape the page to extract the PDF link.
    pdf_link = extract_pdf_link(page_url, _id, timeout=timeout)
    if not pdf_link:
        not_found_ids.append(_id)
        return
    
    # Attempt to download the PDF.
    try:
        pdf_response = requests.get(pdf_link, allow_redirects=True, timeout=timeout)
    except Exception as e:
        print(f"Error downloading PDF from {pdf_link} for ID {_id}: {e}")
        timed_out_ids.append(_id)
        return

    if pdf_response.status_code == 200:
        # Derive a filename from the PDF link.
        filename = os.path.basename(pdf_link)
        if not filename.lower().endswith(".pdf"):
            filename = f"{option}_{_id}.pdf"
        filepath = os.path.join(output_dir, filename)
        try:
            with open(filepath, "wb") as f:
                f.write(pdf_response.content)
            print(f"Downloaded PDF for ID {_id} to {filepath}")
        except Exception as e:
            print(f"Error saving PDF for ID {_id} to {filepath}: {e}")
            fail_downloads.append(_id)
    else:
        print(f"Failed to download PDF for ID {_id}: status code {pdf_response.status_code}")
        fail_downloads.append(_id)

timed_out_ids = []  # IDs that timed out or had network errors.
not_found_ids = []  # IDs for which no PDF link was found.
fail_downloads = [] # IDs for which the PDF download failed.

# Example all_ids and pdf_url should be defined before this block.
for id in tqdm(all_ids[:10], desc="processing ids"):
    download_pdf_by_id_list(id, pdf_url, option="pdf")

print("\nSummary:")
print(f"IDs with no PDF found: {len(not_found_ids)}")
print(f"Timed out IDs: {len(timed_out_ids)}")
print(f"Download failures: {len(fail_downloads)}")


processing ids:   0%|          | 0/10 [00:00<?, ?it/s]


Processing ID 130588: https://vbpl.vn/tw/Pages/vbpq-van-ban-goc.aspx?ItemID=130588


processing ids:  10%|█         | 1/10 [00:01<00:15,  1.76s/it]


Processing ID 94078: https://vbpl.vn/tw/Pages/vbpq-van-ban-goc.aspx?ItemID=94078
https://vbpl.vn/FileData/TW/Lists/vbpq/Attachments/94078/VanBanGoc_71_2015_TT-BGTVT.pdf


processing ids:  20%|██        | 2/10 [00:04<00:19,  2.38s/it]

Failed to download PDF for ID 94078: status code 404

Processing ID 37479: https://vbpl.vn/tw/Pages/vbpq-van-ban-goc.aspx?ItemID=37479
https://vbpl.vn/FileData/TW/Lists/vbpq/Attachments/37479/VanBanGoc_11_2014_TT-BTTTT.pdf


processing ids:  30%|███       | 3/10 [00:09<00:24,  3.50s/it]

Downloaded PDF for ID 37479 to D:\PhapDien_semantic_search\BoPhapDienDienTu\pdf\VanBanGoc_11_2014_TT-BTTTT.pdf

Processing ID 43494: https://vbpl.vn/tw/Pages/vbpq-van-ban-goc.aspx?ItemID=43494
https://vbpl.vn/FileData/TW/Lists/vbpq/Attachments/43494/VanBanGoc_12_2013_TT-BTNMT.pdf


processing ids:  40%|████      | 4/10 [00:12<00:19,  3.19s/it]

Failed to download PDF for ID 43494: status code 404

Processing ID 30363: https://vbpl.vn/tw/Pages/vbpq-van-ban-goc.aspx?ItemID=30363
https://vbpl.vn/FileData/TW/Lists/vbpq/Attachments/30363/VanBanGoc_86_2012_N%C4%90-CP.pdf


processing ids:  50%|█████     | 5/10 [00:17<00:19,  3.96s/it]

Downloaded PDF for ID 30363 to D:\PhapDien_semantic_search\BoPhapDienDienTu\pdf\VanBanGoc_86_2012_N%C4%90-CP.pdf

Processing ID 15967: https://vbpl.vn/tw/Pages/vbpq-van-ban-goc.aspx?ItemID=15967
https://vbpl.vn/FileData/TW/Lists/vbpq/Attachments/15967/VanBanGoc_20_2005_QĐ-BGTVT.pdf


processing ids:  60%|██████    | 6/10 [00:20<00:14,  3.55s/it]

Failed to download PDF for ID 15967: status code 404

Processing ID 144824: https://vbpl.vn/tw/Pages/vbpq-van-ban-goc.aspx?ItemID=144824


processing ids:  70%|███████   | 7/10 [00:21<00:08,  2.91s/it]


Processing ID 134555: https://vbpl.vn/tw/Pages/vbpq-van-ban-goc.aspx?ItemID=134555
https://vbpl.vn/FileData/TW/Lists/vbpq/Attachments/134555/VanBanGoc_27_2019_N%C4%90-CP.pdf


processing ids:  80%|████████  | 8/10 [00:24<00:05,  2.85s/it]

Failed to download PDF for ID 134555: status code 404

Processing ID 40233: https://vbpl.vn/tw/Pages/vbpq-van-ban-goc.aspx?ItemID=40233
https://vbpl.vn/FileData/TW/Lists/vbpq/Attachments/40233/VanBanGoc_44_2012_TT-BGTVT.pdf


processing ids:  90%|█████████ | 9/10 [00:27<00:02,  2.83s/it]

Failed to download PDF for ID 40233: status code 404

Processing ID 14568: https://vbpl.vn/tw/Pages/vbpq-van-ban-goc.aspx?ItemID=14568
https://vbpl.vn/FileData/TW/Lists/vbpq/Attachments/14568/VanBanGoc_21_2006_QĐ-BTNMT.pdf


processing ids: 100%|██████████| 10/10 [00:30<00:00,  3.00s/it]

Failed to download PDF for ID 14568: status code 404

Summary:
IDs with no PDF found: 2
Timed out IDs: 0
Download failures: 6





In [None]:
for id in tqdm(all_ids[85:], desc="Downloading PDFs"):
    full_url = f"{f_url}ItemID={id}"
    # Download the HTML of the page
    response = requests.get(full_url)
    if full_url != response.url: # no pdf -> skip
        continue
    html = response.text

    # Parse the HTML
    soup = BeautifulSoup(html, 'html.parser')

    # Find all links that use "downloadfile"
    links = soup.find_all('a', href=re.compile("downloadfile\\("))
    pdf_link = None

    # Iterate to find the link whose first parameter ends with .pdf
    for a in links:
        js_href = a['href']
        match = re.search(r"downloadfile\('([^']+)','([^']+)'\)", js_href)
        if match:
            filename = match.group(1)  # file name
            if filename.lower().endswith('.pdf'):
                pdf_link = match
                break
            
    if pdf_link:
        pdf_filename = rf"D:\PhapDien_semantic_search\BoPhapDienDienTu\pdf\pdf_{id}.pdf"
        pdf_path = pdf_link.group(2)
        pdf_web_url = urljoin(full_url, pdf_path)
        # Download the PDF
        pdf_response = requests.get(pdf_web_url)
        if pdf_response.status_code == 200:
            with open(pdf_filename, "wb") as f:
                f.write(pdf_response.content)
        else:
            print("Error downloading PDF. Code:", pdf_response.status_code)
    else:
        continue  # no pdf -> skip


Downloading PDFs:   0%|          | 0/5871 [00:00<?, ?it/s]

Downloading PDFs:   6%|▌         | 363/5871 [37:24<5:31:03,  3.61s/it] 

Error downloading PDF. Code: 404


Downloading PDFs:   8%|▊         | 465/5871 [46:16<6:57:50,  4.64s/it] 