In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import os
import re
import requests
from tqdm import tqdm

- Your code will go over each index html file in the BoPhapDienDienTu/demuc directory, and open and save the html documents linked from the index page, and save them to the BoPhapDienDienTu/vbpl directory in the form full_ItemID.html. 
	- For example for the document at "https://vbpl.vn/TW/Pages/vbpq-toanvan.aspx?ItemID=148955&Keyword=", your code will save it as full_148955.html
- Also save the property page for each document in the BoPhapDienDienTu/property directory in the form p_ItemID.html. 
	- For example, for the same document above, the property page is at "https://vbpl.vn/tw/Pages/vbpq-thuoctinh.aspx?dvid=13&ItemID=148955&Keyword=", your code will save it as p_148955.html
- Also save the history page for each document in BoPhapDienDienTu/history. 
	- For example, for the same document above, the history page is at "https://vbpl.vn/tw/Pages/vbpq-lichsu.aspx?dvid=13&ItemID=148955&Keyword=", your code will save it as h_148955.html
- Also save the related document page for each document in BoPhapDienDienTu/related. 
	- For example, for the same document above, the related document page is at https://vbpl.vn/TW/Pages/vbpq-vanbanlienquan.aspx?ItemID=148955&Keyword=, your code will save it as r_148955.html
- Also save the PDF for each document in BoPhapDienDienTu/pdf. 
	- For example, for the same document above, the PDF file is at https://vbpl.vn/tw/Pages/vbpq-van-ban-goc.aspx?ItemID=148955, your code will save it as pdf_148955.pdf

In [None]:
file_paths = []
folder_path = rf"D:\PhapDien_semantic_search\BoPhapDienDienTu\demuc"

for root, dirs, files in os.walk(folder_path):
    for file_name in files:
        # Construct full file path
        html_file_path = os.path.join(root, file_name)
        if html_file_path.endswith(".html"):
            file_paths.append(html_file_path)
print(len(file_paths))

all_links = []            
for html_file_path in tqdm(file_paths, desc="Processing HTML files"):
    with open(html_file_path, "r", encoding="utf-8") as file:
        html_content = file.read()

    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all <a> tags
    anchors = soup.find_all('a')

    # Extract the hyperlinks from each <a> tag
    http_hrefs = [
        anchor.get('href') 
        for anchor in anchors 
        if anchor.get('href') and anchor.get('href').startswith("http") and ("ItemID=" in anchor.get('href'))
    ]
    
    # Use regex to filter and deduplicate links
    deduped_links = list(set([
        re.sub(r"(&.*$)|(#.*)?$", "", link)
        for link in http_hrefs
    ]))
    all_links.extend(deduped_links)

In [None]:
all_links = list(set(all_links))
print(len(all_links))

# Save the links to a text file
with open("vbpl_links.txt", "w", encoding="utf-8") as file:
	for link in all_links:
		file.write(link + "\n")

5956


### Retrieve HTML documents

In [2]:
driver = webdriver.Chrome()
driver.maximize_window()
options = Options()
options.add_argument("--disable-blink-features=AutomationControlled")

In [3]:
f_url = "https://vbpl.vn/TW/Pages/vbpq-toanvan.aspx?"
property_url = "https://vbpl.vn/tw/Pages/vbpq-thuoctinh.aspx?dvid=13&"
history_url = "https://vbpl.vn/tw/Pages/vbpq-lichsu.aspx?dvid=13&"
related_url = "https://vbpl.vn/TW/Pages/vbpq-vanbanlienquan.aspx?"
pdf_url = "https://vbpl.vn/tw/Pages/vbpq-van-ban-goc.aspx?"

In [None]:
# Read the links from the file and extract IDs
with open("vbpl_links.txt", "r", encoding="utf-8") as file:
    all_links = file.read().splitlines()
    all_ids = [link.split("ItemID=")[1] for link in all_links]

def retrieval_tool(url, option, id_list=all_ids, output_url="p"):
    timed_out_ids = []       # deal with network errors
    different_url_ids = []	 # deal with redirections
    for id in tqdm(id_list[2676+145:], desc=f"Retrieving {option} HTML"):
        full_url = f"{url}ItemID={id}"
        
        # Check for redirection using requests
        try:
            response = requests.get(full_url, allow_redirects=True, timeout=10)
        except Exception as e:
            print(f"Error fetching {full_url}: {e}")
            timed_out_ids.append(id)
            continue
        
        # If the final URL after redirection is different, skip this ID
        if response.url != full_url:
            different_url_ids.append(id)
            continue
        
        # If no redirect is detected, proceed with Selenium scraping
        driver.get(full_url)
        time.sleep(random.uniform(0.5, 1))
        html_content = driver.page_source
        
        # Save the HTML content to a file
        file_path = rf"D:\PhapDien_semantic_search\BoPhapDienDienTu\{option}\{output_url}_{id}.html"
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(html_content)
            
    return timed_out_ids, different_url_ids

p_time_out_ids = retrieval_tool(url=property_url, option="property", output_url="p")[0]
print(len(p_time_out_ids))
# retrieval_tool(history_url, "history", "h")[0]
# retrieval_tool(related_url, "related", "r")[0]
# retrieval_tool(pdf_url, "PDF","pdf")[0]

Retrieving property HTML:   4%|▍         | 145/3280 [08:49<3:10:58,  3.66s/it]


KeyboardInterrupt: 

After scraping around 700 pages, I realized that sometimes, the URLs to the full page redirects to a different page (the 'van-ban-goc' or PDF page instead). I deleted these instances, since the full page did not seem to exist.

In [5]:
wrong = []
with open("vbpl_links.txt", "r", encoding="utf-8") as file:
    all_links = file.read().splitlines()
    all_ids = [link.split("ItemID=")[1] for link in all_links]

# Loop through the IDs and process each
for id in tqdm(all_ids[:694], desc="Retrieving HTML documents"):
    full_url = f"{f_url}ItemID={id}"
    
    # Check for redirection using requests
    try:
        response = requests.get(full_url, allow_redirects=True, timeout=10)
    except Exception as e:
        print(f"Error fetching {full_url}: {e}")
        continue
    
    if response.url != full_url:
        wrong.append(rf"D:\PhapDien_semantic_search\BoPhapDienDienTu\vbpl\full_{id}.html")

print(len(wrong))
    
for file_path in wrong:
    try:
        if os.path.exists(file_path):
            os.remove(file_path)
            print(f"Deleted: {file_path}")
        else:
            print(f"File not found: {file_path}")
    except Exception as e:
        print(f"Error deleting {file_path}: {e}")


Retrieving HTML documents:  66%|██████▋   | 460/694 [19:32<19:10,  4.92s/it]

Error fetching https://vbpl.vn/TW/Pages/vbpq-toanvan.aspx?ItemID=106063: HTTPSConnectionPool(host='vbpl.vn', port=443): Read timed out. (read timeout=10)


Retrieving HTML documents:  66%|██████▋   | 461/694 [19:42<26:00,  6.70s/it]

Error fetching https://vbpl.vn/TW/Pages/vbpq-toanvan.aspx?ItemID=129642: HTTPSConnectionPool(host='vbpl.vn', port=443): Read timed out. (read timeout=10)


Retrieving HTML documents: 100%|██████████| 694/694 [29:20<00:00,  2.54s/it]

107
Deleted: D:\PhapDien_semantic_search\BoPhapDienDienTu\vbpl\full_144824.html
Deleted: D:\PhapDien_semantic_search\BoPhapDienDienTu\vbpl\full_134555.html
Deleted: D:\PhapDien_semantic_search\BoPhapDienDienTu\vbpl\full_139899.html
Deleted: D:\PhapDien_semantic_search\BoPhapDienDienTu\vbpl\full_119137.html
Deleted: D:\PhapDien_semantic_search\BoPhapDienDienTu\vbpl\full_134205.html
Deleted: D:\PhapDien_semantic_search\BoPhapDienDienTu\vbpl\full_151083.html
Deleted: D:\PhapDien_semantic_search\BoPhapDienDienTu\vbpl\full_151826.html
Deleted: D:\PhapDien_semantic_search\BoPhapDienDienTu\vbpl\full_136138.html
Deleted: D:\PhapDien_semantic_search\BoPhapDienDienTu\vbpl\full_120916.html
Deleted: D:\PhapDien_semantic_search\BoPhapDienDienTu\vbpl\full_133858.html
Deleted: D:\PhapDien_semantic_search\BoPhapDienDienTu\vbpl\full_18142.html
Deleted: D:\PhapDien_semantic_search\BoPhapDienDienTu\vbpl\full_110915.html
Deleted: D:\PhapDien_semantic_search\BoPhapDienDienTu\vbpl\full_146048.html
Deleted: 




In [None]:
with open("vbpl_links.txt", "r", encoding="utf-8") as file:
    all_links = file.read().splitlines()
    all_ids = [link.split("ItemID=")[1] for link in all_links]

# Loop through the IDs and process each
time_out_ids = []
for id in tqdm(all_ids[2474:], desc="Retrieving HTML documents"):
    full_url = f"{f_url}ItemID={id}"
    
    # Check for redirection using requests
    try:
        response = requests.get(full_url, allow_redirects=True, timeout=10)
    except Exception as e:
        print(f"Error fetching {full_url}: {e}")
        time_out_ids.append(id)
        continue
    
    # If the final URL after redirection is different, skip this ID
    if response.url != full_url:
        print(f"Skipped: {full_url} redirected to {response.url}")
        continue
    
    # If no redirect is detected, proceed with Selenium scraping
    driver.get(full_url)
    time.sleep(random.uniform(0.5, 1))
    html_content = driver.page_source
    
    # Save the HTML content to a file
    file_path = rf"D:\PhapDien_semantic_search\BoPhapDienDienTu\vbpl\full_{id}.html"
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(html_content)