<a href="https://colab.research.google.com/github/nguyentrungdung-dev/PhapDien/blob/main/PhapDienDocument.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install requests tqdm




In [2]:
import os
import zipfile

zip_path = "/content/drive/MyDrive/PhapDien_Data/BoPhapDienDienTu.zip"
extract_path = "/content/BoPhapDienDienTu"

#Extract ZIP file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print("Extracted file!")


Extracted file!


In [3]:
folders = ["vbpl", "property", "history", "related", "pdf"]
for folder in folders:
    os.makedirs(os.path.join(extract_path, folder), exist_ok=True)

print("Folders created!")


Folders created!


In [4]:
from bs4 import BeautifulSoup
import glob

demuc_path = os.path.join(extract_path, "demuc")
html_files = glob.glob(os.path.join(demuc_path, "*.html"))

item_ids = set()

#Browse each index.html file to find the ItemID
for html_file in html_files:
    with open(html_file, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")
        links = soup.find_all("a", href=True)

        for link in links:
            if "ItemID=" in link["href"]:
                item_id = link["href"].split("ItemID=")[-1].split("&")[0]
                item_ids.add(item_id)

print(f"Found {len(item_ids)} documents to download. ")


Found 79238 documents to download. 


In [7]:
import os
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

#Sample URL
url_templates = {
    "vbpl": "https://vbpl.vn/TW/Pages/vbpq-toanvan.aspx?ItemID={}",
    "property": "https://vbpl.vn/tw/Pages/vbpq-thuoctinh.aspx?dvid=13&ItemID={}",
    "history": "https://vbpl.vn/tw/Pages/vbpq-lichsu.aspx?dvid=13&ItemID={}",
    "related": "https://vbpl.vn/TW/Pages/vbpq-vanbanlienquan.aspx?ItemID={}",
    "pdf": "https://vbpl.vn/tw/Pages/vbpq-van-ban-goc.aspx?ItemID={}"
}

#File download function
def download_file(url, save_path, retries=3):
    for _ in range(retries):
        try:
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                with open(save_path, "wb") as f:
                    f.write(response.content)
                return True
        except requests.exceptions.RequestException:
            pass
    return False

#Full document download function
def download_document(item_id):
    tasks = []
    for key, url_template in url_templates.items():
        url = url_template.format(item_id)
        save_path = os.path.join(extract_path, key, f"{key[0]}_{item_id}.html" if key != "pdf" else f"pdf_{item_id}.pdf")
        tasks.append((url, save_path))

    results = [download_file(url, save_path) for url, save_path in tasks]
    return item_id, all(results)

#Multithreaded
def download_all_documents(item_ids, num_threads=100):
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        future_to_id = {executor.submit(download_document, item_id): item_id for item_id in item_ids}
        for future in as_completed(future_to_id):
            item_id = future_to_id[future]
            try:
                success = future.result()[1]
                print(f"✔ Item {item_id}: {'Success' if success else 'Error'}")
            except Exception as e:
                print(f"Error with ItemID {item_id}: {e}")

#Execute full document download
download_all_documents(list(item_ids))


[1;30;43mKết quả truyền trực tuyến bị cắt bớt đến 5000 dòng cuối.[0m
✔ Item 118132#Chuong_I_Dieu_1: Success
✔ Item 127967#Dieu_3: Success
✔ Item 142881#Chuong_IX_Dieu_208: Success
✔ Item 137249#Dieu_5: Success
✔ Item 16742#Dieu_1: Success
✔ Item 155776#Chuong_II_Dieu_8: Success
✔ Item 67878#Dieu_11: Success
✔ Item 32918#Chuong_II_Dieu_11: Success
✔ Item 32645#Chuong_III_Dieu_26: Success
✔ Item 27732#Chuong_III_Dieu_16: Error
✔ Item 122250#Chuong_I_Dieu_4: Success
✔ Item 96172#Phan_hai_Chuong_XV_Dieu_211: Error
✔ Item 159924#Chuong_II_Dieu_5: Success
✔ Item 106223#Chuong_III_Dieu_9: Success
✔ Item 124150#Chuong_II_Dieu_32: Success
✔ Item 147119#Chuong_IV_Dieu_18: Success
✔ Item 12806#Chuong_II_Dieu_6: Success
✔ Item 70800#Chuong_XVII_Dieu_172: Success
✔ Item 38133#Dieu_7: Success
✔ Item 32635#Chuong_II_Muc_5_Dieu_35: Success
✔ Item 44168#Chuong_I_Dieu_16: Success
✔ Item 119138#Dieu_6: Success
✔ Item 96116#Chuong_IX_Muc_2_Dieu_59: Success
✔ Item 19714#Chuong_II_Dieu_18: Success
✔ Item 