<a href="https://colab.research.google.com/github/nguyentrungdung-dev/PhapDien/blob/main/PhapDienDocument.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **I. Prepare documents**

# **1. Dowload file zip**

In [None]:
!pip install requests tqdm
#Just install library



# **2. Unzip the zip file**


In [None]:
import os
import zipfile

zip_path = "/content/drive/MyDrive/PhapDien_Data/BoPhapDienDienTu.zip"
extract_path = "/content/BoPhapDienDienTu"

#Extract ZIP file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print("Extracted file!")


Extracted file!


# **3. Create the necessary folders.**

In [4]:
folders = ["vbpl", "property", "history", "related", "pdf"]
for folder in folders:
    os.makedirs(os.path.join(extract_path, folder), exist_ok=True)

print("Folders created!")


Folders created!


# **4. Get a list of documents to download**

In [5]:
from bs4 import BeautifulSoup
import glob

demuc_path = os.path.join(extract_path, "demuc")
html_files = glob.glob(os.path.join(demuc_path, "*.html"))

item_ids = set()

#Browse each index.html file to find the ItemID
for html_file in html_files:
    with open(html_file, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")
        links = soup.find_all("a", href=True)

        for link in links:
            if "ItemID=" in link["href"]:
                item_id = link["href"].split("ItemID=")[-1].split("&")[0]
                item_ids.add(item_id)

print(f"Found {len(item_ids)} documents to download. ")


Found 79238 documents to download. 


# **5. Download content of each document - test 1000 documents with 100 threads**

In [5]:
import os
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

#Sample URL
url_templates = {
    "vbpl": "https://vbpl.vn/TW/Pages/vbpq-toanvan.aspx?ItemID={}",
    "property": "https://vbpl.vn/tw/Pages/vbpq-thuoctinh.aspx?dvid=13&ItemID={}",
    "history": "https://vbpl.vn/tw/Pages/vbpq-lichsu.aspx?dvid=13&ItemID={}",
    "related": "https://vbpl.vn/TW/Pages/vbpq-vanbanlienquan.aspx?ItemID={}",
    "pdf": "https://vbpl.vn/tw/Pages/vbpq-van-ban-goc.aspx?ItemID={}"
}

#File download function
def download_file(url, save_path, retries=3):
    for _ in range(retries):
        try:
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                with open(save_path, "wb") as f:
                    f.write(response.content)
                return True
        except requests.exceptions.RequestException:
            pass
    return False

#Full document download function
def download_document(item_id):
    tasks = []
    for key, url_template in url_templates.items():
        url = url_template.format(item_id)
        save_path = os.path.join(extract_path, key, f"{key[0]}_{item_id}.html" if key != "pdf" else f"pdf_{item_id}.pdf")
        tasks.append((url, save_path))

    results = [download_file(url, save_path) for url, save_path in tasks]
    return item_id, all(results)

#Multithreaded
def download_all_documents(item_ids, num_threads=100):
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        future_to_id = {executor.submit(download_document, item_id): item_id for item_id in item_ids}
        for future in as_completed(future_to_id):
            item_id = future_to_id[future]
            try:
                success = future.result()[1]
                print(f"✔ Item {item_id}: {'Success' if success else 'Error'}")
            except Exception as e:
                print(f"Error with ItemID {item_id}: {e}")

#Execute full document download
download_all_documents(list(item_ids)[:1000])


NameError: name 'item_ids' is not defined

# **II. Create Vector Database use ChromaDB and LangChain**

# **1. Install lybraries**

In [1]:
!pip install -U langchain langchain-community chromadb transformers sentence-transformers unstructured




# **2. Import libraries**

In [None]:
import os
import chromadb
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

#Defines the directory containing the files to be ingested.
html_folder = "/content/BoPhapDienDienTu/vbpl/"

#Generate a list of valid files (only take files in the form full_ItemID.html)
file_paths = [os.path.join(html_folder, f) for f in os.listdir(html_folder) if f.startswith("full_") and f.endswith(".html")]
print(f"Total number of valid files: {len(file_paths)}")

#Initialize embedded model
embedding_model = HuggingFaceEmbeddings(model_name="bkai-foundation-models/vietnamese-bi-encoder")

#Connect or create a new Chroma database
chroma_path = "chroma_db"
vector_db = Chroma(persist_directory=chroma_path, embedding_function=embedding_model)

#Read and process each file
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=20)

for file_path in file_paths:
    try:
        #Read file contents
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()

        #Break text into small paragraphs
        texts = text_splitter.split_text(text)

        #Create metadata for files
        metadata = {"source": file_path}

        #Add to vector database
        vector_db.add_texts(texts, metadatas=[metadata] * len(texts))

        print(f"Ingested {file_path}")

    except Exception as e:
        print(f"Error with file {file_path}: {e}")

#Save ChromaDB
vector_db.persist()
print("Complete ingest of document into ChromaDB!!!")


Total number of valid files: 811


  embedding_model = HuggingFaceEmbeddings(model_name="bkai-foundation-models/vietnamese-bi-encoder")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/6.46k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/540M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

  vector_db = Chroma(persist_directory=chroma_path, embedding_function=embedding_model)


Ingested /content/BoPhapDienDienTu/vbpl/full_128550.html
Ingested /content/BoPhapDienDienTu/vbpl/full_144248.html
Ingested /content/BoPhapDienDienTu/vbpl/full_37200.html
Ingested /content/BoPhapDienDienTu/vbpl/full_46311.html
Ingested /content/BoPhapDienDienTu/vbpl/full_120219.html
Ingested /content/BoPhapDienDienTu/vbpl/full_111993.html
Ingested /content/BoPhapDienDienTu/vbpl/full_129070.html
Ingested /content/BoPhapDienDienTu/vbpl/full_140553.html


In [6]:
import os
import re

html_folder = "BoPhapDienDienTu/vbpl/"

for filename in os.listdir(html_folder):
    old_path = os.path.join(html_folder, filename)

    #Check if the file has the extension .html or .htm
    if os.path.isfile(old_path) and filename.lower().endswith(('.html', '.htm')):

        #Find the ID number in the file name (assuming there is a number in the name)
        match = re.search(r'\d+', filename)
        if match:
            item_id = match.group()
            new_filename = f"full_{item_id}.html"
            new_path = os.path.join(html_folder, new_filename)

            #Rename file
            os.rename(old_path, new_path)
            print(f"Rename files: {filename} → {new_filename}")
        else:
            print(f"Skip {filename} (ID not found)")


Rename files: v_24290#Dieu_1.html → full_24290.html
Rename files: v_103491#Chuong_III_Dieu_18.html → full_103491.html
Rename files: v_140528#Chuong_III_Dieu_19.html → full_140528.html
Rename files: v_155154#Dieu_4.html → full_155154.html
Rename files: v_46790#Chuong_I_Dieu_5.html → full_46790.html
Rename files: v_81024#Chuong_II_Muc_5_Dieu_17.html → full_81024.html
Rename files: v_113443#Chuong_I_Dieu_5.html → full_113443.html
Rename files: v_122388#Chuong_IV_Muc_1_Dieu_49.html → full_122388.html
Rename files: v_153609#Chuong_II_Dieu_20.html → full_153609.html
Rename files: v_28011#Dieu_4.html → full_28011.html
Rename files: v_118566#Chuong_I_Dieu_1.html → full_118566.html
Rename files: v_122183#Chuong_I_Dieu_1.html → full_122183.html
Rename files: v_25389#Chuong_III_Dieu_14.html → full_25389.html
Rename files: v_27143#Dieu_3.html → full_27143.html
Rename files: v_143024#Chuong_VI_Dieu_68.html → full_143024.html
Rename files: v_27268#Dieu_11.html → full_27268.html
Rename files: v_13965

In [7]:
import os
import re

#List of directories to process and corresponding prefixes
folders = {
    "BoPhapDienDienTu/property": "p_",
    "BoPhapDienDienTu/history": "h_",
    "BoPhapDienDienTu/related": "r_",
    "BoPhapDienDienTu/pdf": "pdf_"
}

#Process each folder
for folder, prefix in folders.items():
    if not os.path.exists(folder):
        print(f"Directory does not exist: {folder}")
        continue

    print(f"Processing folder: {folder}")

    for filename in os.listdir(folder):
        old_path = os.path.join(folder, filename)

        #Check if it is a .html or .pdf file (for PDF folder)
        if os.path.isfile(old_path) and filename.lower().endswith(('.html', '.htm', '.pdf')):

            #Find ID number in file name
            match = re.search(r'\d+', filename)
            if match:
                item_id = match.group()
                new_filename = f"{prefix}{item_id}.html" if folder != "BoPhapDienDienTu/pdf" else f"{prefix}{item_id}.pdf"
                new_path = os.path.join(folder, new_filename)

                #Rename
                os.rename(old_path, new_path)
                print(f"Rename: {filename} → {new_filename}")
            else:
                print(f"Skip {filename} (ID not found!)")

print("File renaming complete!")


Processing folder: BoPhapDienDienTu/property
Rename: p_26915#Chuong_II_Muc_2_Dieu_8.html → p_26915.html
Rename: p_126171#Chuong_VI_Muc_3_Dieu_55.html → p_126171.html
Rename: p_142815#Chuong_II_Dieu_4.html → p_142815.html
Rename: p_19423#Chuong_V_Dieu_38.html → p_19423.html
Rename: p_47365#Dieu_4.html → p_47365.html
Rename: p_142881#Chuong_I_Dieu_7.html → p_142881.html
Rename: p_32512#Dieu_8.html → p_32512.html
Rename: p_18565#Chuong_IV_Muc_1_Dieu_19.html → p_18565.html
Rename: p_121488#Chuong_VI_Muc_2_Dieu_92.html → p_121488.html
Rename: p_118307#Chuong_III_Dieu_25.html → p_118307.html
Rename: p_146652#Chuong_I_Dieu_3.html → p_146652.html
Rename: p_21233#Dieu_3.html → p_21233.html
Rename: p_33573#Chuong_V_Dieu_23.html → p_33573.html
Rename: p_96118#Chuong_I_Dieu_5.html → p_96118.html
Rename: p_136941#Chuong_IV_Dieu_31.html → p_136941.html
Rename: p_134065#Chuong_I_Dieu_4.html → p_134065.html
Rename: p_137245#Dieu_7.html → p_137245.html
Rename: p_25957#Dieu_1.html → p_25957.html
Rename: