# Process the dataset

## Open the dataset

In [1]:
import json


In [2]:
# Input files: the cleaned law-article database and the training questions.
database_path = '../data/cleaned_vlsp2025_law_db.json'
dataset_path = "../data/VLSP 2025 - MLQA-TSR Data Release/train_data/vlsp_2025_train.json"

# Both files contain Vietnamese text, so read them explicitly as UTF-8 instead
# of relying on the platform's default encoding.
with open(database_path, "r", encoding="utf-8") as f:
    database = json.load(f)

with open(dataset_path, "r", encoding="utf-8") as f:
    dataset = json.load(f)

## Add the title to the text of the article

In [3]:
# Prefix every article's text with its title, separated by a newline.
# The untouched text is remembered under "_text_without_title" the first time
# this cell runs, so re-running the cell rebuilds the same value instead of
# stacking the title on top of an already-prefixed text.
for record in database:
    original_text = record.setdefault("_text_without_title", record["text"])
    record["text"] = record["article_title"].strip() + "\n" + original_text.strip()

In [4]:
# Spot-check one record to confirm its text now starts with the article title.
database[1]["text"]

'Đối tượng áp dụng\nQuy chuẩn này áp dụng đối với người tham gia giao thông trên mạng lưới đường bộ của Việt Nam và các tổ chức, cá nhân liên quan đến quản lý, đầu tư xây dựng, bảo vệ, bảo trì, khai thác, sử dụng kết cấu hạ tầng giao thông đường bộ.'

In [5]:
# Display the first training sample whose question type is not "Yes/No",
# to see what the other question format looks like.
first_other_sample = next(
    (sample for sample in dataset if sample["question_type"] != "Yes/No"),
    None,
)
if first_other_sample is not None:
    print(first_other_sample)

{'id': 'train_1', 'image_id': 'train_1_3', 'question': 'Biển báo cấm xe khách trên 29 chỗ được áp dụng trong các khoảng thời gian nào? ', 'relevant_articles': [{'law_id': 'QCVN 41:2024/BGTVT', 'article_id': '22'}], 'question_type': 'Multiple choice', 'choices': {'A': 'Từ 6:30 đến 8:00 và từ 16:30 đến 18:30; ngoài các khoảng thời gian này không được phép lưu thông.', 'B': 'Từ 6:30 đến 8:00 và từ 16:30 đến 18:30; ngoài các khoảng thời gian này được phép lưu thông.', 'C': 'Cấm lưu thông cả ngày.', 'D': 'D. Không cấm xe khách trên 29 chỗ lưu thông.'}, 'answer': 'B'}


# Dataset Preparation

Create a dataset that can produce `(Anchor, Positive, Negative)` triplets. Each item in the dataset consists of:
1. An **image**
2. A corresponding **text query**
3. A **positive document** (the correct answer/context for the image and query)
4. A **negative document** (a random, incorrect document from the database)

## Initialize the text embedding model

In [6]:
from src.multimodal_retriever.retriever_v2 import RetrieverV2
import torch
from sentence_transformers import SentenceTransformer

TEXT_MODEL_NAME = "BAAI/bge-m3"

# Prefer the second GPU when the machine has one; otherwise fall back to the
# first GPU, and finally to the CPU. A hard-coded "cuda:1" is not a valid
# device on single-GPU machines.
if torch.cuda.is_available():
    DEVICE = "cuda:1" if torch.cuda.device_count() > 1 else "cuda:0"
else:
    DEVICE = "cpu"

# Alternative text-only encoder, kept for reference:
# model = SentenceTransformer(TEXT_MODEL_NAME, device=DEVICE)

# NOTE(review): the retriever is placed on the CPU, not on DEVICE, exactly as
# before. Confirm that encode_text moves its inputs to the model's device
# before switching this to DEVICE.
model = RetrieverV2(pretrained_model_path="../src/multimodal_retriever/pretrained_model/Visualized_m3.pth").to("cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from tqdm import tqdm

# Attach a text embedding to every database record.
# This is pure inference, so autograd is switched off: no computation graph is
# recorded for the forward passes, which saves memory and time while leaving
# the embedding values unchanged.
with torch.no_grad():
    for record in tqdm(database, desc="Generating embeddings for the database"):
        # pt_embedding = model.encode_document(record["text"], convert_to_tensor=True, device=DEVICE)
        # squeeze(0) drops the leading dimension of the encoder output
        # (presumably a batch dimension of size 1 - TODO confirm).
        text_embedding = model.encode_text(record["text"]).squeeze(0)
        # Store a plain Python list so the record stays JSON-serializable.
        record["embedding"] = text_embedding.detach().cpu().numpy().tolist()


Generating embeddings for the database:   3%|▎         | 11/399 [00:27<10:12,  1.58s/it]

In [8]:
import random

dataset_image_base_path = "../data/VLSP 2025 - MLQA-TSR Data Release/train_data/images/"

# Dedicated, seeded RNG so the sampled negatives are the same on every re-run.
NEGATIVE_SAMPLING_SEED = 42
negative_rng = random.Random(NEGATIVE_SAMPLING_SEED)

# (law_id, article_id) -> record. setdefault keeps the FIRST record for a key,
# matching a first-match linear scan over the database.
article_key_to_record = {}
for record in database:
    article_key_to_record.setdefault((record["law_id"], record["article_id"]), record)

new_dataset = []
max_pairs = 100  # upper bound on negatives sampled per (question, positive article)
for data in dataset:
    image_file = data["image_id"] + ".jpg"

    # The query is the question, followed by the answer options for
    # multiple-choice questions. Any other question type yields an empty query.
    query_text = ""
    if data["question_type"] == "Multiple choice":
        query_text = data["question"] + "\n"
        for key, value in data["choices"].items():
            query_text += f"{key}: {value}\n"
    elif data["question_type"] == "Yes/No":
        query_text = data["question"] + "\n"

    relevant_articles = data["relevant_articles"]

    # Exclude EVERY relevant article of this question from the negative pool.
    # Excluding only the current positive would allow another correct article
    # of the same question to be sampled as a negative.
    relevant_keys = {(article["law_id"], article["article_id"]) for article in relevant_articles}
    negative_pool = [
        record
        for record in database
        if (record["law_id"], record["article_id"]) not in relevant_keys
    ]

    for article in relevant_articles:
        positive_record = article_key_to_record.get((article["law_id"], article["article_id"]))
        positive_record_id = positive_record["record_id"] if positive_record is not None else ""
        if positive_record_id == "":
            # No usable database record for this reference: report it and skip.
            # NOTE(review): the reported ids include values such as "B.3; 41"
            # and "26.0", which may be combined or differently formatted
            # article ids - confirm against the database before normalizing.
            print(article)
            continue

        # Never request more negatives than exist; random.sample raises
        # ValueError when the sample size exceeds the population size.
        sample_size = min(max_pairs, len(negative_pool))
        for negative_record in negative_rng.sample(negative_pool, sample_size):
            new_dataset.append({
                "image_file": image_file,
                "query_text": query_text,
                "positive_record_id": positive_record_id,
                "negative_record_id": negative_record["record_id"],
            })




{'law_id': 'QCVN 41:2024/BGTVT', 'article_id': 'B.3'}
{'law_id': 'QCVN 41:2024/BGTVT', 'article_id': '47.15'}
{'law_id': '36/2024/QH15', 'article_id': '26.0'}
{'law_id': 'QCVN 41:2024/BGTVT', 'article_id': 'G1.2'}
{'law_id': 'QCVN 41:2024/BGTVT', 'article_id': 'G1.1'}
{'law_id': '36/2024/QH15', 'article_id': '16.0'}
{'law_id': 'QCVN 41:2024/BGTVT', 'article_id': 'B.3; 41'}
{'law_id': 'QCVN 41:2024/BGTVT', 'article_id': 'D.11'}
{'law_id': 'QCVN 41:2024/BGTVT', 'article_id': '47.7'}
{'law_id': 'QCVN 41:2024/BGTVT', 'article_id': '47.22'}
{'law_id': 'QCVN 41:2024/BGTVT', 'article_id': 'D.11'}
{'law_id': 'QCVN 41:2024/BGTVT', 'article_id': 'D.11'}
{'law_id': 'QCVN 41:2024/BGTVT', 'article_id': 'F.10; 22'}
{'law_id': 'QCVN 41:2024/BGTVT', 'article_id': 'B.3; 41'}
{'law_id': 'QCVN 41:2024/BGTVT', 'article_id': 'B.3'}
{'law_id': 'QCVN 41:2024/BGTVT', 'article_id': 'B.3; 41'}
{'law_id': 'QCVN 41:2024/BGTVT', 'article_id': 'B.3'}
{'law_id': 'QCVN 41:2024/BGTVT', 'article_id': 'B.3'}
{'law_id': 

In [None]:
# Spot-check one generated record (index 200 appears to be an arbitrary pick).
print(new_dataset[200])

In [10]:
# Number of generated records (one per positive/negative pairing).
print(f"New dataset size: {len(new_dataset)}")

New dataset size: 118300


In [11]:
# Number of entries in the original training set, for comparison.
print(f"Dataset size: {len(dataset)}")

Dataset size: 530


In [12]:
cnt = 0
# Keep only the entries where both the positive and the negative record id
# are non-empty, then rebind new_dataset to the filtered list.
processed_new_dataset = [
    triplet
    for triplet in new_dataset
    if not (triplet["positive_record_id"] == "" or triplet["negative_record_id"] == "")
]

new_dataset = processed_new_dataset

In [27]:
# Persist the generated triplet records as pretty-printed JSON.
# new_dataset_path = "../data/processed_vlsp_2025_multimodal_rag_dataset.json"
new_dataset_path = "../data/processed_vlsp_2025_multimodal_rag_dataset_120000.json"
# ensure_ascii=False writes the Vietnamese text as raw characters, so the file
# must be opened as UTF-8 explicitly; the platform default encoding may not be
# able to represent them.
with open(new_dataset_path, "w", encoding="utf-8") as f:
    json.dump(new_dataset, f, indent=4, ensure_ascii=False)
print(f"New dataset saved to {new_dataset_path}")
print("Dataset preparation completed.")

New dataset saved to ../data/processed_vlsp_2025_multimodal_rag_dataset_only_title.json
Dataset preparation completed.


In [13]:
# Index each database record's text and embedding under its record_id.
record_id_to_document_embedding = {
    entry["record_id"]: {"text": entry["text"], "embedding": entry["embedding"]}
    for entry in database
}


In [14]:
# Persist the record_id -> {text, embedding} mapping as pretty-printed JSON.
text_embedding_path = "../data/record_id_to_document_embedding_v2.json"
# ensure_ascii=False writes the Vietnamese text as raw characters, so the file
# is opened as UTF-8 explicitly rather than with the platform default encoding.
with open(text_embedding_path, "w", encoding="utf-8") as f:
    json.dump(record_id_to_document_embedding, f, indent=4, ensure_ascii=False)
print(f"Text embeddings and text saved to {text_embedding_path}")

Text embeddings and text saved to ../data/record_id_to_document_embedding.json


# Create document text to embedding dictionary

In [2]:
import json

# Reload the record_id -> {text, embedding} mapping from disk.
# NOTE(review): this reads "record_id_to_document_embedding.json", while the
# save cell earlier in this notebook writes
# "record_id_to_document_embedding_v2.json" - confirm which file is intended.
record_id_to_embedding_path = "../data/record_id_to_document_embedding.json"
# The file holds Vietnamese text; read it as UTF-8 regardless of the platform
# default encoding.
with open(record_id_to_embedding_path, "r", encoding="utf-8") as f:
    record_id_to_embedding = json.load(f)

In [6]:
# Re-key the embeddings by document text. If two records share the same text,
# the later one overwrites the earlier one.
document_text_to_embedding = {
    entry["text"]: entry["embedding"]
    for entry in record_id_to_embedding.values()
}

In [10]:
# Persist the document-text -> embedding mapping as pretty-printed JSON.
document_text_to_embedding_path = "../data/document_text_to_embedding.json"
# Written like the other JSON outputs of this notebook: UTF-8 file with
# ensure_ascii=False, so the Vietnamese keys stay readable instead of being
# \uXXXX-escaped. The parsed content is identical either way.
with open(document_text_to_embedding_path, "w", encoding="utf-8") as f:
    json.dump(document_text_to_embedding, f, indent=4, ensure_ascii=False)