# 1. Chuẩn bị thư viện

In [13]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModel, AutoTokenizer, T5Tokenizer, T5ForConditionalGeneration
import pandas as pd
import numpy as np
import faiss
import json
from torch.nn.utils.rnn import pad_sequence
import random
import math
from tqdm import tqdm

# Select the compute device: "cuda" when torch reports CUDA as available, otherwise "cpu".
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [2]:

# Environment report: the PyTorch version, the value of torch.version.cuda,
# and whether torch considers CUDA available.
print("Phi√™n b·∫£n PyTorch:", torch.__version__)
print("Phi√™n b·∫£n CUDA m√† PyTorch s·ª≠ d·ª•ng:", torch.version.cuda)
print("CUDA kh·∫£ d·ª•ng:", torch.cuda.is_available())

Phi√™n b·∫£n PyTorch: 2.5.1
Phi√™n b·∫£n CUDA m√† PyTorch s·ª≠ d·ª•ng: 12.1
CUDA kh·∫£ d·ª•ng: True


# 2. Áp dụng kỹ thuật DAPT
Xây dựng thêm 1 block MLM huấn luyện lại qua bộ dữ liệu đặc thù đã chuẩn bị trước

In [None]:
class RawDataset(Dataset):
    """Dataset of raw text lines prepared for masked-language-model training.

    Every item is tokenized on access, padded/truncated to ``max_length`` and
    masked on the fly, so the same line receives a fresh random mask each
    time it is fetched.
    """

    def __init__(self, lines, tokenizer, max_length=256, mlm_prob=0.15):
        """Store the corpus and the masking configuration.

        Args:
            lines: Sequence of text lines, one example per line.
            tokenizer: Callable tokenizer used to encode a line; it must
                expose ``mask_token``, ``convert_tokens_to_ids`` and ``len()``.
            max_length: Fixed length each example is padded/truncated to.
            mlm_prob: Probability of selecting a non-special token as a
                prediction target.
        """
        self.lines = lines
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.mlm_prob = mlm_prob

    def __len__(self):
        """Return the number of text lines."""
        return len(self.lines)

    def mask_tokens(self, inputs, special_tokens_mask):
        """Corrupt ``inputs`` in place for MLM and build the label tensor.

        Args:
            inputs: Long tensor of token ids; it is modified in place.
            special_tokens_mask: Bool tensor of the same shape, True where a
                position must never be selected.

        Returns:
            ``(inputs, labels)`` where ``labels`` holds the original id at
            selected positions and -100 everywhere else.
        """
        shape = inputs.shape
        labels = inputs.clone()

        # Selection probability is mlm_prob everywhere except special positions.
        select_prob = torch.full(shape, self.mlm_prob)
        select_prob.masked_fill_(special_tokens_mask, value=0.0)
        selected = torch.bernoulli(select_prob).bool()

        # -100 marks non-target positions (presumably the loss's ignore index).
        labels[~selected] = -100

        # Roughly 80% of the selected positions become the mask token.
        to_mask_token = torch.bernoulli(torch.full(shape, 0.8)).bool() & selected
        mask_id = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
        inputs[to_mask_token] = mask_id

        # Half of the remaining selected positions get a random vocabulary id;
        # the other half keep their original token.
        coin_flip = torch.bernoulli(torch.full(shape, 0.5)).bool()
        to_random = coin_flip & selected & ~to_mask_token
        random_ids = torch.randint(len(self.tokenizer), shape, dtype=torch.long)
        inputs[to_random] = random_ids[to_random]

        return inputs, labels

    def __getitem__(self, idx):
        """Tokenize line ``idx``, apply random masking and return model inputs.

        Returns:
            Dict with ``input_ids``, ``attention_mask``, ``labels`` and an
            all-zero ``token_type_ids`` tensor of length ``max_length``.
        """
        encoded = self.tokenizer(
            self.lines[idx],
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_special_tokens_mask=True,
            return_tensors='pt',
        )

        # The tokenizer returns a batch of one; drop the batch dimension.
        original_ids = encoded['input_ids'].squeeze(0)
        special_positions = encoded['special_tokens_mask'].squeeze(0).bool()

        # Mask a copy so the tokenizer output itself is left untouched.
        masked_ids, labels = self.mask_tokens(original_ids.clone(), special_positions)

        # Original author's note: PhoBERT does not use token_type_ids, but the
        # model call expects the argument, so zeros are supplied.
        segment_ids = torch.zeros(self.max_length, dtype=torch.long)

        return {
            "input_ids": masked_ids,
            "attention_mask": encoded['attention_mask'].squeeze(0),
            "labels": labels,
            "token_type_ids": segment_ids,
        }

class EarlyStopping:
    """Track validation loss, checkpoint on improvement and signal when to stop.

    Call the instance once per epoch with the current validation loss. When
    the loss improves on the best value seen so far (by more than
    ``min_delta``), the model and tokenizer are saved to ``save_path`` and the
    patience counter resets; otherwise the counter is incremented.
    """

    def __init__(self, patience=2, verbose=True, save_path="/content/MyDrive/models/my_model", min_delta=0.0):
        """Configure the stopping rule.

        Args:
            patience: Number of consecutive non-improving calls after which
                ``__call__`` returns True.
            verbose: When True, print a status line on every call.
            save_path: Directory passed to ``save_pretrained`` for both the
                model and the tokenizer.
            min_delta: Minimum decrease in loss that counts as an
                improvement. The default 0.0 keeps the plain
                "strictly lower" rule.

        Raises:
            ValueError: If ``min_delta`` is negative.
        """
        if min_delta < 0:
            raise ValueError("min_delta must be non-negative")
        self.patience = patience
        self.verbose = verbose
        self.best_loss = float("inf")
        self.counter = 0
        self.save_path = save_path
        self.min_delta = min_delta

    def __call__(self, val_loss, model, tokenizer):
        """Record ``val_loss`` and return True when training should stop.

        Args:
            val_loss: Validation loss of the epoch that just finished.
            model: Object exposing ``save_pretrained(path)``.
            tokenizer: Object exposing ``save_pretrained(path)``.

        Returns:
            True once ``patience`` consecutive calls brought no improvement.
        """
        # A NaN loss compares False here and is therefore counted as
        # "no improvement" rather than overwriting the best checkpoint.
        improved = val_loss < self.best_loss - self.min_delta
        if improved:
            self.best_loss = val_loss
            self.counter = 0
            if self.verbose:
                print(f"‚Üí Val loss improved, saving model to {self.save_path}")
            self.save_checkpoint(model, tokenizer)
        else:
            self.counter += 1
            if self.verbose:
                print(f"‚Üí No improvement. EarlyStopping counter: {self.counter}/{self.patience}")
        return self.counter >= self.patience

    def save_checkpoint(self, model, tokenizer):
        """Save the model and the tokenizer to ``save_path``."""
        model.save_pretrained(self.save_path)
        tokenizer.save_pretrained(self.save_path)


# --- 2. Load data ---
# Read the corpus, keeping one stripped, non-empty line per example.
with open("data/legal_text.txt", "r", encoding="utf-8") as f:
    lines = [line.strip() for line in f if line.strip()]

# Sequential 80/20 split: the first 80% of the lines are used for training,
# the remainder for validation (the lines are not shuffled before splitting).
split_idx = int(0.8 * len(lines))
train_lines = lines[:split_idx]
val_lines = lines[split_idx:]

# --- 3. Initialize tokenizer, model, datasets, dataloaders ---
model_name = "vinai/phobert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
model = AutoModelForMaskedLM.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Both datasets use RawDataset's default max_length and mlm_prob.
train_dataset = RawDataset(train_lines, tokenizer)
val_dataset = RawDataset(val_lines, tokenizer)

batch_size = 4
# Only the training loader shuffles; validation keeps file order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)


# --- 4. Train for one epoch ---
def train_one_epoch(model, dataloader, optimizer, device):
    """Run one optimization pass over ``dataloader`` and return the mean loss.

    Args:
        model: Model called with ``input_ids``, ``attention_mask``, ``labels``
            and ``token_type_ids``; its output must expose ``.loss``.
        dataloader: Iterable of batches (dicts holding those four tensors).
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of the per-batch losses divided by ``len(dataloader)``.
    """
    field_names = ("input_ids", "attention_mask", "labels", "token_type_ids")

    model.train()
    running_loss = 0
    progress = tqdm(dataloader, desc="Training")

    for batch in progress:
        optimizer.zero_grad()

        # Move exactly the tensors the model consumes onto the target device.
        model_inputs = {name: batch[name].to(device) for name in field_names}

        batch_loss = model(**model_inputs).loss
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        progress.set_postfix(loss=batch_loss.item())

    return running_loss / len(dataloader)


# --- 5. Validation ---
def validate(model, dataloader, device):
    """Evaluate ``model`` on ``dataloader`` without gradient tracking.

    Args:
        model: Model called with ``input_ids``, ``attention_mask``, ``labels``
            and ``token_type_ids``; its output must expose ``.loss``.
        dataloader: Iterable of batches (dicts holding those four tensors).
        device: Device the batch tensors are moved to.

    Returns:
        ``(avg_loss, perplexity)`` where ``avg_loss`` is the sum of per-batch
        losses divided by ``len(dataloader)`` and ``perplexity`` is
        ``exp(avg_loss)``.
    """
    field_names = ("input_ids", "attention_mask", "labels", "token_type_ids")

    model.eval()
    running_loss = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Validation"):
            model_inputs = {name: batch[name].to(device) for name in field_names}
            running_loss += model(**model_inputs).loss.item()

    mean_loss = running_loss / len(dataloader)
    return mean_loss, math.exp(mean_loss)


# --- 6. Main training loop ---
# Checkpoints go to "models/my_model"; training stops after 2 consecutive
# epochs without a validation-loss improvement.
early_stopping = EarlyStopping(patience=2, verbose=True, save_path="models/my_model")

num_epochs = 5

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")

    train_loss = train_one_epoch(model, train_loader, optimizer, device)
    print(f"Train Loss: {train_loss:.4f}")

    val_loss, val_perplexity = validate(model, val_loader, device)
    print(f"Validation Loss: {val_loss:.4f}, Perplexity: {val_perplexity:.2f}")

    # The call saves model + tokenizer when val_loss improves and returns
    # True once the patience budget is used up.
    if early_stopping(val_loss, model, tokenizer):
        print("Early stopping triggered. Training stopped.")
        break




Kiểm tra tác vụ mask

In [None]:
from transformers import pipeline

# Build a fill-mask pipeline around the in-memory model and tokenizer.
fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)

# Sanity check: print the predictions for a sentence with one <mask>.
print(fill_mask("C√¥ng d√¢n c√≥ quy·ªÅn <mask> t·∫°i n∆°i c∆∞ tr√∫."))

Device set to use cuda:0


[{'score': 0.21961596608161926, 'token': 235, 'token_str': 's·ªëng', 'sequence': 'C√¥ng d√¢n c√≥ quy·ªÅn s·ªëng t·∫°i n∆°i c∆∞ tr√∫.'}, {'score': 0.21799877285957336, 'token': 25, 'token_str': '·ªü', 'sequence': 'C√¥ng d√¢n c√≥ quy·ªÅn ·ªü t·∫°i n∆°i c∆∞ tr√∫.'}, {'score': 0.15344639122486115, 'token': 385, 'token_str': 't·ª±', 'sequence': 'C√¥ng d√¢n c√≥ quy·ªÅn t·ª± t·∫°i n∆°i c∆∞ tr√∫.'}, {'score': 0.10251910239458084, 'token': 1354, 'token_str': 'tr√∫', 'sequence': 'C√¥ng d√¢n c√≥ quy·ªÅn tr√∫ t·∫°i n∆°i c∆∞ tr√∫.'}, {'score': 0.032492998987436295, 'token': 5157, 'token_str': 'c∆∞_tr√∫', 'sequence': 'C√¥ng d√¢n c√≥ quy·ªÅn c∆∞_tr√∫ t·∫°i n∆°i c∆∞ tr√∫.'}]


In [None]:
from transformers import pipeline

# Build a fill-mask pipeline around the in-memory model and tokenizer.
fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)

# Input sentence containing three <mask> positions.
text = "Ng∆∞·ªùi ch∆∞a <mask> mu·ªën <mask> th∆∞·ªùng tr√∫ ph·∫£i c√≥ s·ª± <mask> c·ªßa cha m·∫π ho·∫∑c ng∆∞·ªùi gi√°m h·ªô."

results = fill_mask(text)

# Print every prediction for each <mask> position; `results` is iterated as
# one list of predictions per mask.
for i, mask_predictions in enumerate(results):
    print(f"\nD·ª± ƒëo√°n cho <mask> th·ª© {i+1}:")
    for pred in mask_predictions:
        token = pred["token_str"]
        score = pred["score"]
        print(f"  {token:<15} (score: {score:.4f})")


Device set to use cuda:0



D·ª± ƒëo√°n cho <mask> th·ª© 1:
  th√†nh_ni√™n      (score: 0.7145)
  th√†nh           (score: 0.1371)
  ch·ªìng           (score: 0.0640)
  ch·∫øt            (score: 0.0205)
  v·ª£              (score: 0.0075)

D·ª± ƒëo√°n cho <mask> th·ª© 2:
  ƒë·∫øn             (score: 0.2873)
  ƒëƒÉng            (score: 0.2173)
  k√Ω              (score: 0.1525)
  ƒëi              (score: 0.0890)
  chuy·ªÉn          (score: 0.0396)

D·ª± ƒëo√°n cho <mask> th·ª© 3:
  ƒë·ªìng_√Ω          (score: 0.5017)
  cho_ph√©p        (score: 0.0787)
  ch·∫•p_thu·∫≠n      (score: 0.0570)
  ki·∫øn            (score: 0.0317)
  ph√©p            (score: 0.0211)


# 2. Trích xuất văn bản luật liên quan
Cho mô hình đã DAPT thực hiện tìm kiếm các đoạn văn bản Điều luật liên quan tới câu hỏi nhất

## 2.1. Tiền xử lý dữ liệu

### a) Dữ liệu câu hỏi

In [1]:
def preprocess_qa(input_path="data/qa.csv", output_path="data/clean_qa.csv"):
    """Normalize the question column of a QA CSV and save the result.

    Reads ``input_path``, adds a ``clean_question`` column holding the
    whitespace-trimmed, lowercased ``question`` text, drops rows whose
    question is missing or blank, and writes the frame to ``output_path``
    without the index.

    Args:
        input_path: CSV file with a ``question`` column.
        output_path: Destination CSV path.
    """
    df = pd.read_csv(input_path)

    raw_questions = df["question"]

    # Trim surrounding whitespace and lowercase the question text.
    df["clean_question"] = raw_questions.astype(str).str.strip().str.lower()

    # Drop rows without content. Missing values have to be detected on the
    # raw column: astype(str) turns NaN into the literal text "nan", which
    # would otherwise pass the length check and be kept as a question.
    has_content = raw_questions.notna() & (df["clean_question"].str.len() > 0)
    df = df[has_content]

    df.to_csv(output_path, index=False)
    print(f"‚úÖ ƒê√£ l∆∞u {len(df)} c√¢u h·ªèi v√†o {output_path}")

# Run the question preprocessing with the default input/output paths.
preprocess_qa()

NameError: name 'pd' is not defined

### b) Dữ liệu văn bản pháp luật

In [None]:
def preprocess_law(input_path="data/legal_data.csv", output_path="data/law_articles.csv"):
    """Merge the article fields of a legal CSV into one ``content`` column.

    For every row the article number, article title and article body are
    joined into a single string after the fixed article prefix, with a
    missing field contributing an empty string. Only the ``id`` and
    ``content`` columns are written to ``output_path`` (no index).

    Args:
        input_path: CSV with an ``id`` column and the three article columns
            read below.
        output_path: Destination CSV path.
    """
    def _field_text(row, column):
        # Missing cells become "", everything else is stringified and trimmed.
        value = row[column]
        if pd.notna(value):
            return str(value).strip()
        return ""

    def _compose(row):
        number = _field_text(row, "ƒëi·ªÅu")
        title = _field_text(row, "t√™n ƒëi·ªÅu")
        body = _field_text(row, "n·ªôi dung")
        # Layout: "<prefix> <number>. <title>. <body>", trimmed at both ends.
        return ("ƒêi·ªÅu" + " " + number + ". " + title + ". " + body).strip()

    articles = pd.read_csv(input_path)
    articles["content"] = articles.apply(_compose, axis=1)

    # Keep only the columns that later steps read.
    selected = articles[["id", "content"]].copy()
    selected.to_csv(output_path, index=False)

    print(f"‚úÖ ƒê√£ l∆∞u {len(selected)} ƒêi·ªÅu lu·∫≠t v√†o {output_path}")

# Run the legal-text preprocessing with the default input/output paths.
preprocess_law()

‚úÖ ƒê√£ l∆∞u 90 ƒêi·ªÅu lu·∫≠t v√†o /content/drive/MyDrive/data/law_articles.csv


## 2.2. Truy xuất top-k đoạn văn bản liên quan

Load mô hình đã DAPT

In [None]:
def load_model(model_path="models/my_model"):
    """Load the tokenizer and the encoder stored at ``model_path``.

    Args:
        model_path: Directory (or model identifier) given to
            ``from_pretrained`` for both the tokenizer and the model.

    Returns:
        ``(tokenizer, model)`` with the model switched to evaluation mode.
    """
    slow_tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
    encoder = AutoModel.from_pretrained(model_path)
    # Evaluation mode; the call is made for its side effect on the module.
    encoder.eval()
    return slow_tokenizer, encoder

# Replace the global tokenizer/model with the checkpoint saved at "models/my_model".
tokenizer, model = load_model("models/my_model")

Some weights of RobertaModel were not initialized from the model checkpoint at models/my_model and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Hàm encode

In [None]:
def encode_text_attention(texts, tokenizer, model, device='cuda', max_length=256):
    """Encode text(s) into attention-mask-aware mean-pooled embeddings.

    Args:
        texts: A string or a list of strings to encode.
        tokenizer: Callable returning a mapping of tensors that includes
            ``attention_mask``.
        model: Encoder whose output exposes ``last_hidden_state``; it is
            moved to ``device`` and put in evaluation mode.
        device: Target device. A CUDA device is replaced by the CPU when
            CUDA is not available, so the default works on CPU-only hosts.
        max_length: Truncation length passed to the tokenizer.

    Returns:
        ``numpy.ndarray`` with one row per input text and one column per
        hidden dimension.
    """
    # Requesting CUDA on a machine without it would fail in model.to();
    # fall back to the CPU instead.
    if torch.device(device).type == "cuda" and not torch.cuda.is_available():
        device = "cpu"

    model.to(device)
    model.eval()

    inputs = tokenizer(
        texts,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=max_length
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        last_hidden = outputs.last_hidden_state  # indexed as (batch, seq_len, hidden_dim)

        # (batch, seq_len, 1) mask; broadcasting applies it across hidden_dim.
        mask = inputs['attention_mask'].unsqueeze(-1).float()

        summed = (last_hidden * mask).sum(dim=1)  # padded positions contribute zero
        # Guard against an all-zero mask row, which would divide by zero.
        token_counts = mask.sum(dim=1).clamp(min=1e-9)
        embeddings = summed / token_counts  # mean over the unmasked tokens

    return embeddings.cpu().numpy()

def encode_text_cls(text, tokenizer, model):
    """Encode one text as the hidden state of its first token.

    The text is lowercased and stripped before tokenization.

    Args:
        text: String to encode.
        tokenizer: Callable returning a mapping of tensors.
        model: Encoder whose output exposes ``last_hidden_state``.

    Returns:
        ``numpy.ndarray`` holding the first-token embedding with size-1
        dimensions squeezed away.
    """
    text = text.lower().strip()
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=256)

    # Run on whichever device the model currently lives on; feeding CPU
    # tensors to a model that was moved to the GPU raises a device-mismatch
    # error.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        inputs = {k: v.to(first_param.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        cls_embedding = outputs.last_hidden_state[:, 0, :]

    # .cpu() is required before .numpy() when the tensor is on a GPU.
    return cls_embedding.squeeze().cpu().numpy()

Xây dựng FAISS index

In [None]:
def build_faiss_index_attention(texts, tokenizer, model, batch_size=32, device='cuda'):
    """Build a FAISS index from text passages using mean-pooled embeddings.

    Args:
        texts: List of passages to index.
        tokenizer: Tokenizer forwarded to ``encode_text_attention``.
        model: Encoder forwarded to ``encode_text_attention``.
        batch_size: Number of passages encoded per call.
        device: Device forwarded to ``encode_text_attention``.

    Returns:
        ``(index, vectors)``: the ``faiss.IndexFlatIP`` holding all passages
        and the L2-normalized float32 matrix that was added to it.
    """
    batch_starts = tqdm(range(0, len(texts), batch_size), desc="üîß Encoding ƒëi·ªÅu lu·∫≠t")
    encoded_batches = [
        encode_text_attention(texts[start:start + batch_size], tokenizer, model, device=device)
        for start in batch_starts
    ]

    matrix = np.concatenate(encoded_batches, axis=0).astype("float32")
    # Rows are normalized in place before being added to the inner-product index.
    faiss.normalize_L2(matrix)

    index = faiss.IndexFlatIP(matrix.shape[1])
    index.add(matrix)
    return index, matrix

def build_faiss_index_cls(texts, tokenizer, model):
    """Build a FAISS index from text passages using first-token embeddings.

    Args:
        texts: Iterable of passages; each one is encoded individually with
            ``encode_text_cls``.
        tokenizer: Tokenizer forwarded to ``encode_text_cls``.
        model: Encoder forwarded to ``encode_text_cls``.

    Returns:
        ``(index, vectors)``: the ``faiss.IndexFlatIP`` holding all passages
        and the L2-normalized float32 matrix that was added to it.
    """
    encoded = [
        encode_text_cls(passage, tokenizer, model)
        for passage in tqdm(texts, desc="üîß Encoding ƒëi·ªÅu lu·∫≠t")
    ]

    matrix = np.stack(encoded).astype("float32")
    # Rows are normalized in place before being added to the inner-product index.
    faiss.normalize_L2(matrix)

    index = faiss.IndexFlatIP(matrix.shape[1])
    index.add(matrix)
    return index, matrix

Truy xuất top-k

In [None]:
def retrieve_top_k_attention(index, law_texts, questions, tokenizer, model, k):
    """Retrieve the top-k passages per question using mean-pooled embeddings.

    Args:
        index: Search index exposing ``search(vectors, k)``.
        law_texts: Passages in the same order they were added to ``index``.
        questions: Iterable of question strings.
        tokenizer: Tokenizer forwarded to ``encode_text_attention``.
        model: Encoder forwarded to ``encode_text_attention``.
        k: Number of neighbours requested per question.

    Returns:
        One dict per question with keys ``question``, ``top_k_laws`` and
        ``scores``. The two lists are aligned and may hold fewer than ``k``
        entries when the index contains fewer passages.
    """
    results = []
    for q in tqdm(questions, desc="üîç Truy xu·∫•t c√¢u h·ªèi"):
        q_vec = encode_text_attention(q, tokenizer, model).astype("float32").reshape(1, -1)
        faiss.normalize_L2(q_vec)
        D, I = index.search(q_vec, k)

        # FAISS pads missing neighbours with label -1; indexing law_texts
        # with -1 would silently return the last passage, so skip them.
        hits = [(int(idx), float(score)) for idx, score in zip(I[0], D[0]) if idx >= 0]

        results.append({
            "question": q,
            "top_k_laws": [law_texts[idx] for idx, _ in hits],
            "scores": [score for _, score in hits]
        })
    return results

def retrieve_top_k_cls(index, law_texts, questions, tokenizer, model, k):
    """Retrieve the top-k passages per question using first-token embeddings.

    Args:
        index: Search index exposing ``search(vectors, k)``.
        law_texts: Passages in the same order they were added to ``index``.
        questions: Iterable of question strings.
        tokenizer: Tokenizer forwarded to ``encode_text_cls``.
        model: Encoder forwarded to ``encode_text_cls``.
        k: Number of neighbours requested per question.

    Returns:
        One dict per question with keys ``question``, ``top_k_laws`` and
        ``scores``. The two lists are aligned and may hold fewer than ``k``
        entries when the index contains fewer passages.
    """
    results = []
    for q in tqdm(questions, desc="üîç Truy xu·∫•t c√¢u h·ªèi"):
        q_vec = encode_text_cls(q, tokenizer, model).astype("float32").reshape(1, -1)
        faiss.normalize_L2(q_vec)
        D, I = index.search(q_vec, k)

        # FAISS pads missing neighbours with label -1; indexing law_texts
        # with -1 would silently return the last passage, so skip them.
        hits = [(int(idx), float(score)) for idx, score in zip(I[0], D[0]) if idx >= 0]

        results.append({
            "question": q,
            "top_k_laws": [law_texts[idx] for idx, _ in hits],
            "scores": [score for _, score in hits]
        })
    return results


Rerank trên câu trả lời

In [None]:
from sentence_transformers import CrossEncoder

# Cross-encoder used to re-score (question, passage) pairs.
# NOTE(review): the checkpoint name points to an MS MARCO model; confirm it
# handles Vietnamese text adequately.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [None]:
def rerank(question, top_k_laws, cross_encoder, top_n):
    """Re-score retrieved passages against the question and keep the best ones.

    Args:
        question: Query string.
        top_k_laws: Candidate passages to re-score.
        cross_encoder: Object exposing ``predict(pairs)`` that returns one
            score per ``(question, passage)`` pair.
        top_n: Number of best-scoring passages to return.

    Returns:
        ``(reranked_laws, reranked_scores)``: two aligned lists sorted by
        descending score, each at most ``top_n`` long. Both are empty when
        there are no candidates.
    """
    # Nothing to score: skip the model call instead of handing it an empty batch.
    if len(top_k_laws) == 0:
        return [], []

    # One (query, document) pair per candidate.
    pairs = [(question, law) for law in top_k_laws]

    # Convert to plain floats so the result is independent of the array type
    # returned by predict().
    scores = [float(score) for score in cross_encoder.predict(pairs)]

    # Highest score first; sorted() is stable, so ties keep retrieval order.
    ranked = sorted(zip(top_k_laws, scores), key=lambda item: item[1], reverse=True)[:top_n]

    reranked_laws = [law for law, _ in ranked]
    reranked_scores = [score for _, score in ranked]

    return reranked_laws, reranked_scores


Lưu vector

In [None]:
# Load the preprocessed articles and questions written by the preprocessing steps.
df_law = pd.read_csv("data/law_articles.csv")
df_qa = pd.read_csv("data/clean_qa.csv")

law_texts = df_law["content"].tolist()
questions = df_qa["clean_question"].astype(str).tolist()

print("Load xong")

# Index every article with mean-pooled embeddings; the returned vectors are discarded.
index, _ = build_faiss_index_attention(law_texts, tokenizer, model)


Load xong


üîß Encoding ƒëi·ªÅu lu·∫≠t: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [00:01<00:00,  2.92it/s]


Pipeline chính

In [None]:
def pipeline_retrieve_and_rerank(question, index, law_texts, tokenizer, model, cross_encoder, top_k=30, top_n=5):
    """Retrieve candidate passages for one question, then rerank them.

    Args:
        question: Query string.
        index: Search index over ``law_texts``.
        law_texts: Passages in index order.
        tokenizer: Tokenizer used by the retriever.
        model: Encoder used by the retriever.
        cross_encoder: Scorer used by ``rerank``.
        top_k: Number of candidates fetched from the index.
        top_n: Number of passages kept after reranking.

    Returns:
        ``(reranked_laws, reranked_scores)`` as produced by ``rerank``.
    """
    # The retriever works on a list of questions; wrap the single query and
    # take its only result.
    retrieved = retrieve_top_k_attention(index, law_texts, [question], tokenizer, model, k=top_k)
    candidates = retrieved[0]["top_k_laws"]

    best_laws, best_scores = rerank(question, candidates, cross_encoder, top_n=top_n)
    return best_laws, best_scores

Chạy kiểm tra 1 câu hỏi đầu vào

In [None]:
# Example query used to smoke-test the retrieve-then-rerank pipeline.
question = "T√¥i ƒëi Singapore 10 ng√†y c√≥ ph·∫£i khai b√°o t·∫°m v·∫Øng kh√¥ng?"

# Fetch 30 candidates from the index and keep the 5 best after reranking.
top5_laws, top5_scores = pipeline_retrieve_and_rerank(
    question,
    index,
    law_texts,
    tokenizer,
    model,
    cross_encoder,
    top_k=30,
    top_n=5
)

print(f"C√¢u h·ªèi: {question}")
print("Top 5 ƒëo·∫°n lu·∫≠t ƒë√£ rerank:")
# Show each passage with its score, truncated to the first 300 characters.
for i, (law, score) in enumerate(zip(top5_laws, top5_scores), 1):
    print(f"{i}. (ƒêi·ªÉm: {score:.4f}) {law[:300]}{'...' if len(law) > 300 else ''}")

üîç Truy xu·∫•t c√¢u h·ªèi: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 35.25it/s]


C√¢u h·ªèi: T√¥i ƒëi Singapore 10 ng√†y c√≥ ph·∫£i khai b√°o t·∫°m v·∫Øng kh√¥ng?
Top 5 ƒëo·∫°n lu·∫≠t ƒë√£ rerank:
1. (ƒêi·ªÉm: 3.5183) ƒêi·ªÅu 4. N∆°i c∆∞ tr√∫ c·ªßa ng∆∞·ªùi kh√¥ng c√≥ n∆°i th∆∞·ªùng tr√∫, n∆°i t·∫°m tr√∫. 1. Ng∆∞·ªùi kh√¥ng c√≥ n∆°i th∆∞·ªùng tr√∫, n∆°i t·∫°m tr√∫ ph·∫£i khai b√°o ngay th√¥ng tin v·ªÅ c∆∞ tr√∫ v·ªõi c∆° quan ƒëƒÉng k√Ω c∆∞ tr√∫ t·∫°i n∆°i ·ªü hi·ªán t·∫°i. Tr∆∞·ªùng h·ª£p qua ki·ªÉm tra, r√† so√°t, c∆° quan ƒëƒÉng k√Ω c∆∞ tr√∫ ph√°t hi·ªán ng∆∞·ªùi thu·ªôc tr∆∞·ªùng h·ª£p ph·∫£i khai b...
2. (ƒêi·ªÉm: 3.1157) ƒêi·ªÅu 4. N∆°i c∆∞ tr√∫ c·ªßa ng∆∞·ªùi kh√¥ng c√≥ n∆°i th∆∞·ªùng tr√∫, n∆°i t·∫°m tr√∫. 1. Ng∆∞·ªùi kh√¥ng c√≥ n∆°i th∆∞·ªùng tr√∫, n∆°i t·∫°m tr√∫ khai b√°o th√¥ng tin v·ªÅ c∆∞ tr√∫ theo m·∫´u T·ªù khai thay ƒë·ªïi th√¥ng tin c∆∞ tr√∫ v√† n·ªôp tr·ª±c tuy·∫øn, tr·ª±c ti·∫øp ho·∫∑c qua d·ªãch v·ª• b∆∞u ch√≠nh c√¥ng √≠ch ƒë·∫øn c∆° quan ƒëƒÉng k√Ω c∆∞ tr√∫ t·∫°i n∆°i ·ªü hi·ªán t·∫°i theo...
3. (ƒêi·ªÉm: 1.9634) ƒêi·ªÅu 19. N∆°i c∆∞ tr√∫ c·ªßa

# 3. Mô hình sinh câu trả lời

In [None]:
import google.generativeai as genai
from dotenv import load_dotenv
import os

# Read environment variables from a .env file and configure the Gemini client
# with the value of API_KEY.
load_dotenv()
genai.configure(api_key=os.getenv("API_KEY"))

generative_model = genai.GenerativeModel("gemini-1.5-flash")  # or "gemini-1.5-pro"

question = "Th·ªß t·ª•c ƒëƒÉng k√Ω t·∫°m tr√∫ l√† g√¨? ƒê·ªëi t∆∞·ª£ng n√†o ph·∫£i ƒëƒÉng k√Ω t·∫°m tr√∫?"

# Retrieve 30 candidates and keep the 5 best passages as context.
top5_laws, top5_scores = pipeline_retrieve_and_rerank(
    question,
    index,
    law_texts,
    tokenizer,
    model,
    cross_encoder,
    top_k=30,
    top_n=5
)

# One bullet line per retrieved passage.
law_context = "\n".join(f"- {law}" for law in top5_laws)

# Prompt layout: instruction, retrieved passages, the question, then an answer cue.
prompt = f"""
B·∫°n l√† tr·ª£ l√Ω ·∫£o ph√°p l√Ω c·ªßa ch√≠nh quy·ªÅn Vi·ªát Nam. H√£y tr·∫£ l·ªùi c√¢u h·ªèi c·ªßa ng∆∞·ªùi d√¢n b·∫±ng c√°ch d·ª±a v√†o c√°c vƒÉn b·∫£n ph√°p lu·∫≠t sau:

{law_context}

‚ùì C√¢u h·ªèi: {question}

üí¨ Tr·∫£ l·ªùi (ng·∫Øn g·ªçn, r√µ r√†ng, theo lu·∫≠t): 
"""

response = generative_model.generate_content(prompt)

# Write the question, the generated answer and the related legal passages to response.txt.
with open("response.txt", "w", encoding="utf-8") as f:
    f.write("üîç C√¢u h·ªèi: " + question + "\n")
    f.write("üí¨ Tr·∫£ l·ªùi: " + response.text + "\n")
    f.write("üìú C√°c vƒÉn b·∫£n ph√°p lu·∫≠t li√™n quan:\n")
    f.write(law_context + "\n")


üîç Truy xu·∫•t c√¢u h·ªèi: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 38.10it/s]
