In [None]:
! pip install langchain_community tiktoken langchain_text_splitters faiss-cpu qdrant-client protonx dotenv langchain-google-genai pyvi rank_bm25 numpy sentence-transformers ipywidgets

# I. Setup

In [3]:
import os
import json
import sys
import time
import re
from typing import List

import tiktoken
from dotenv import load_dotenv
from langchain.retrievers import EnsembleRetriever
from langchain.storage import InMemoryStore
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_google_genai import ChatGoogleGenerativeAI
from protonx import ProtonX
from pyvi import ViTokenizer
from tqdm import tqdm

from IPython.display import Markdown, display

load_dotenv()

True

In [2]:
! langchain -v

langchain-cli 0.0.37


# II. Load Documents

In [4]:
# Enhanced document loading with metadata preservation - TESTING VERSION (50 docs only)

def load_legal_docs_from_folders(root_dir: str, max_docs: int = 50) -> list[Document]:
    """
    Enhanced loader that preserves structured metadata from folders containing
    both 'metadata.json' and 'content.txt' files.

    Args:
        root_dir: Directory to load documents from
        max_docs: Maximum number of documents to load (for testing)
    """
    documents = []
    doc_count = 0

    print(f"\n[INFO] Starting enhanced loading from '{root_dir}'...")

    for dirpath, _, filenames in os.walk(root_dir):
        if doc_count >= max_docs:
            break

        if "metadata.json" in filenames and "content.txt" in filenames:
            metadata_path = os.path.join(dirpath, "metadata.json")
            content_path = os.path.join(dirpath, "content.txt")

            try:
                with open(metadata_path, 'r', encoding='utf-8') as f:
                    metadata_json = json.load(f)
                with open(content_path, 'r', encoding='utf-8') as f:
                    full_text = f.read()

                # Enhanced page content with title
                page_content = (
                    f"Tiêu đề: {metadata_json.get('title', '')}\n\n"
                    f"Toàn văn: {full_text}"
                )

                # Extract specific metadata fields from the diagram section
                id_metadata = metadata_json.get("metadata", {})
                diagram_metadata = metadata_json.get("metadata", {}).get("diagram", {})

                # Create final metadata with only the required fields
                final_metadata = {
                    '_id': id_metadata.get('_id', ''),
                    'so_hieu': diagram_metadata.get('so_hieu', ''),
                    'loai_van_ban': diagram_metadata.get('loai_van_ban', ''),
                    'linh_vuc_nganh': diagram_metadata.get('linh_vuc_nganh', ''),
                    'noi_ban_hanh': diagram_metadata.get('noi_ban_hanh', ''),
                    'nguoi_ky': diagram_metadata.get('nguoi_ky', ''),
                    'ngay_ban_hanh': diagram_metadata.get('ngay_ban_hanh', ''),
                    'ngay_hieu_luc': diagram_metadata.get('ngay_hieu_luc', ''),
                    'tinh_trang': diagram_metadata.get('tinh_trang', ''),
                    'title': metadata_json.get('title', ''),
                    'source': content_path,
                    'doc_id': doc_count + 1  # Add document ID for testing
                }

                doc = Document(page_content=page_content, metadata=final_metadata)
                documents.append(doc)
                doc_count += 1

                # Dynamic progress indicator
                progress_message = f"  -> Progress: {doc_count}/{max_docs} documents loaded."
                sys.stdout.write('\r' + progress_message)
                sys.stdout.flush()

            except Exception as e:
                # Print error on a new line to avoid being overwritten by progress
                print(f"\n[ERROR] Failed to process document in {dirpath}: {e}")

    # Print a newline to move past the progress indicator line
    print()
    return documents

# --- SCRIPT EXECUTION ---

# TESTING CONFIGURATION - Load only a few documents
root_dir = '../data/raw_data/documents'
MAX_DOCS_FOR_TESTING = 500

print("=" * 70)
print("   STARTING DOCUMENT LOADING SCRIPT (TESTING MODE)")
print(f"[INFO] Source Directory: {root_dir}")
print(f"[INFO] Document Limit: {MAX_DOCS_FOR_TESTING}")
print("=" * 70)

try:
    docs = load_legal_docs_from_folders(root_dir, max_docs=MAX_DOCS_FOR_TESTING)
    print(f"[SUCCESS] Enhanced loading complete. Found {len(docs)} structured documents.")
    if docs:
        print("\n[DEBUG] Sample metadata from the first document:")
        # Pretty print the JSON metadata for readability
        print(json.dumps(docs[0].metadata, indent=2, ensure_ascii=False))

except Exception as e:
    print(f"\n[ERROR] Enhanced loading process failed: {e}")
    print("[INFO] Attempting fallback to simple directory loading...")

    try:
        # Fallback to simple loading with limit
        loader = DirectoryLoader(
            root_dir,
            glob='**/*.txt',
            loader_cls=TextLoader,
            show_progress=True # Langchain's built-in progress bar
        )
        all_docs = loader.load()
        docs = all_docs[:MAX_DOCS_FOR_TESTING]
        print(f'[SUCCESS] Simple loading complete. Loaded {len(docs)} documents (capped at {MAX_DOCS_FOR_TESTING}).')
    except Exception as fallback_e:
        print(f"[ERROR] Fallback loading also failed: {fallback_e}")
        docs = [] # Ensure docs is an empty list on total failure

print("\n" + "=" * 70)
print("   FINAL SUMMARY")
if docs:
    print(f"  -> Total documents loaded: {len(docs)}")
    print("  -> Status: Ready for next processing steps.")
    print("\nPreview of first document content:")
    print("-" * 40)
    print(f"{docs[0].page_content[:300]}...")
    print("-" * 40)
else:
    print("  -> No documents were loaded. Please check the source directory and error logs.")
print("=" * 70)

   STARTING DOCUMENT LOADING SCRIPT (TESTING MODE)
[INFO] Source Directory: ../data/raw_data/documents
[INFO] Document Limit: 500

[INFO] Starting enhanced loading from '../data/raw_data/documents'...
  -> Progress: 500/500 documents loaded.
[SUCCESS] Enhanced loading complete. Found 500 structured documents.

[DEBUG] Sample metadata from the first document:
{
  "_id": "66b9c0963ab9c4ae3d5fb5b4",
  "so_hieu": "02/CT-UBND",
  "loai_van_ban": "Chỉ thị",
  "linh_vuc_nganh": "Giáo dục",
  "noi_ban_hanh": "Tỉnh Bắc Kạn",
  "nguoi_ky": "Phạm Duy Hưng",
  "ngay_ban_hanh": "24/05/2024",
  "ngay_hieu_luc": "24/05/2024",
  "tinh_trang": "Còn hiệu lực",
  "title": "Chỉ thị 02/CT-UBND tăng cường chỉ đạo, phối hợp tổ chức Kỳ thi tốt nghiệp trung học phổ thông và tuyển sinh đại học, cao đẳng năm 2024 do tỉnh Bắc Kạn ban hành",
  "source": "../data/raw_data/documents\\Chỉ thị 02CT-UBND tăng cường chỉ đạo, phối hợp tổ chức Kỳ thi tốt nghiệp trung học phổ thông và tuyể\\content.txt",
  "doc_id": 1
}

 

# III. Document Splitting (Parent-Child Architecture)

In [5]:
# Enhanced Intelligent Chunking for Legal Documents
tokenizer = tiktoken.get_encoding("cl100k_base")

# create the length function
def tiktoken_len(text):
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)

In [6]:
class IntelligentLegalChunker:
    """
    Advanced chunking strategy for legal documents that:
    1. Respects token limits (max 3500 tokens to stay under 4096 limit)
    2. Preserves paragraph boundaries
    3. Handles oversized paragraphs intelligently
    4. Maintains context between chunks
    """

    def __init__(self, max_tokens=3500, overlap_tokens=200):
        self.max_tokens = max_tokens
        self.overlap_tokens = overlap_tokens
        self.tokenizer = tokenizer

    def split_paragraph_intelligently(self, paragraph, max_tokens=None):
        """Split a single paragraph that's too large"""
        if max_tokens is None:
            max_tokens = self.max_tokens

        # Try to split on sentences first
        sentences = re.split(r'(?<=[.!?])\s+', paragraph)
        if len(sentences) <= 1:
            # If no sentence breaks, split on words
            words = paragraph.split()
            chunks = []
            current_chunk = []
            current_tokens = 0

            for word in words:
                word_tokens = self.tokenizer.encode(word, disallowed_special=())
                if current_tokens + len(word_tokens) > max_tokens and current_chunk:
                    chunks.append(' '.join(current_chunk))
                    current_chunk = [word]
                    current_tokens = len(word_tokens)
                else:
                    current_chunk.append(word)
                    current_tokens += len(word_tokens)

            if current_chunk:
                chunks.append(' '.join(current_chunk))
            return chunks

        # Split by sentences
        chunks = []
        current_chunk = []
        current_tokens = 0

        for sentence in sentences:
            sentence_tokens = self.tokenizer.encode(sentence, disallowed_special=())
            if current_tokens + len(sentence_tokens) > max_tokens and current_chunk:
                chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]
                current_tokens = len(sentence_tokens)
            else:
                current_chunk.append(sentence)
                current_tokens += len(sentence_tokens)

        if current_chunk:
            chunks.append(' '.join(current_chunk))
        return chunks

    def chunk_document(self, document):
        """Chunk a document intelligently"""
        content = document.page_content
        metadata = document.metadata.copy()

        # Split by double newlines (paragraphs)
        paragraphs = content.split('\n\n')
        chunks = []
        current_chunk = []
        current_tokens = 0

        for paragraph in paragraphs:
            paragraph = paragraph.strip()
            if not paragraph:
                continue

            paragraph_tokens = self.tokenizer.encode(paragraph, disallowed_special=())

            # If single paragraph is too large, split it
            if len(paragraph_tokens) > self.max_tokens:
                # Add current chunk if it exists
                if current_chunk:
                    chunks.append('\n\n'.join(current_chunk))
                    current_chunk = []
                    current_tokens = 0

                # Split the oversized paragraph
                sub_chunks = self.split_paragraph_intelligently(paragraph)
                for sub_chunk in sub_chunks:
                    chunks.append(sub_chunk)
                continue

            # Check if adding this paragraph would exceed limit
            if current_tokens + len(paragraph_tokens) > self.max_tokens and current_chunk:
                # Save current chunk
                chunks.append('\n\n'.join(current_chunk))

                # Start new chunk with overlap if possible
                if len(current_chunk) > 0:
                    # Try to include last paragraph for overlap
                    overlap_text = current_chunk[-1]
                    overlap_tokens = self.tokenizer.encode(overlap_text, disallowed_special=())
                    if len(overlap_tokens) <= self.overlap_tokens:
                        current_chunk = [overlap_text, paragraph]
                        current_tokens = len(overlap_tokens) + len(paragraph_tokens)
                    else:
                        current_chunk = [paragraph]
                        current_tokens = len(paragraph_tokens)
                else:
                    current_chunk = [paragraph]
                    current_tokens = len(paragraph_tokens)
            else:
                current_chunk.append(paragraph)
                current_tokens += len(paragraph_tokens)

        # Add final chunk
        if current_chunk:
            chunks.append('\n\n'.join(current_chunk))

        # Create Document objects for each chunk
        chunk_docs = []
        for i, chunk_content in enumerate(chunks):
            chunk_metadata = metadata.copy()
            chunk_metadata['chunk_id'] = i
            chunk_metadata['total_chunks'] = len(chunks)
            chunk_docs.append(Document(page_content=chunk_content, metadata=chunk_metadata))

        return chunk_docs

print("Intelligent Legal Chunker initialized successfully!")
print("Features:")
print("- Respects 3500 token limit (safe for 4096 API limit)")
print("- Preserves paragraph boundaries")
print("- Handles oversized paragraphs intelligently")
print("- Maintains context with smart overlap")


Intelligent Legal Chunker initialized successfully!
Features:
- Respects 3500 token limit (safe for 4096 API limit)
- Preserves paragraph boundaries
- Handles oversized paragraphs intelligently
- Maintains context with smart overlap


In [7]:
# Enhanced Parent-Child Architecture with Intelligent Chunking
print("Setting up enhanced parent-child document architecture with intelligent chunking...")

# Initialize the intelligent chunker
intelligent_chunker = IntelligentLegalChunker(max_tokens=3500, overlap_tokens=200)

# Create parent chunks and store them
docstore = InMemoryStore()
child_chunks = []

print(f"Processing {len(docs)} documents with intelligent chunking...")

for doc_idx, doc in enumerate(docs):
    print(f"Processing document {doc_idx + 1}/{len(docs)}: {doc.metadata.get('title', 'Unknown')[:50]}...")

    # Use intelligent chunking for parent chunks
    parent_chunks = intelligent_chunker.chunk_document(doc)

    for i, parent_chunk in enumerate(parent_chunks):
        parent_id = f"{doc.metadata.get('title', 'doc')}-{i}"
        docstore.mset([(parent_id, parent_chunk)])

        # For child chunks, use smaller intelligent chunking
        child_chunker = IntelligentLegalChunker(max_tokens=800, overlap_tokens=100)
        sub_chunks = child_chunker.chunk_document(parent_chunk)

        for sub_chunk in sub_chunks:
            sub_chunk.metadata['parent_id'] = parent_id
            # Preserve original metadata
            sub_chunk.metadata.update(doc.metadata)
            child_chunks.append(sub_chunk)

print(f"[INFO] Created {len(child_chunks)} child chunks from {len(docs)} documents.")
print(f"[INFO] Parent chunks stored in docstore for context retrieval.")

# Verify chunk sizes
chunk_sizes = [tiktoken_len(chunk.page_content) for chunk in child_chunks]
print(f"[INFO] Chunk size statistics:")
print(f"   - Min tokens: {min(chunk_sizes)}")
print(f"   - Max tokens: {max(chunk_sizes)}")
print(f"   - Avg tokens: {sum(chunk_sizes)//len(chunk_sizes)}")
print(f"   - Chunks over 3500 tokens: {sum(1 for size in chunk_sizes if size > 3500)}")

if max(chunk_sizes) > 3500:
    print(f"[WARNING] Some chunks still exceed 3500 tokens.")
    print("   This might cause API errors. Consider reducing max_tokens further.")
else:
    print("[SUCCESS] All chunks are within safe token limits.")

Setting up enhanced parent-child document architecture with intelligent chunking...
Processing 500 documents with intelligent chunking...
Processing document 1/500: Chỉ thị 02/CT-UBND tăng cường chỉ đạo, phối hợp tổ...
Processing document 2/500: Chỉ thị 03/CT-CTUBND tổ chức Kỳ thi tốt nghiệp tru...
Processing document 3/500: Chỉ thị 03/CT-UBND năm 2024 tăng cường công tác ...
Processing document 4/500: Chỉ thị 03/CT-UBND tăng cường phối hợp tổ chức Kỳ ...
Processing document 5/500: Chỉ thị 03/CT-UBND tổ chức Kỳ thi tốt nghiệp trung...
Processing document 6/500: Chỉ thị 04/CT-UBND tăng cường chỉ đạo, phối hợp tổ...
Processing document 7/500: Chỉ thị 04/CT-UBND về chỉ đạo, phối hợp tổ chức kỳ...
Processing document 8/500: Chỉ thị 04/CT-UBND về tăng cường chỉ đạo, phối hợp...
Processing document 9/500: Chỉ thị 04/CT-UBND về tăng cường chỉ đạo, phối hợp...
Processing document 10/500: Chỉ thị 04/CT-UBND về tổ chức các kỳ thi trên địa ...
Processing document 11/500: Chỉ thị 05/CT-UBND năm 

# IV. Embedding Generation

In [8]:
# Enhanced Sentence-Transformers Embeddings with Token Limit Handling
from sentence_transformers import SentenceTransformer

class EnhancedSentenceTransformerEmbeddings(Embeddings):
    """
    Enhanced Sentence-Transformers embeddings class that handles token limit errors gracefully
    and provides dynamic batching with a progress bar.
    Features:
    - Uses a specified Sentence-Transformer model
    - Handles potential token limit issues (though Sentence-Transformers models have varying limits)
    - Dynamic, token-aware batching for efficient processing
    - Internal progress bar for user feedback
    """
    def __init__(self, model_name: str = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2', batch_size: int = 32):
        """Initializes the SentenceTransformer model and sets processing parameters."""
        # Using a multilingual model that generally performs well on various languages, including Vietnamese
        self.model = SentenceTransformer(model_name)
        self.batch_size = batch_size # Batch size for SentenceTransformer

        # Attempt to get max sequence length, default if not available
        try:
            self.max_seq_length = self.model.max_seq_length
        except AttributeError:
             # Default to a common max length if model doesn't expose it
            self.max_seq_length = 512
        print(f"Enhanced SentenceTransformerEmbeddings initialized with model '{model_name}'.")
        print(f"Max sequence length (approx): {self.max_seq_length}")


    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """
        Generates embeddings for a list of documents using dynamic batching and a progress bar.
        Sentence-Transformers handles token limits internally to some extent,
        but large documents might still be an issue depending on the model.
        We rely on the model's built-in batching and progress bar via encode().
        """
        # The SentenceTransformer.encode method handles batching and GPU usage internally
        # if a GPU is available. It also has a show_progress_bar argument.
        print(f"[INFO] Generating embeddings in batches of {self.batch_size}...")
        embeddings = self.model.encode(texts,
                                       batch_size=self.batch_size,
                                       show_progress_bar=True,
                                       convert_to_list=True)
        return embeddings

    def embed_query(self, text: str) -> List[float]:
        """Generates an embedding for a single query text."""
        # SentenceTransformer.encode can also handle single texts
        embedding = self.model.encode(text, convert_to_list=True)
        return embedding

print("Enhanced SentenceTransformerEmbeddings class created.")
print("Ready to use a Sentence-Transformer model for embeddings.")

Enhanced SentenceTransformerEmbeddings class created.
Ready to use a Sentence-Transformer model for embeddings.


In [9]:
# 1. Initialize the enhanced embeddings class using Sentence-Transformers

# Use the new Sentence-Transformer embeddings class
# You can specify a different model name if needed
embeddings = EnhancedSentenceTransformerEmbeddings(model_name='sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

print("Enhanced Sentence-Transformer embeddings object created and ready to use.")

# 2. Extract texts and metadatas from child chunks
texts = [chunk.page_content for chunk in child_chunks]
metadatas = [chunk.metadata for chunk in child_chunks]

# --- START OF PROCESS ---
print("\n" + "=" * 70)
print("   STARTING EMBEDDING GENERATION PROCESS (Sentence-Transformers)")
print("=" * 70)
print(f"[INFO] Number of text chunks to process: {len(texts)}")
print("-" * 70)

start_time = time.time()

# 3. Process all embeddings with a single call using the new class.
# The SentenceTransformer.encode method handles batching and progress.
text_embeddings = embeddings.embed_documents(texts)

end_time = time.time()
total_time = end_time - start_time

# --- FINAL SUMMARY REPORT ---
print("\n" + "=" * 70)
print("   EMBEDDING PROCESS COMPLETE: SUMMARY REPORT")
print("=" * 70)

# Check if text_embeddings is not empty and has the correct length
if len(text_embeddings) > 0 and len(text_embeddings) == len(texts):
    print(f"[SUCCESS] Process finished in {total_time:.2f} seconds ({total_time/60:.1f} minutes).")

    print("\n--- Performance Metrics ---")
    print(f"  -> {'Total chunks processed':<25}: {len(text_embeddings)}")
    # Calculate throughput based on actual successful embeddings
    if total_time > 0:
      print(f"  -> {'Throughput':<25}: {len(text_embeddings)/total_time:.1f} chunks/sec")
      print(f"  -> {'Avg. time per chunk':<25}: {(total_time/len(text_embeddings))*1000:.2f} ms")
    else:
        print("  -> Performance metrics not available (total time is zero).")


    print("\n--- Embedding Details ---")
    if len(text_embeddings) > 0:
        print(f"  -> {'Embedding dimension':<25}: {len(text_embeddings[0])}")
    else:
        print("  -> No embeddings were generated.")

else:
    print("[ERROR] Embedding generation failed or was incomplete. Please check the error logs above.")

print("=" * 70)

Enhanced SentenceTransformerEmbeddings initialized with model 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'.
Max sequence length (approx): 128
Enhanced Sentence-Transformer embeddings object created and ready to use.

   STARTING EMBEDDING GENERATION PROCESS (Sentence-Transformers)
[INFO] Number of text chunks to process: 16309
----------------------------------------------------------------------
[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/510 [00:00<?, ?it/s]


   EMBEDDING PROCESS COMPLETE: SUMMARY REPORT
[SUCCESS] Process finished in 105.20 seconds (1.8 minutes).

--- Performance Metrics ---
  -> Total chunks processed   : 16309
  -> Throughput               : 155.0 chunks/sec
  -> Avg. time per chunk      : 6.45 ms

--- Embedding Details ---
  -> Embedding dimension      : 768


# V. Vector Store and Retriever Setup

In [10]:
# Setup FAISS vector store
print("\nCreating FAISS vector store...")
# FAISS requires (text, embedding) pairs
text_embedding_pairs = list(zip(texts, text_embeddings))
vectorstore_local = FAISS.from_embeddings(text_embedding_pairs, embeddings, metadatas=metadatas)
print("FAISS vector store created successfully.")

# Create and persist Qdrant vector store
print("\nCreating and persisting Qdrant vector store...")
from langchain_community.vectorstores import Qdrant
qdrant_store = Qdrant.from_documents(
    child_chunks,
    embeddings,
    location=":memory:",  # Use in-memory storage
    collection_name="vietjusticia_legal_docs",
)
print("Qdrant vector store created in-memory.")
print("Collection name: 'vietjusticia_legal_docs'")

# Setup BM25 retriever with Vietnamese tokenization
print("\nSetting up BM25 retriever with Vietnamese tokenization...")

def tokenize_vi_for_bm25(doc):
    """Tokenize Vietnamese text for better BM25 matching"""
    doc.page_content = ViTokenizer.tokenize(doc.page_content)
    return doc

# Create tokenized copies for BM25
child_chunks_tokenized = [tokenize_vi_for_bm25(chunk) for chunk in child_chunks]
bm25_retriever = BM25Retriever.from_documents(child_chunks_tokenized)
bm25_retriever.k = 15  # Retrieve top 15 for ensemble
print("BM25 retriever with Vietnamese tokenization created successfully.")


Creating FAISS vector store...
FAISS vector store created successfully.

Creating and persisting Qdrant vector store...
[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Generating embeddings in batches of 32...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Qdrant vector store created in-memory.
Collection name: 'vietjusticia_legal_docs'

Setting up BM25 retriever with Vietnamese tokenization...
BM25 retriever with Vietnamese tokenization created successfully.


In [11]:
# Setup Ensemble Retrieval (BM25 + Qdrant)
print("\nSetting up ensemble retrieval...")

# Create Qdrant retriever
qdrant_retriever = qdrant_store.as_retriever(search_kwargs={'k': 15})

# Create ensemble retriever combining BM25 and Qdrant
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, qdrant_retriever],
    weights=[0.6, 0.4]  # Favor BM25 for Vietnamese lexical matching
)

print("Ensemble retriever (BM25 + Qdrant) created successfully.")

# Setup Parent Document Retrieval Chain
def _get_parent_docs(input_dict: dict) -> list[Document]:
    """Retrieve parent documents from child documents"""
    child_docs = input_dict["child_docs"]
    store = input_dict["docstore"]
    parent_ids = []
    for doc in child_docs:
        if "parent_id" in doc.metadata and doc.metadata["parent_id"] not in parent_ids:
            parent_ids.append(doc.metadata["parent_id"])
    return [doc for doc in store.mget(parent_ids) if doc is not None]

# Create the parent retrieval chain
parent_retriever_chain = (
    {"child_docs": ensemble_retriever}
    | RunnablePassthrough.assign(docstore=lambda x: docstore)
    | RunnableLambda(_get_parent_docs)
)

print("Parent document retrieval chain created successfully.")


Setting up ensemble retrieval...
Ensemble retriever (BM25 + Qdrant) created successfully.
Parent document retrieval chain created successfully.


# VI. RAG Chain Setup and Testing

In [12]:
llm = ChatGoogleGenerativeAI(model="gemini-2.5-pro", temperature=0)

# Enhanced prompt template with Vietnamese instructions and citation requirements
template = """Answer the question based ONLY on the following context.
Your answer must be in Vietnamese.
Your answer should be well-structured and easy to read.
- Use bullet points or numbered lists for multiple items or steps.
- Use **bold** for key terms, names, or important numbers and concepts.
- Use *italics* for emphasis or to highlight specific terms.

**After providing the answer, you MUST cite your sources accurately using the metadata from the context.**
For each source used, provide its title and document number (Số hiệu) if available.

Context:
{context}

Question: {question}

If the context does not provide enough information, say "Tôi không tìm thấy thông tin trong tài liệu được cung cấp." and do not provide an answer."""

prompt = ChatPromptTemplate.from_template(template)

def format_docs(docs):
    """Format retrieved documents with metadata for better context"""
    formatted_docs = []
    for i, doc in enumerate(docs):
        # Add document metadata as header
        title = doc.metadata.get('title', f'Document {i+1}')
        source = doc.metadata.get('source', 'Unknown source')
        formatted_doc = f"--- Document: {title} ---\n{doc.page_content}\n--- End Document ---"
        formatted_docs.append(formatted_doc)
    return "\n\n".join(formatted_docs)

# Enhanced RAG chain using parent document retrieval
rag_chain = (
    {"context": parent_retriever_chain | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

print("Enhanced RAG chain with hybrid retrieval and parent documents created successfully.")

Enhanced RAG chain with hybrid retrieval and parent documents created successfully.


In [13]:
query = "thời gian học tập năm 2024-2025 tỉnh bắc kạn là từ khi nào?"
print(f"Query: {query}\n")
print("=" * 80)

response = rag_chain.invoke(query)
# print(f"Response: {response}\n")
display(Markdown(f"**Response:** {response}"))

Query: thời gian học tập năm 2024-2025 tỉnh bắc kạn là từ khi nào?



**Response:** Dựa trên thông tin được cung cấp, kế hoạch thời gian năm học 2024-2025 đối với giáo dục mầm non, giáo dục phổ thông và giáo dục thường xuyên tại tỉnh Bắc Kạn như sau:

*   **Ngày tựu trường**:
    *   Các cấp học tựu trường sớm nhất vào **ngày 29 tháng 8 năm 2024** (Thứ Năm), trước 01 tuần so với ngày khai giảng.
    *   Riêng đối với lớp 1, tựu trường sớm nhất vào **ngày 22 tháng 8 năm 2024** (Thứ Năm), trước 02 tuần so với ngày khai giảng.
*   **Ngày khai giảng**: **Ngày 05 tháng 9 năm 2024** (Thứ Năm).
*   **Thời gian học kỳ**:
    *   Kết thúc học kỳ I: Trước **ngày 18 tháng 01 năm 2025**.
    *   Hoàn thành chương trình và kết thúc năm học: Trước **ngày 31 tháng 5 năm 2025**.
*   **Tổng thời gian thực học**: Kế hoạch năm học phải đảm bảo đủ **35 tuần** thực học, cụ thể:
    *   Học kỳ I: **18 tuần**.
    *   Học kỳ II: **17 tuần**.

***

### Nguồn tài liệu:
*   **Tiêu đề**: Quyết định 1370/QĐ-UBND năm 2024 về Kế hoạch thời gian năm học 2024-2025 đối với giáo dục mầm non, giáo dục phổ thông và giáo dục thường xuyên tỉnh Bắc Kạn
*   **Số hiệu**: 1370/QĐ-UBND

In [14]:
query = "Kỳ thi chọn học sinh giỏi cấp tỉnh lớp 12 được tổ chức vào ngày nào?"
print(f"Query: {query}\n")
print("=" * 80)

response = rag_chain.invoke(query)
# print(f"Response: {response}\n")
display(Markdown(f"**Response:** {response}"))

Query: Kỳ thi chọn học sinh giỏi cấp tỉnh lớp 12 được tổ chức vào ngày nào?



**Response:** Dựa trên các văn bản được cung cấp, thời gian tổ chức **Kỳ thi chọn học sinh giỏi cấp tỉnh lớp 12** năm học 2024-2025 khác nhau tùy theo từng địa phương, cụ thể như sau:

*   **Tỉnh Bình Định**: Tổ chức vào ngày **22/10/2024**.
*   **Tỉnh Đắk Nông**: Tổ chức từ ngày **25 tháng 3 năm 2025** đến ngày **26 tháng 3 năm 2025**.
*   **Tỉnh Lâm Đồng**: Hoàn thành tổ chức thi *trước ngày* **31/01/2025**.
*   **Tỉnh Bắc Kạn**: Hoàn thành tổ chức thi *trước ngày* **28 tháng 4 năm 2025** (áp dụng cho cả cấp THCS và THPT).
*   **Tỉnh Sóc Trăng**: Thời gian tổ chức *theo Kế hoạch của Sở Giáo dục và Đào tạo*.

***

### **Nguồn:**
1.  **Quyết định 2834/QĐ-UBND** năm 2024 về Kế hoạch thời gian năm học 2024-2025 đối với giáo dục mầm non, giáo dục phổ thông và giáo dục thường xuyên trên địa bàn tỉnh Bình Định.
2.  **Quyết định 944/QĐ-UBND** năm 2024 về Kế hoạch thời gian năm học 2024-2025 đối với giáo dục mầm non, giáo dục phổ thông và giáo dục thường xuyên do tỉnh Đắk Nông ban hành.
3.  **Quyết định 1287/QĐ-UBND** năm 2024 về Kế hoạch thời gian năm học 2024-2025 đối với giáo dục mầm non, giáo dục phổ thông và giáo dục thường xuyên trên địa bàn tỉnh Lâm Đồng.
4.  **Quyết định 1370/QĐ-UBND** năm 2024 về Kế hoạch thời gian năm học 2024-2025 đối với giáo dục mầm non, giáo dục phổ thông và giáo dục thường xuyên tỉnh Bắc Kạn.
5.  **Quyết định 1828/QĐ-UBND** năm 2024 về Kế hoạch thời gian năm học 2024-2025 đối với giáo dục mầm non, giáo dục phổ thông và giáo dục thường xuyên trên địa bàn tỉnh Sóc Trăng.

In [15]:
query = "Dựa trên hồ sơ do Công ty TNHH OTES CORPORATION cung cấp, hãy mô tả các đặc tính kỹ thuật chính của sản phẩm TamSoil Polynite ECO ở trạng thái dung dịch sau khi pha, và nêu rõ căn cứ pháp lý cao nhất (Luật) mà Cục Hải quan đã dựa vào để ban hành thông báo mã số cho sản phẩm này."
print(f"Query: {query}\n")
print("=" * 80)

response = rag_chain.invoke(query)
print(f"Response: {response}\n")

# Display with markdown formatting
display(Markdown(f"**Response:** {response}"))

Query: Dựa trên hồ sơ do Công ty TNHH OTES CORPORATION cung cấp, hãy mô tả các đặc tính kỹ thuật chính của sản phẩm TamSoil Polynite ECO ở trạng thái dung dịch sau khi pha, và nêu rõ căn cứ pháp lý cao nhất (Luật) mà Cục Hải quan đã dựa vào để ban hành thông báo mã số cho sản phẩm này.

Response: Tôi không tìm thấy thông tin trong tài liệu được cung cấp.



**Response:** Tôi không tìm thấy thông tin trong tài liệu được cung cấp.

# VII. Performance Comparison and Analysis

## Enhanced Features Summary

### ✅ **What's New in This Enhanced Version:**

1. **🔍 Hybrid Retrieval**: BM25 (lexical) + Qdrant (semantic) ensemble
2. **🇻🇳 Vietnamese Tokenization**: Better BM25 matching for Vietnamese text
3. **📚 Parent-Child Architecture**: Better context preservation
4. **📄 Enhanced Data Loading**: Preserves structured metadata
5. **🎯 Improved Prompting**: Vietnamese instructions with citation requirements
6. **⚡ Sentence-Transformers Embeddings**: High-quality multilingual embeddings

### **Architecture Flow:**
```
Query → [BM25 + Qdrant] → Ensemble → Parent-Child Hydration → Enhanced RAG → Response
```

### **Key Improvements Over Original:**
- **Better Recall**: Hybrid retrieval catches both exact matches and semantic similarity
- **Better Context**: Parent-child architecture provides more complete context
- **Vietnamese-Optimized**: Tokenization and multilingual embeddings
- **Production-Ready**: Robust error handling and metadata preservation
- **Citation Support**: Structured metadata for accurate source attribution
- **Scalable Vector Store**: Qdrant provides better performance and scalability than FAISS