# Load data

In [1]:
from docx import Document
import re

def read_docx(file_path):
    """Read a .docx file and return its paragraph text as a single string.

    Args:
        file_path: Location of the .docx file, passed straight to
            ``docx.Document``.

    Returns:
        The ``text`` of every entry in ``doc.paragraphs``, joined with
        newlines (empty paragraphs become empty lines).
    """
    # Import under a local alias: a later cell rebinds the module-level name
    # ``Document`` to LangChain's Document class, which would make this
    # function fail if it were re-run after that cell.
    from docx import Document as DocxDocument

    doc = DocxDocument(file_path)
    return '\n'.join(para.text for para in doc.paragraphs)


def extract_epic_overview(text: str):
    """
    Extract the "Epic Overview" section from the raw requirement document text.

    The overview is everything after the "Epic Overview" heading up to the
    next line that starts with a known heading (Requirements, a PSE<id>
    requirement, Narrative, Scope, Acceptance Criteria, Priority) or, when no
    such heading follows, up to the end of the text. Matching is
    case-insensitive.

    Returns the stripped overview text, or None if the heading is absent or
    the section contains only whitespace.
    """
    pattern = (
        r"Epic Overview\s*(.+?)"
        # Stop at the next known heading, or at end of text so an overview
        # that is the last section of the document is still captured.
        r"(?=\n(?:Requirements|PSE[0-9.]+|Narrative|Scope|Acceptance Criteria|Priority)|\Z)"
    )

    match = re.search(pattern, text, flags=re.DOTALL | re.IGNORECASE)
    if match is None:
        return None
    # An all-whitespace capture counts as "no overview found".
    return match.group(1).strip() or None


In [2]:
# Source requirement document. NOTE: this is an absolute local path and must
# be adjusted on any other machine.
DOCX_PATH = r'D:\RAG_testcases\Epic 1 – User Management and Authentication.docx'

# Read the raw text, then pull out the epic overview paragraph for later use.
text = read_docx(DOCX_PATH)
overview = extract_epic_overview(text)
print(overview)

This epic focuses on the development of a user management system that allows users to register, log in, manage their profiles, and access the application securely using email and password authentication. The system will ensure that users can create and maintain their accounts with ease while providing the necessary security features to protect user data.


# Chunking

In [3]:
import re
from langchain_text_splitters import RecursiveCharacterTextSplitter
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Section headings recognized inside each requirement block.
# split_requirements starts a new section whenever a stripped line begins
# with one of these strings.
SECTION_HEADERS = [
    "Narrative",
    "Scope",
    "In Scope",
    "Out of Scope",
    "Acceptance Criteria",
    "Priority",
    "Business Rules",
    "Additional Comments"
]

In [5]:
def split_requirements(req_text: str, headers=None):
    """Split one requirement block into ``(header, parent, content)`` sections.

    A line opens a new section when its stripped text starts with one of the
    known headers. The header line itself is not kept; the lines that follow
    (up to the next header) become that section's content.

    Args:
        req_text: Text of a single requirement.
        headers: Optional iterable of header strings to recognize. Defaults
            to the module-level ``SECTION_HEADERS``.

    Returns:
        A list of ``(header, None, content)`` tuples in document order. The
        middle element (parent section) is always None. Lines that appear
        before the first header are discarded, so text without any header
        yields an empty list.
    """
    if headers is None:
        headers = SECTION_HEADERS

    # Try longer headers first so that, when one header is a prefix of
    # another (e.g. "Scope" / "Scope Notes"), the more specific one wins.
    # The sort is stable, so equally long headers keep their given order.
    candidates = sorted(headers, key=len, reverse=True)

    sections = []
    active_header = None
    buffer = []

    for line in req_text.splitlines():
        stripped = line.strip()
        header = next((h for h in candidates if stripped.startswith(h)), None)

        if not header:
            buffer.append(line)
            continue

        # A new header closes the section that was being collected.
        if active_header:
            sections.append((active_header, None, "\n".join(buffer)))
        active_header = header
        buffer = []

    # Flush the final section.
    if active_header:
        sections.append((active_header, None, "\n".join(buffer)))

    return sections

def chunking_document(requirement_text: str, max_chunk_size: int = 800):
    """Chunk a requirement document into section-level chunks in one pass.

    The document is cut at every requirement header of the form
    ``PSE<id> – <name>`` (en dash). Each requirement block is split into
    sections by ``split_requirements``; a section whose stripped text fits in
    ``max_chunk_size`` characters becomes one chunk, longer sections are
    split with ``RecursiveCharacterTextSplitter`` (overlap 50).

    Args:
        requirement_text: Full text of the requirement document.
        max_chunk_size: Maximum number of characters per chunk.

    Returns:
        A list of ``{"text": str, "metadata": dict}`` dicts. Metadata holds
        ``lang``, ``epic_id``, ``requirement_id``, ``requirement_name``,
        ``priority`` and ``section`` (plus ``parent_section`` when the
        section reports a parent). Values that cannot be found are None.
        Every chunk owns its own metadata dict.
    """
    chunks = []

    # Epic-level fields, shared by every chunk of the document.
    lang = re.search(r"Language:\s*([a-zA-Z-]+)", requirement_text)
    epic_id = re.search(r"Epic ID:\s*([A-Za-z0-9]+)", requirement_text)

    epic_metadata = {
        "lang": lang.group(1) if lang else None,
        "epic_id": epic_id.group(1) if epic_id else None
    }

    # Requirement headers delimit the requirement blocks.
    req_pattern = r"(PSE[0-9.]+)\s*–\s*(.+?)\n"
    matches = list(re.finditer(req_pattern, requirement_text))

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=max_chunk_size,
        chunk_overlap=50,
        separators=["\n\n", "\n", " ", ""]
    )

    for i, m in tqdm(
        enumerate(matches),
        total=len(matches),
        desc="Chunking requirement",
        ncols=100
    ):
        req_id = m.group(1).strip()
        req_name = m.group(2).strip()

        # The block runs from the end of this header to the start of the
        # next one (or to the end of the document for the last requirement).
        start = m.end()
        end = matches[i+1].start() if i+1 < len(matches) else len(requirement_text)
        req_block = requirement_text[start:end].strip()

        # Priority is the upper-case word on the line after "Priority".
        priority_match = re.search(r"Priority\s*\n([A-Z]+)", req_block)
        priority = priority_match.group(1).strip() if priority_match else None

        # Requirement-level metadata
        base_metadata = {
            **epic_metadata,
            "requirement_id": req_id,
            "requirement_name": req_name,
            "priority": priority
        }

        for header, parent, content in split_requirements(req_block):
            body = content.strip()

            # Skip empty sections
            if not body:
                continue

            metadata = {**base_metadata, "section": header}
            if parent:
                metadata["parent_section"] = parent

            # Measure the stripped text (the text that is actually stored),
            # so surrounding blank lines never force an unnecessary split.
            if len(body) <= max_chunk_size:
                parts = [body]
            else:
                parts = splitter.split_text(body)

            # Copy the metadata per chunk: sharing one dict between the
            # parts of a section would let a change to one chunk's metadata
            # silently alter its siblings.
            chunks.extend({"text": part, "metadata": dict(metadata)} for part in parts)

    return chunks

In [6]:
# Chunk the loaded requirement text using the default max_chunk_size.
chunks = chunking_document(text)

Chunking requirement: 100%|███████████████████████████████████████████████████| 5/5 [00:00<?, ?it/s]


In [7]:
# Preview the first 11 chunks (indices 0-10) together with their metadata.
for i, chunk in enumerate(chunks[:11]):
    print(f"# Chunk number {i}:")
    print(chunk["text"])
    print("metadata: ", chunk["metadata"])
    print("-" * 80)

# Chunk number 0:
As a new user
I want to register using my email and a password
So that I can create an account and access the ProShop application.
metadata:  {'lang': 'en', 'epic_id': 'EP1', 'requirement_id': 'PSE1.1', 'requirement_name': 'ACCOUNT REGISTRATION', 'priority': 'MUST', 'section': 'Narrative'}
--------------------------------------------------------------------------------
# Chunk number 1:
A registration form
Input validation
Validate email format and password security
Successful registration handling
metadata:  {'lang': 'en', 'epic_id': 'EP1', 'requirement_id': 'PSE1.1', 'requirement_name': 'ACCOUNT REGISTRATION', 'priority': 'MUST', 'section': 'In Scope'}
--------------------------------------------------------------------------------
# Chunk number 2:
Email verification
The app will not send a verification email to confirm the user’s email address
metadata:  {'lang': 'en', 'epic_id': 'EP1', 'requirement_id': 'PSE1.1', 'requirement_name': 'ACCOUNT REGISTRATION', 'prior

# Embedding

In [8]:
from langchain_classic.schema import Document

def convert_chunks_to_documents(chunks: list) -> list:
    """Wrap every chunk dict in a ``Document``.

    Each chunk's ``"text"`` becomes ``page_content`` and its ``"metadata"``
    is passed through unchanged. A tqdm progress bar is shown while
    converting.
    """
    return [
        Document(page_content=chunk["text"], metadata=chunk["metadata"])
        for chunk in tqdm(chunks)
    ]

documents = convert_chunks_to_documents(chunks)

100%|██████████| 32/32 [00:00<00:00, 32132.57it/s]


In [9]:
# Sanity check: how many Document objects were created from the chunks
print("len of document", len(documents))

len of document 32


In [10]:
# Inspect the first document: its text content, then its metadata
print("content:")
print(documents[0].page_content)
print("metadata", documents[0].metadata)

content:
As a new user
I want to register using my email and a password
So that I can create an account and access the ProShop application.
metadata {'lang': 'en', 'epic_id': 'EP1', 'requirement_id': 'PSE1.1', 'requirement_name': 'ACCOUNT REGISTRATION', 'priority': 'MUST', 'section': 'Narrative'}


Load GEMINI API key for Embedding

In [11]:
import dotenv, os
import google.generativeai as genai

# Load environment variables via python-dotenv, then configure the Gemini
# client with the value of GEMINI_API_KEY (never hard-code the key here).
dotenv.load_dotenv()

genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

In [12]:
# Embedding with gemini

def gemini_embedding(text):
    """Embed the given inputs with Gemini ``models/text-embedding-004``.

    Args:
        text: The inputs to embed: an iterable of strings or of objects
            exposing ``page_content`` (such as the ``documents`` list). A
            single string or single document is treated as a one-item list.

    Returns:
        A list with one embedding vector per input, in input order.
    """
    # Embed what the caller passed in rather than a notebook global, so that
    # callers such as the speed benchmark measure exactly their own input.
    if isinstance(text, str) or hasattr(text, "page_content"):
        items = [text]
    else:
        items = list(text)

    gemini_vectors = []
    for item in tqdm(items, desc="Embedding with Gemini API key"):
        vec = genai.embed_content(
            model="models/text-embedding-004",
            # Documents carry their text in page_content; plain strings are used as-is.
            content=getattr(item, "page_content", item),
            task_type="retrieval_document"
        )["embedding"]

        gemini_vectors.append(vec)

    return gemini_vectors

gemini_vectors = gemini_embedding(documents)

Embedding with Gemini API key: 100%|██████████| 32/32 [00:08<00:00,  3.72it/s]


In [13]:
# Spot-check: show the first document and the Gemini vector computed for it
print(f"Content document: {documents[0]}\n")
print(f"Vector embedded by gemini: {gemini_vectors[0]}")

Content document: page_content='As a new user
I want to register using my email and a password
So that I can create an account and access the ProShop application.' metadata={'lang': 'en', 'epic_id': 'EP1', 'requirement_id': 'PSE1.1', 'requirement_name': 'ACCOUNT REGISTRATION', 'priority': 'MUST', 'section': 'Narrative'}

Vector embedded by gemini: [-0.016118374, -0.015438314, -0.060080536, -0.03404929, -5.298718e-05, -0.0044829617, 0.027117886, -0.025493694, -0.020630755, 0.050175447, 0.002624665, 0.04828648, 0.05080244, 0.0042121303, -0.081448205, -0.008521596, 0.001611033, -0.043183915, -0.10477846, -0.011694005, -0.03242967, -0.006052731, 0.00604692, 0.002498733, 0.022697981, 0.0013315491, 0.015580812, 0.0065895696, -0.030603776, -0.029822154, 0.022304822, 0.03949643, -0.03584215, -0.02070631, -0.04858017, 0.00945696, 0.04873407, -0.016222613, 0.043825578, -0.035165895, -0.06632894, 0.045635737, -0.05283397, 0.024452178, -0.00831275, -0.003916008, -0.014037976, 0.03491843, -0.016715

Use BGE-M3 for Embedding

In [14]:
import ollama

def bge_embedding(text):
    """Embed the given inputs with the ``bge-m3`` model through ``ollama.embed``.

    Args:
        text: The inputs to embed: an iterable of strings or of objects
            exposing ``page_content`` (such as the ``documents`` list). A
            single string or single document is treated as a one-item list.

    Returns:
        A list with one embedding vector per input, in input order.
    """
    # Embed what the caller passed in rather than a notebook global, so that
    # callers such as the speed benchmark measure exactly their own input.
    if isinstance(text, str) or hasattr(text, "page_content"):
        items = [text]
    else:
        items = list(text)

    bge_vectors = []
    for item in tqdm(items, desc="Embedding with BGE-M3"):
        res = ollama.embed(
            model="bge-m3",
            # Documents carry their text in page_content; plain strings are used as-is.
            input=getattr(item, "page_content", item)
        )
        # One input per call, so the single result is at index 0.
        bge_vectors.append(res['embeddings'][0])

    return bge_vectors

bge_vectors = bge_embedding(documents)

Embedding with BGE-M3: 100%|██████████| 32/32 [00:06<00:00,  4.71it/s]


In [15]:
# Spot-check: show the first document and the BGE-M3 vector computed for it
print(f"Content document: {documents[0]}\n")
print(f"Vector embedded by bge-m3: {bge_vectors[0]}")

Content document: page_content='As a new user
I want to register using my email and a password
So that I can create an account and access the ProShop application.' metadata={'lang': 'en', 'epic_id': 'EP1', 'requirement_id': 'PSE1.1', 'requirement_name': 'ACCOUNT REGISTRATION', 'priority': 'MUST', 'section': 'Narrative'}

Vector embedded by bge-m3: [-0.010891429, 0.0016670952, -0.032353766, 0.00804597, -0.0020478128, -0.030039407, 1.0338755e-05, -0.012521027, -0.015871393, 0.012432631, 0.0020477634, -0.014606137, -0.047933586, -0.030587632, -0.024535874, 0.02066505, 0.011298056, -0.019164642, 0.047080215, -0.018755795, 0.02607665, -0.012508197, 0.03872013, -0.024538327, -0.0035561398, -0.011252207, 0.01861017, 0.006258897, 0.029848661, -0.05080404, 0.019260831, -0.027090665, -0.014340831, 0.0014816275, -0.016161403, -0.008083703, -0.018446542, 0.0004982031, -0.041036725, 0.024627108, -0.0023993007, -0.051660825, -0.006137051, -0.04492809, 0.0027980057, -0.018459465, -0.010521612, 0.0153

Using Qwen3 0.6b embedding

In [16]:
def qwen3_embedding(text):
    """Embed the given inputs with ``qwen3-embedding:0.6b`` through ``ollama.embed``.

    Args:
        text: The inputs to embed: an iterable of strings or of objects
            exposing ``page_content`` (such as the ``documents`` list). A
            single string or single document is treated as a one-item list.

    Returns:
        A list with one embedding vector per input, in input order.
    """
    # Embed what the caller passed in rather than a notebook global, so that
    # callers such as the speed benchmark measure exactly their own input.
    if isinstance(text, str) or hasattr(text, "page_content"):
        items = [text]
    else:
        items = list(text)

    qwen3_vectors = []
    for item in tqdm(items, desc="Embedding with qwen3:0.6"):
        res = ollama.embed(
            model="qwen3-embedding:0.6b",
            # Documents carry their text in page_content; plain strings are used as-is.
            input=getattr(item, "page_content", item)
        )
        # One input per call, so the single result is at index 0.
        qwen3_vectors.append(res['embeddings'][0])
    return qwen3_vectors

qwen3_vectors = qwen3_embedding(documents)

Embedding with qwen3:0.6: 100%|██████████| 32/32 [00:03<00:00,  9.10it/s]


In [17]:
# Spot-check: show the first document and the Qwen3 vector computed for it
print(f"Content document: {documents[0]}\n")
print(f"Vector embedded by qwen3: {qwen3_vectors[0]}")

Content document: page_content='As a new user
I want to register using my email and a password
So that I can create an account and access the ProShop application.' metadata={'lang': 'en', 'epic_id': 'EP1', 'requirement_id': 'PSE1.1', 'requirement_name': 'ACCOUNT REGISTRATION', 'priority': 'MUST', 'section': 'Narrative'}

Vector embedded by qwen3: [-0.019141147, -0.05405345, -0.0028008805, -0.08329538, 0.013198454, -0.05539291, 0.053724978, 0.057799704, -0.033667162, 0.009225071, -0.031260494, 0.009029252, 0.010318268, -0.0022416636, -0.029521516, 0.04266203, -0.035191067, -0.0016826604, 0.032633107, 0.012011705, -0.0004937099, 0.0027120486, 0.019780299, 0.039326515, -0.027805544, -0.079710245, -0.017886616, 0.05847832, 0.039190423, -0.015507725, 0.040640704, -0.057615653, -0.022230882, -0.011754623, -0.033196475, -0.006012208, -0.018915936, -0.026341863, -0.012674084, 0.014988636, 0.027402135, -0.05745164, 0.107906215, 0.007767312, 0.053224828, 0.0065969364, -0.0319218, 0.039106835, 0.

Information of Vectors 

In [18]:
import pandas as pd

# Summarize, per model, how many vectors were produced and their dimension.
vector_sets = {
    "Gemini": gemini_vectors,
    "BGE-M3": bge_vectors,
    "Qwen3": qwen3_vectors,
}

df = pd.DataFrame(
    [[model, len(vectors), len(vectors[0]), "float"] for model, vectors in vector_sets.items()],
    columns=["Model", "Num Vectors", "Vector Dim", "dtype"],
)

print(df)

    Model  Num Vectors  Vector Dim  dtype
0  Gemini           32         768  float
1  BGE-M3           32        1024  float
2   Qwen3           32        1024  float


Benchmark the three embedding models by speed

In [19]:
import time
import numpy as np

# Speed benchmark helper
def benchmark_speed(func, texts, repeat=3):
    """Return the mean wall-clock time, in seconds, of calling ``func(texts)``.

    Args:
        func: Callable taking ``texts`` as its only argument.
        texts: Input handed unchanged to ``func`` on every run.
        repeat: Number of timed runs to average over.

    Returns:
        ``np.mean`` of the per-run durations.
    """
    print(f"Benchmarking speed of {func.__name__}")
    times = []
    for _ in range(repeat):
        # perf_counter is monotonic and high-resolution, unlike time.time(),
        # which can jump with system clock adjustments and is coarse on
        # some platforms.
        start = time.perf_counter()
        func(texts)
        times.append(time.perf_counter() - start)
    return np.mean(times)

In [20]:
# Plain-text inputs for the benchmark: the page content of every document
sample_sentences = [doc.page_content for doc in documents]

# Time each embedding function (mean seconds over the default repeat count)
speed_gemini = benchmark_speed(gemini_embedding, sample_sentences)
speed_bge    = benchmark_speed(bge_embedding, sample_sentences)
speed_qwen   = benchmark_speed(qwen3_embedding, sample_sentences)

# Report the mean time per full pass; lower is faster
print("Gemini speed:", speed_gemini)
print("BGE-M3 speed:", speed_bge)
print("Qwen3 speed:", speed_qwen)

Benchmarking speed of gemini_embedding


Embedding with Gemini API key: 100%|██████████| 32/32 [00:06<00:00,  4.84it/s]
Embedding with Gemini API key: 100%|██████████| 32/32 [00:06<00:00,  4.79it/s]
Embedding with Gemini API key: 100%|██████████| 32/32 [00:07<00:00,  4.27it/s]


Benchmarking speed of bge_embedding


Embedding with BGE-M3: 100%|██████████| 32/32 [00:06<00:00,  5.07it/s]
Embedding with BGE-M3: 100%|██████████| 32/32 [00:06<00:00,  5.15it/s]
Embedding with BGE-M3: 100%|██████████| 32/32 [00:05<00:00,  5.50it/s]


Benchmarking speed of qwen3_embedding


Embedding with qwen3:0.6: 100%|██████████| 32/32 [00:04<00:00,  7.42it/s]
Embedding with qwen3:0.6: 100%|██████████| 32/32 [00:03<00:00,  8.39it/s]
Embedding with qwen3:0.6: 100%|██████████| 32/32 [00:04<00:00,  7.60it/s]

Gemini speed: 6.929312070210774
BGE-M3 speed: 6.116976420084636
Qwen3 speed: 4.1153543790181475





# Create vector database

The embedding model chosen for creating the vector store (the fastest in the benchmark above) is

 *qwen3*

In [21]:
from typing import List
from langchain_core.embeddings import Embeddings

class OllamaQwen3Embeddings(Embeddings):
    """``Embeddings`` adapter that obtains vectors from ``ollama.embed``.

    The model name defaults to ``qwen3-embedding:0.6b``.
    """

    def __init__(self, model_name: str = "qwen3-embedding:0.6b"):
        self.model_name = model_name

    def _embed_one(self, text: str) -> List[float]:
        """Embed a single string and return its vector."""
        # Call the Ollama embedding API (presumably a locally running server).
        response = ollama.embed(model=self.model_name, input=text)
        # One input per call, so the single result is at index 0.
        return response['embeddings'][0]

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed each text in turn, showing a tqdm progress bar."""
        return [self._embed_one(text) for text in tqdm(texts)]

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string."""
        return self._embed_one(text)

In [22]:
from langchain_chroma import Chroma
from uuid import uuid4

def create_chroma_db(documents, vectors, persist_dir="chroma_db"):
    """Store documents and their precomputed vectors in a Chroma collection.

    Args:
        documents: Objects exposing ``page_content`` and ``metadata``.
        vectors: One precomputed embedding per document, in the same order.
            Pass None to let the store compute embeddings itself.
        persist_dir: Directory in which Chroma persists the collection.

    Returns:
        The ``Chroma`` vector store for collection ``RAG_testcase``.

    Raises:
        ValueError: If ``vectors`` is given but its length differs from the
            number of documents.
    """
    import json
    from uuid import NAMESPACE_URL, uuid5

    texts = [doc.page_content for doc in documents]
    metadatas = [doc.metadata for doc in documents]

    if vectors is not None and len(vectors) != len(texts):
        raise ValueError(
            f"Expected {len(texts)} vectors (one per document), got {len(vectors)}"
        )

    # Deterministic ids derived from position + content + metadata: running
    # this cell again against the persisted collection overwrites the same
    # records instead of appending a duplicate copy of every chunk.
    ids = [
        str(uuid5(NAMESPACE_URL, json.dumps([i, content, meta], sort_keys=True, default=str)))
        for i, (content, meta) in enumerate(zip(texts, metadatas))
    ]

    vectordb = Chroma(
        collection_name="RAG_testcase",
        embedding_function=OllamaQwen3Embeddings(),
        persist_directory=persist_dir
    )

    if vectors is None:
        # No precomputed vectors: the store embeds the texts itself.
        vectordb.add_texts(texts=texts, metadatas=metadatas, ids=ids)
    elif texts:
        # add_texts always re-embeds through embedding_function and ignores
        # an `embeddings` keyword, so write the precomputed vectors through
        # the underlying collection to avoid embedding every text twice.
        vectordb._collection.upsert(
            ids=ids,
            embeddings=list(vectors),
            documents=texts,
            metadatas=metadatas
        )

    return vectordb

vectordb = create_chroma_db(documents=documents, vectors=qwen3_vectors)

100%|██████████| 32/32 [00:03<00:00,  8.25it/s]


In [24]:
# Peek at stored records in the underlying collection and report the
# embedding dimensionality (second axis of the returned embeddings array).
current_dim = vectordb._collection.peek()['embeddings'].shape[1]
print(f"Dimension trong DB: {current_dim}") 

Dimension trong DB: 1024


Test Query

In [49]:
import json
# Sample query used to exercise the vector store
query_test = "As a logged-in user I want to update my profile information So that I can keep my personal details up to date."

# Retrieve the 5 most similar chunks
results = vectordb.similarity_search(query=query_test, k=5)

# Show only the metadata of each hit, pretty-printed as JSON
for result in results:
    print(json.dumps(result.metadata, indent=2, ensure_ascii=False))

{
  "epic_id": "EP1",
  "lang": "en",
  "requirement_name": "EDIT PROFILE",
  "requirement_id": "PSE1.4",
  "priority": "MUST",
  "section": "Narrative"
}
{
  "requirement_id": "PSE1.3",
  "epic_id": "EP1",
  "lang": "en",
  "priority": "MUST",
  "requirement_name": "VIEW PROFILE",
  "section": "Narrative"
}
{
  "epic_id": "EP1",
  "section": "Narrative",
  "requirement_id": "PSE1.5",
  "requirement_name": "CHANGE PASSWORD",
  "lang": "en",
  "priority": "MUST"
}
{
  "requirement_name": "VIEW PROFILE",
  "epic_id": "EP1",
  "priority": "MUST",
  "lang": "en",
  "section": "In Scope",
  "requirement_id": "PSE1.3"
}
{
  "epic_id": "EP1",
  "priority": "MUST",
  "lang": "en",
  "requirement_id": "PSE1.4",
  "requirement_name": "EDIT PROFILE",
  "section": "Out of Scope"
}


# Retriever 

In [56]:
from typing import List, Dict

def retrieve_documents(vectordb, query: str, k: int = 5):
    """Retrieve every stored section of the requirements that match a query.

    A similarity search finds the ``k`` closest chunks; the distinct
    ``requirement_id`` values of those hits are then expanded to all chunks
    stored for each requirement.

    Args:
        vectordb: Vector store exposing ``similarity_search`` and an
            underlying ``_collection`` with ``get``.
        query: Natural-language query text.
        k: Number of nearest chunks used to pick the requirements.

    Returns:
        A list of formatted strings ("Requirement ID / Section / Content"),
        grouped by requirement in order of first appearance in the search
        results (best match first). Empty if the search returns nothing.
    """
    print(f"Query input: {query}")

    results = vectordb.similarity_search(query, k=k)

    if not results:
        print("No documents found.")
        return []

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # requirements stay ranked by similarity and the output is deterministic
    # (a set would iterate in arbitrary order).
    hit_ids = (doc.metadata.get("requirement_id") for doc in results)
    target_id = list(dict.fromkeys(req_id for req_id in hit_ids if req_id))

    print(f"Target IDs found: {target_id}")

    final_context = []
    collection = vectordb._collection

    for req_id in target_id:
        # Fetch every stored chunk that belongs to this requirement.
        expanded_data = collection.get(
            where={"requirement_id": req_id},
            include=["documents", "metadatas"]
        )

        texts = expanded_data['documents']
        metas = expanded_data['metadatas']

        if not texts:
            continue

        for text, meta in zip(texts, metas):
            final_context.append(
                f"Requirement ID: {meta['requirement_id']}\nSection: {meta['section']}\nContent: {text}"
            )

    return final_context

In [57]:
context_list = retrieve_documents(vectordb=vectordb, query=query_test)

# Join the retrieved sections with a dashed separator line.
retrieved_context = f"\n{'-' * 20}\n".join(context_list)

# Prepend the epic overview. extract_epic_overview can return None, so the
# prefix is added only when an overview exists (None + str would raise).
overview_prefix = overview + "\n\n" if overview else ""
final_context_text = overview_prefix + retrieved_context

print("\n=== CONTEXT CUỐI CÙNG GỬI CHO LLM ===")
print(final_context_text)

Query input: As a logged-in user I want to update my profile information So that I can keep my personal details up to date.
Target IDs found: {'PSE1.4', 'PSE1.5', 'PSE1.3'}

=== CONTEXT CUỐI CÙNG GỬI CHO LLM ===
This epic focuses on the development of a user management system that allows users to register, log in, manage their profiles, and access the application securely using email and password authentication. The system will ensure that users can create and maintain their accounts with ease while providing the necessary security features to protect user data.

Requirement ID: PSE1.4
Section: Narrative
Content: As a logged-in user
I want to update my profile information
So that I can keep my personal details up to date.
--------------------
Requirement ID: PSE1.4
Section: In Scope
Content: Editable fields:
Full name
Email address
Form validation:
Validate all inputs before submission
Valid email format
--------------------
Requirement ID: PSE1.4
Section: Out of Scope
Content: Passwor

# LLMs