# Breaking down the document into chunks

In [24]:
import nltk
nltk.download("punkt")
from nltk.tokenize import sent_tokenize

def chunk_by_sentences(text, max_tokens=20):
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = []
    total_tokens = 0

    for sentence in sentences:
        tokens = tokenizer.tokenize(sentence)
        token_count = len(tokens)

        if total_tokens + token_count > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            total_tokens = 0

        current_chunk.append(sentence)
        total_tokens += token_count

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

full_text = """Ayutthaya, located in the central plains of Thailand, is a city steeped in history and cultural significance. 
              Established in 1350 by King U Thong, it became the second capital of the Siamese Kingdom after Sukhothai. 
              Over the course of more than four centuries, Ayutthaya flourished as one of the world’s most important trading and diplomatic hubs. 
              Its strategic location between China, India, and the Malay Archipelago allowed it to attract merchants from Europe, the Middle East, and across Asia. 
              The city was renowned for its grandeur, boasting impressive temples, royal palaces, and advanced urban planning, all surrounded by a network of rivers that served as natural defenses.
              Ayutthaya was a melting pot of cultures, where Thai, Khmer, Chinese, Japanese, Portuguese, Dutch, and French communities lived and traded side by side. 
              Its architecture reflected this diversity, blending traditional Thai styles with foreign influences in everything from temples to fortifications. 
              Unfortunately, in 1767, the city fell to Burmese invaders, who looted and burned much of it to the ground. 
              Despite this devastation, the ruins of Ayutthaya still stand as a powerful reminder of the city’s former glory. 
              Today, the Ayutthaya Historical Park, a UNESCO World Heritage Site, preserves many of the ancient structures, such as Wat Phra Si Sanphet, Wat Mahathat, and the iconic Buddha head entwined in tree roots. 
              These ruins attract visitors from around the world who come to witness the beauty and historical depth of a civilization that once stood at the crossroads of global trade and diplomacy.
            """

chunks = chunk_by_sentences(full_text)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [25]:
chunks

['',
 'Ayutthaya, located in the central plains of Thailand, is a city steeped in history and cultural significance.',
 'Established in 1350 by King U Thong, it became the second capital of the Siamese Kingdom after Sukhothai.',
 'Over the course of more than four centuries, Ayutthaya flourished as one of the world’s most important trading and diplomatic hubs.',
 'Its strategic location between China, India, and the Malay Archipelago allowed it to attract merchants from Europe, the Middle East, and across Asia.',
 'The city was renowned for its grandeur, boasting impressive temples, royal palaces, and advanced urban planning, all surrounded by a network of rivers that served as natural defenses.',
 'Ayutthaya was a melting pot of cultures, where Thai, Khmer, Chinese, Japanese, Portuguese, Dutch, and French communities lived and traded side by side.',
 'Its architecture reflected this diversity, blending traditional Thai styles with foreign influences in everything from temples to forti

In [26]:
print("Number of chunks:", len(chunks))

Number of chunks: 12


# Indexing

In [27]:
import os
import faiss
import pickle
from sentence_transformers import SentenceTransformer

# Step 1: Prepare your document chunks (load or define here)

documents = chunks  # Each chunk is a "document" in this simple setup

# Step 2: Embed the documents
print("🔍 Loading embedding model...")
embedder = SentenceTransformer('all-MiniLM-L6-v2')  # Fast & good quality

print("🔢 Encoding documents...")
document_embeddings = embedder.encode(documents, convert_to_numpy=True, show_progress_bar=True)

# Step 3: Create and populate FAISS index
dimension = document_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)  # L2 distance
index.add(document_embeddings)

# Step 4: Save index and documents for retrieval + reranking
print("💾 Saving FAISS index and document chunks...")
faiss.write_index(index, "rag_index.faiss")

with open("rag_documents.pkl", "wb") as f:
    pickle.dump(documents, f)

print("✅ Indexing complete. Rerank-ready!")


🔍 Loading embedding model...
🔢 Encoding documents...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

💾 Saving FAISS index and document chunks...
✅ Indexing complete. Rerank-ready!


In [28]:
document_embeddings.shape

(12, 384)

# Retrieval and Generation

In [29]:
import torch
import pickle
import faiss
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer, CrossEncoder

# Step 1: Load FAISS index and documents
index = faiss.read_index("rag_index.faiss")
with open("rag_documents.pkl", "rb") as f:
    documents = pickle.load(f)

# Step 2: Load embedding and reranker models
embedder = SentenceTransformer('all-MiniLM-L6-v2')
reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')  # Reranking model

# Step 3: Load local LLM
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

# -------------------------------
# Step 4: Retrieval + Reranking
# -------------------------------
def retrieve(query, top_k=10, final_k=5):
    # Coarse retrieval via FAISS
    query_embedding = embedder.encode([query], convert_to_numpy=True)
    _, indices = index.search(query_embedding, top_k)
    retrieved_chunks = [documents[i] for i in indices[0]]

    # Rerank via CrossEncoder
    pairs = [(query, chunk) for chunk in retrieved_chunks]
    scores = reranker.predict(pairs)

    # Sort by score (descending)
    reranked = sorted(zip(retrieved_chunks, scores), key=lambda x: x[1], reverse=True)
    top_chunks = [chunk for chunk, _ in reranked[:final_k]]

    # Print retrieved chunks with scores
    print("🔍 Top Retrieved Chunks (after reranking):")
    for i, (chunk, score) in enumerate(reranked[:final_k]):
        print(f"\n--- Rank {i+1} (Score: {score:.4f}) ---\n{chunk.strip()}\n")

    return top_chunks

# -------------------------------
# Step 5: Answer Generation
# -------------------------------
def generate_answer(query, context_docs):
    context = "\n\n".join(context_docs)
    prompt = f"""[INST] Use the following context to answer the question.\n\nContext:\n{context}\n\nQuestion: {query} [/INST]"""

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096).to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=False,
        temperature=0.3
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True).split("[/INST]")[-1].strip()

# -------------------------------
# Step 6: Run RAG Query
# -------------------------------

query = "What was the first capital city before Ayutthaya?"
retrieved = retrieve(query)
answer = generate_answer(query, retrieved)

print("🧠 Answer:\n", answer)

Some parameters are on the meta device because they were offloaded to the cpu.


🔍 Top Retrieved Chunks (after reranking):

--- Rank 1 (Score: 1.6141) ---
Ayutthaya, located in the central plains of Thailand, is a city steeped in history and cultural significance.


--- Rank 2 (Score: -1.0811) ---
Over the course of more than four centuries, Ayutthaya flourished as one of the world’s most important trading and diplomatic hubs.


--- Rank 3 (Score: -1.1523) ---
Ayutthaya was a melting pot of cultures, where Thai, Khmer, Chinese, Japanese, Portuguese, Dutch, and French communities lived and traded side by side.


--- Rank 4 (Score: -1.6424) ---
Despite this devastation, the ruins of Ayutthaya still stand as a powerful reminder of the city’s former glory.


--- Rank 5 (Score: -3.4244) ---
Today, the Ayutthaya Historical Park, a UNESCO World Heritage Site, preserves many of the ancient structures, such as Wat Phra Si Sanphet, Wat Mahathat, and the iconic Buddha head entwined in tree roots.

🧠 Answer:
 Answer: The second capital city after Ayutthaya was Chiang Mai.

Con