# Breaking down the document into chunks

In [1]:
import nltk
nltk.download("punkt")
from nltk.tokenize import sent_tokenize

def chunk_by_sentences(text, max_tokens=20):
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = []
    total_tokens = 0

    for sentence in sentences:
        tokens = tokenizer.tokenize(sentence)
        token_count = len(tokens)

        if total_tokens + token_count > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            total_tokens = 0

        current_chunk.append(sentence)
        total_tokens += token_count

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

full_text = """Ayutthaya, located in the central plains of Thailand, is a city steeped in history and cultural significance. 
              Established in 1350 by King U Thong, it became the second capital of the Siamese Kingdom after Sukhothai. 
              Over the course of more than four centuries, Ayutthaya flourished as one of the world’s most important trading and diplomatic hubs. 
              Its strategic location between China, India, and the Malay Archipelago allowed it to attract merchants from Europe, the Middle East, and across Asia. 
              The city was renowned for its grandeur, boasting impressive temples, royal palaces, and advanced urban planning, all surrounded by a network of rivers that served as natural defenses.
              Ayutthaya was a melting pot of cultures, where Thai, Khmer, Chinese, Japanese, Portuguese, Dutch, and French communities lived and traded side by side. 
              Its architecture reflected this diversity, blending traditional Thai styles with foreign influences in everything from temples to fortifications. 
              Unfortunately, in 1767, the city fell to Burmese invaders, who looted and burned much of it to the ground. 
              Despite this devastation, the ruins of Ayutthaya still stand as a powerful reminder of the city’s former glory. 
              Today, the Ayutthaya Historical Park, a UNESCO World Heritage Site, preserves many of the ancient structures, such as Wat Phra Si Sanphet, Wat Mahathat, and the iconic Buddha head entwined in tree roots. 
              These ruins attract visitors from around the world who come to witness the beauty and historical depth of a civilization that once stood at the crossroads of global trade and diplomacy.
            """

chunks = chunk_by_sentences(full_text)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
chunks

['',
 'Ayutthaya, located in the central plains of Thailand, is a city steeped in history and cultural significance.',
 'Established in 1350 by King U Thong, it became the second capital of the Siamese Kingdom after Sukhothai.',
 'Over the course of more than four centuries, Ayutthaya flourished as one of the world’s most important trading and diplomatic hubs.',
 'Its strategic location between China, India, and the Malay Archipelago allowed it to attract merchants from Europe, the Middle East, and across Asia.',
 'The city was renowned for its grandeur, boasting impressive temples, royal palaces, and advanced urban planning, all surrounded by a network of rivers that served as natural defenses.',
 'Ayutthaya was a melting pot of cultures, where Thai, Khmer, Chinese, Japanese, Portuguese, Dutch, and French communities lived and traded side by side.',
 'Its architecture reflected this diversity, blending traditional Thai styles with foreign influences in everything from temples to forti

In [3]:
print("Number of chunks:", len(chunks))

Number of chunks: 12


# Indexing

In [5]:
import os
import faiss
import pickle
from sentence_transformers import SentenceTransformer
from collections import defaultdict
import numpy as np

# Assuming `chunk_metadata` is a list of dicts like:
# [{"doc_id": "doc1", "section": "intro", "text": "..."}, ...]
# Or at minimum: [{"doc_id": "doc1", "text": "..."}, ...]

# Example: simulate grouped chunks
chunk_metadata = [{"doc_id": f"doc_{i//5}", "text": chunk} for i, chunk in enumerate(chunks)]

# Step 1: Group by document
grouped_docs = defaultdict(list)
for meta in chunk_metadata:
    grouped_docs[meta["doc_id"]].append(meta["text"])

# Step 2: Load embedder
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Step 3: Build per-document (Level 1) and chunk-level (Level 2) FAISS indexes
level1_doc_ids = []
level1_doc_embeddings = []

level2_indexes = {}  # doc_id -> FAISS index
level2_chunks = {}   # doc_id -> chunk list

for doc_id, chunk_list in grouped_docs.items():
    # Level 2: Chunk-level index
    chunk_embeddings = embedder.encode(chunk_list, convert_to_numpy=True)
    dim = chunk_embeddings.shape[1]
    idx = faiss.IndexFlatL2(dim)
    idx.add(chunk_embeddings)
    level2_indexes[doc_id] = idx
    level2_chunks[doc_id] = chunk_list

    # Level 1: Document-level representation (mean embedding)
    doc_embedding = chunk_embeddings.mean(axis=0)
    level1_doc_embeddings.append(doc_embedding)
    level1_doc_ids.append(doc_id)

# Step 4: Save Level 1 index
dim = level1_doc_embeddings[0].shape[0]
level1_index = faiss.IndexFlatL2(dim)
level1_index.add(np.array(level1_doc_embeddings))

faiss.write_index(level1_index, "level1_index.faiss")
with open("level1_doc_ids.pkl", "wb") as f:
    pickle.dump(level1_doc_ids, f)

# Step 5: Save Level 2 indexes
for doc_id, idx in level2_indexes.items():
    faiss.write_index(idx, f"level2_index__{doc_id}.faiss")
    with open(f"level2_chunks__{doc_id}.pkl", "wb") as f:
        pickle.dump(level2_chunks[doc_id], f)

print("✅ Two-level indexing complete.")


✅ Two-level indexing complete.


In [6]:
doc_embedding.shape

(384,)

# Retrieval and Generation

In [8]:
import os
import torch
import pickle
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer

# Load Level 1 document index and IDs
level1_index = faiss.read_index("level1_index.faiss")
with open("level1_doc_ids.pkl", "rb") as f:
    level1_doc_ids = pickle.load(f)

# Load embedder
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Load local LLM (TinyLlama or any other)
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Step 1: Hierarchical Retrieval
def retrieve(query, top_docs=3, top_chunks=2):
    # Embed the query
    query_embedding = embedder.encode([query], convert_to_numpy=True)

    # Level 1: retrieve top documents
    _, doc_indices = level1_index.search(query_embedding, top_docs)
    selected_doc_ids = [level1_doc_ids[i] for i in doc_indices[0]]

    retrieved_chunks = []

    for doc_id in selected_doc_ids:
        # Load Level 2 index and chunks for that document
        level2_index = faiss.read_index(f"level2_index__{doc_id}.faiss")
        with open(f"level2_chunks__{doc_id}.pkl", "rb") as f:
            doc_chunks = pickle.load(f)

        # Level 2: search for best chunks within this document
        _, chunk_indices = level2_index.search(query_embedding, top_chunks)
        retrieved_chunks.extend([doc_chunks[i] for i in chunk_indices[0]])

    return retrieved_chunks

# Step 2: Generation
def generate_answer(query, context_docs):
    context = "\n\n".join(context_docs)
    prompt = f"""[INST] Use the following context to answer the question.\n\nContext:\n{context}\n\nQuestion: {query} [/INST]"""

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096).to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=False,
        temperature=0.3
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True).split("[/INST]")[-1].strip()

# Step 3: Run example
query = "When was Ayutthaya established and by whom ?"
retrieved = retrieve(query)
answer = generate_answer(query, retrieved)

print("🔍 Retrieved Chunks:\n", "\n---\n".join(retrieved))
print("\n🧠 Answer:\n", answer)


Some parameters are on the meta device because they were offloaded to the cpu.


🔍 Retrieved Chunks:
 Ayutthaya, located in the central plains of Thailand, is a city steeped in history and cultural significance.
---
Over the course of more than four centuries, Ayutthaya flourished as one of the world’s most important trading and diplomatic hubs.
---
Ayutthaya was a melting pot of cultures, where Thai, Khmer, Chinese, Japanese, Portuguese, Dutch, and French communities lived and traded side by side.
---
Despite this devastation, the ruins of Ayutthaya still stand as a powerful reminder of the city’s former glory.
---
Today, the Ayutthaya Historical Park, a UNESCO World Heritage Site, preserves many of the ancient structures, such as Wat Phra Si Sanphet, Wat Mahathat, and the iconic Buddha head entwined in tree roots.
---
These ruins attract visitors from around the world who come to witness the beauty and historical depth of a civilization that once stood at the crossroads of global trade and diplomacy.

🧠 Answer:
 Answers:

1. When was Ayutthaya established and by 