# Breaking down the document into chunks

In [12]:
import nltk
nltk.download("punkt")
from nltk.tokenize import sent_tokenize

def chunk_by_sentences(text, max_tokens=20):
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = []
    total_tokens = 0

    for sentence in sentences:
        tokens = tokenizer.tokenize(sentence)
        token_count = len(tokens)

        if total_tokens + token_count > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            total_tokens = 0

        current_chunk.append(sentence)
        total_tokens += token_count

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

full_text = """Ayutthaya, located in the central plains of Thailand, is a city steeped in history and cultural significance. 
              Established in 1350 by King U Thong, it became the second capital of the Siamese Kingdom after Sukhothai. 
              Over the course of more than four centuries, Ayutthaya flourished as one of the world’s most important trading and diplomatic hubs. 
              Its strategic location between China, India, and the Malay Archipelago allowed it to attract merchants from Europe, the Middle East, and across Asia. 
              The city was renowned for its grandeur, boasting impressive temples, royal palaces, and advanced urban planning, all surrounded by a network of rivers that served as natural defenses.
              Ayutthaya was a melting pot of cultures, where Thai, Khmer, Chinese, Japanese, Portuguese, Dutch, and French communities lived and traded side by side. 
              Its architecture reflected this diversity, blending traditional Thai styles with foreign influences in everything from temples to fortifications. 
              Unfortunately, in 1767, the city fell to Burmese invaders, who looted and burned much of it to the ground. 
              Despite this devastation, the ruins of Ayutthaya still stand as a powerful reminder of the city’s former glory. 
              Today, the Ayutthaya Historical Park, a UNESCO World Heritage Site, preserves many of the ancient structures, such as Wat Phra Si Sanphet, Wat Mahathat, and the iconic Buddha head entwined in tree roots. 
              These ruins attract visitors from around the world who come to witness the beauty and historical depth of a civilization that once stood at the crossroads of global trade and diplomacy.
            """

chunks = chunk_by_sentences(full_text)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
chunks

['',
 'Ayutthaya, located in the central plains of Thailand, is a city steeped in history and cultural significance.',
 'Established in 1350 by King U Thong, it became the second capital of the Siamese Kingdom after Sukhothai.',
 'Over the course of more than four centuries, Ayutthaya flourished as one of the world’s most important trading and diplomatic hubs.',
 'Its strategic location between China, India, and the Malay Archipelago allowed it to attract merchants from Europe, the Middle East, and across Asia.',
 'The city was renowned for its grandeur, boasting impressive temples, royal palaces, and advanced urban planning, all surrounded by a network of rivers that served as natural defenses.',
 'Ayutthaya was a melting pot of cultures, where Thai, Khmer, Chinese, Japanese, Portuguese, Dutch, and French communities lived and traded side by side.',
 'Its architecture reflected this diversity, blending traditional Thai styles with foreign influences in everything from temples to forti

In [14]:
print("Number of chunks:", len(chunks))

Number of chunks: 12


# Indexing

In [15]:
import os
import faiss
import pickle
from sentence_transformers import SentenceTransformer

documents = chunks

# Step 2: Embed the documents
embedder = SentenceTransformer('all-MiniLM-L6-v2')  # Small, fast model
document_embeddings = embedder.encode(documents, convert_to_numpy=True)

# Step 3: Create FAISS index
dimension = document_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)  # L2 distance
index.add(document_embeddings)

# Step 4: Save index and documents for retrieval
faiss.write_index(index, "rag_index.faiss")
with open("rag_documents.pkl", "wb") as f:
    pickle.dump(documents, f)

print("✅ Indexing complete. You can now use this index for retrieval.")


✅ Indexing complete. You can now use this index for retrieval.


In [16]:
document_embeddings.shape

(12, 384)

**How it works inside**

In [11]:
import faiss
import numpy as np

# Dimension of your vectors
d = 3

# Your vectors (converted to float32 as Faiss expects)
vector1 = np.array([1, 2, 2], dtype='float32')
vector2 = np.array([2, 3, 4], dtype='float32')
vector3 = np.array([0, 1, 0], dtype='float32')

# Create a batch of vectors to add
xb = np.array([vector1, vector2, vector3])

# Create a simple Faiss index (IndexFlatL2 for brute-force Euclidean distance)
index = faiss.IndexFlatL2(d)

# Add the vectors to the index
index.add(xb)

print(f"Number of vectors in the index: {index.ntotal}")

# Now, let's search for a query
query_vector = np.array([[1.1, 2.1, 2.2]], dtype='float32')

k = 1 # We want the 1 nearest neighbor
distances, faiss_internal_ids = index.search(query_vector, k)

print(f"\nQuery: {query_vector[0]}")
retrieved_id = faiss_internal_ids[0][0]
print(f"Nearest neighbor's Faiss internal ID: {retrieved_id}")
print(f"Distance to nearest neighbor: {distances[0][0]}")

# --- THE CORRECT FIX IS HERE ---
# Call reconstruct with only the ID. It returns the vector.
reconstructed_vec = index.reconstruct(int(retrieved_id))
# --- END OF CORRECT FIX ---

print(f"Reconstructed vector: {reconstructed_vec}")
print(f"Original vector1: {vector1}")

Number of vectors in the index: 3

Query: [1.1 2.1 2.2]
Nearest neighbor's Faiss internal ID: 0
Distance to nearest neighbor: 0.06000000238418579
Reconstructed vector: [1. 2. 2.]
Original vector1: [1. 2. 2.]


# Retrieval and Generation

In [20]:
import torch
import pickle
import faiss
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer

# Step 1: Load FAISS index and documents
index = faiss.read_index("rag_index.faiss")
with open("rag_documents.pkl", "rb") as f:
    documents = pickle.load(f)

# Step 2: Load embedding model
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Step 3: Load local LLM (e.g., Mistral, TinyLlama)
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" # Replace with any other local model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Step 4: Retrieval
def retrieve(query, top_k=5):
    query_embedding = embedder.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, top_k)
    return [documents[i] for i in indices[0]]

# Step 5: Generation
def generate_answer(query, context_docs):
    context = "\n\n".join(context_docs)
    prompt = f"""[INST] Use the following context to answer the question.\n\nContext:\n{context}\n\nQuestion: {query} [/INST]"""

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096).to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=False,
        temperature=0.3
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True).split("[/INST]")[-1].strip()

# Step 6: Run an example
query = "When was ayutthaya established ?"
retrieved = retrieve(query)
answer = generate_answer(query, retrieved)

print("🔍 Retrieved Chunks:\n", "\n---\n".join(retrieved))
print("\n🧠 Answer:\n", answer)


Some parameters are on the meta device because they were offloaded to the cpu.


🔍 Retrieved Chunks:
 Ayutthaya, located in the central plains of Thailand, is a city steeped in history and cultural significance.
---
Over the course of more than four centuries, Ayutthaya flourished as one of the world’s most important trading and diplomatic hubs.
---
Ayutthaya was a melting pot of cultures, where Thai, Khmer, Chinese, Japanese, Portuguese, Dutch, and French communities lived and traded side by side.
---
Despite this devastation, the ruins of Ayutthaya still stand as a powerful reminder of the city’s former glory.
---
Today, the Ayutthaya Historical Park, a UNESCO World Heritage Site, preserves many of the ancient structures, such as Wat Phra Si Sanphet, Wat Mahathat, and the iconic Buddha head entwined in tree roots.

🧠 Answer:
 Answer: Ayutthaya was established in 1350 by King Ramathibodi I.
