In [None]:
!pip install langchain chromadb transformers sentence-transformers pdfminer.six PyPDF2 --quiet


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.3/19.3 MB[0m [31m57.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.9/94.9 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m64.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.2/284.2 kB[0m [31m15.5 MB/s[0m eta [36m0:00:

In [None]:
from sentence_transformers import SentenceTransformer
import chromadb

class VectorStore:
    """Chroma-backed store of text chunks with a chunk-ID -> page lookup.

    Chunks are embedded with the ``all-MiniLM-L6-v2`` SentenceTransformer and
    added to a Chroma collection obtained from ``chromadb.Client()``. The page
    each chunk came from is remembered in ``page_map`` keyed by chunk ID.
    """

    def __init__(self, collection_name="rag_docs"):
        """Open (or create) the collection and load the embedding model.

        Args:
            collection_name: Name passed to ``get_or_create_collection``.
        """
        self.client = chromadb.Client()
        self.collection = self.client.get_or_create_collection(collection_name)
        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
        # Maps chunk ID (str) -> page value supplied by the caller.
        self.page_map = {}

    def add_documents(self, docs_with_pages):
        """Embed and store a batch of ``(text, page)`` pairs.

        May be called repeatedly (e.g. once per PDF); every call receives
        fresh IDs so earlier batches are never clobbered.

        Args:
            docs_with_pages: Iterable of ``(text, page)`` tuples. An empty
                iterable is accepted and is a no-op.
        """
        docs_with_pages = list(docs_with_pages)
        if not docs_with_pages:
            # zip(*[]) cannot be unpacked into two names; nothing to add anyway.
            return

        texts, pages = (list(column) for column in zip(*docs_with_pages))
        embeddings = self.embedder.encode(texts).tolist()

        # Continue numbering after whatever the collection already holds.
        # Restarting at 0 on every call made the second batch reuse the IDs of
        # the first, colliding in Chroma and overwriting page_map entries.
        first_id = self.collection.count()
        ids = [str(first_id + offset) for offset in range(len(texts))]

        self.collection.add(documents=texts, embeddings=embeddings, ids=ids)
        self.page_map.update(zip(ids, pages))

    def retrieve(self, query, top_k=5):
        """Return the ``top_k`` stored chunks closest to ``query``.

        Args:
            query: Free-text query string.
            top_k: Number of results requested from the collection.

        Returns:
            List of ``(chunk_text, page)`` tuples in the order returned by the
            collection; ``page`` is ``'?'`` for IDs missing from ``page_map``.
        """
        query_embedding = self.embedder.encode([query])[0].tolist()
        results = self.collection.query(query_embeddings=[query_embedding], n_results=top_k)
        # Results are nested per query; only one query was sent, so take index 0.
        docs = results['documents'][0]
        ids = results['ids'][0]
        pages = [self.page_map.get(doc_id, '?') for doc_id in ids]
        return list(zip(docs, pages))


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def load_and_split_pdf(pdf_path, chunk_size=500, chunk_overlap=50):
    """Read a PDF and split each page's text into overlapping chunks.

    Pages are split independently, so a chunk never spans two pages and every
    chunk can be attributed to exactly one page number.

    Args:
        pdf_path: Path of the PDF file, passed to ``PyPDF2.PdfReader``.
        chunk_size: ``chunk_size`` given to ``RecursiveCharacterTextSplitter``
            (defaults to the previously hard-coded 500).
        chunk_overlap: ``chunk_overlap`` given to the splitter
            (defaults to the previously hard-coded 50).

    Returns:
        List of ``(chunk_text, page_number)`` tuples with 1-based page numbers.
        Pages for which no text could be extracted contribute nothing, so the
        list may be empty.
    """
    from PyPDF2 import PdfReader

    reader = PdfReader(pdf_path)
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    chunks_with_pages = []
    # enumerate from 1 so the stored page number matches human page counting.
    for page_number, page in enumerate(reader.pages, start=1):
        text = page.extract_text()
        if not text:
            # extract_text() gave None/"" for this page - skip it.
            continue
        chunks_with_pages.extend((chunk, page_number) for chunk in splitter.split_text(text))
    return chunks_with_pages


In [None]:
from google.colab import files
# Open Colab's upload dialog; the returned mapping is keyed by file name.
uploaded = files.upload()
# Keep just the names - they are the paths handed to the PDF loader later.
pdf_paths = [name for name in uploaded]


Saving 2005.14165v4.pdf to 2005.14165v4 (2).pdf
Saving 2005.11401v4.pdf to 2005.11401v4 (2).pdf
Saving 1706.03762v7.pdf to 1706.03762v7 (2).pdf


In [None]:
# Build the vector index from every uploaded PDF.
vs = VectorStore()

indexed_count = 0
for path in pdf_paths:
    chunks = load_and_split_pdf(path)
    if not chunks:
        # No extractable text on any page: there is nothing to embed, and
        # passing an empty list on would fail when the pairs are unzipped.
        print(f"⚠️ Skipped {path}: no extractable text.")
        continue
    vs.add_documents(chunks)
    indexed_count += 1

# Report only the papers that actually made it into the index.
print(f"✅ Indexed {indexed_count} paper(s).")


✅ Indexed 3 paper(s).


In [None]:
from transformers import pipeline

# Text-to-text generation pipeline shared by every question asked below.
qa_pipeline = pipeline(
    task="text2text-generation",
    model="google/flan-t5-base",
)

def generate_answer(contexts_with_pages, question, max_input_tokens=None):
    """Answer ``question`` from retrieved chunks without overflowing the model input.

    Contexts are added in the order given (best match first) until adding one
    more would push the tokenized prompt past the input budget. Previously all
    contexts were concatenated unconditionally, which produced prompts longer
    than the model's maximum sequence length.

    Args:
        contexts_with_pages: Sequence of ``(chunk_text, page)`` tuples.
        question: The user's question.
        max_input_tokens: Token budget for the whole prompt. Defaults to the
            pipeline tokenizer's ``model_max_length``.

    Returns:
        Tuple ``(answer_text, pages)`` where ``pages`` lists the page of every
        context that was actually included in the prompt.
    """
    tokenizer = qa_pipeline.tokenizer
    if max_input_tokens is None:
        max_input_tokens = tokenizer.model_max_length

    def build_prompt(selected):
        # Same prompt layout as before; only the set of contexts varies.
        context = "\n\n".join(f"[p.{p}] {c}" for c, p in selected)
        return f"Answer the question using the context.\n\nContext:\n{context}\n\nQuestion: {question}\nAnswer:"

    used = []
    for item in contexts_with_pages:
        candidate = used + [item]
        # Measure the full prompt, so the instruction and question count too.
        if len(tokenizer.encode(build_prompt(candidate))) > max_input_tokens:
            break
        used = candidate

    # truncation=True is a safety net in case the question alone is too long.
    result = qa_pipeline(build_prompt(used), max_new_tokens=200, truncation=True)[0]['generated_text']
    return result, [p for _, p in used]


Device set to use cpu


In [None]:
# 👇 Replace the question with your own
question = "What are the two sub-layers in each encoder layer of the Transformer model?"

# Fetch the best-matching chunks, then let the model answer from them
top_docs = vs.retrieve(question)
answer, source_pages = generate_answer(top_docs, question)

# Show the question, the generated answer and the de-duplicated, sorted pages
cited_pages = sorted(set(source_pages))
print(f"📌 Question: {question}")
print(f"\n✅ Answer: {answer}")
print(f"\n📄 Source Pages: {cited_pages}")


Token indices sequence length is longer than the specified maximum sequence length for this model (608 > 512). Running this sequence through the model will result in indexing errors


📌 Question: What are the two sub-layers in each encoder layer of the Transformer model?

✅ Answer: nparams is the total number of trainable parameters, nlayers is the total number of layers, dmodel is the number of units in each bottleneck layer (we always have the feedforward layer four times the size of the bottleneck layer, d= 4dmodel ), anddhead is the dimension of each [p.39] used to produce 100 billion parameter models and more recently 50 billion parameter translation models [ AJF19 ], though only a small fraction of the parameters are actually used on each forward pass.

📄 Source Pages: [9, 39, 46, 69]
