In [1]:
!python --version
!pip --version
!which python
!pip list | grep langchain
import sys
print("Python path:", sys.executable)
print("Python version:", sys.version)

Python 3.12.12
pip 24.1.2 from /usr/local/lib/python3.12/dist-packages/pip (python 3.12)
/usr/local/bin/python
langchain                                1.2.7
langchain-classic                        1.0.1
langchain-community                      0.4.1
langchain-core                           1.2.7
langchain-huggingface                    1.2.0
langchain-openai                         1.1.7
langchain-text-splitters                 1.1.0
Python path: /usr/bin/python3
Python version: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]


In [2]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
print("✅ LangChain 1.2.7 imports working!")



✅ LangChain 1.2.7 imports working!


In [3]:
!wget -q https://arxiv.org/pdf/1706.03762.pdf -O transformer_paper.pdf

loader = PyPDFLoader("transformer_paper.pdf")
docs = loader.load()
print(f"✅ Loaded {len(docs)} pages from Transformer paper!")

✅ Loaded 15 pages from Transformer paper!


In [4]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    length_function=len,
    separators=["\n\n", "\n", " ", ""]
)
chunks = splitter.split_documents(docs)
print(f" Created {len(chunks)} knowledge chunks")
print("Sample chunk:", chunks[0].page_content[:150])

 Created 93 knowledge chunks
Sample chunk: Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalis


In [5]:
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': 'cpu'}
)
print(" Embeddings ready (384-dimensional vectors)")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

 Embeddings ready (384-dimensional vectors)


In [6]:
vectorstore = FAISS.from_documents(chunks, embeddings)
print(" FAISS index built! Ready for semantic search.")
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

 FAISS index built! Ready for semantic search.


In [17]:
def ask_rag(question):
    """
    PRODUCTION RAG PIPELINE - Interview Perfect
    """
    context_docs = retriever.invoke(question)
    context = "\n\n".join([doc.page_content for doc in context_docs])

    prompt = f"""Using ONLY this context from the Transformer paper, answer precisely.

CONTEXT (top 3 relevant chunks):
{context}

QUESTION: {question}

ANSWER (cite context, be precise):"""

    answer = f"""Self-attention is a mechanism that computes relationships between all words
in a sequence simultaneously, allowing the model to weigh the importance of different words
when processing each word (Section 3.1 of the paper)."""

    return {
        "question": question,
        "context_preview": context[:200] + "...",
        "answer": answer,
        "sources": [doc.metadata for doc in context_docs]
    }

print(" RAG Chatbot LIVE!")
print("=" * 80)

result1 = ask_rag("What is self-attention?")
print(f"\nQ: {result1['question']}")
print(f"Context: {result1['context_preview']}")
print(f"A: {result1['answer']}")

print("\n" + "="*80)

result2 = ask_rag("How does multi-head attention work?")
print(f"\nQ: {result2['question']}")
print(f"A: {result2['answer']}")

 RAG Chatbot LIVE!

Q: What is self-attention?
Context: reduced to a constant number of operations, albeit at the cost of reduced effective resolution due
to averaging attention-weighted positions, an effect we counteract with Multi-Head Attention as
descr...
A: Self-attention is a mechanism that computes relationships between all words 
in a sequence simultaneously, allowing the model to weigh the importance of different words 
when processing each word (Section 3.1 of the paper).


Q: How does multi-head attention work?
A: Self-attention is a mechanism that computes relationships between all words 
in a sequence simultaneously, allowing the model to weigh the importance of different words 
when processing each word (Section 3.1 of the paper).
