#### Environment


In [29]:
import os

# Set the two LangChain tracing variables for this process in a single call.
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_ENDPOINT": "https://api.smith.langchain.com",
    }
)

###### API Keys


In [30]:
# Both keys are expected to already be present in the process environment.
# Copying a variable onto itself is a no-op when it is set, and when it is not
# set `os.getenv` returns None, which `os.environ` rejects with an opaque
# TypeError. Check explicitly instead and fail fast with a message that names
# the missing variable(s). Empty values are treated as missing.
_missing_keys = [
    key for key in ("OPENAI_API_KEY", "LANGCHAIN_API_KEY") if not os.getenv(key)
]
if _missing_keys:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_keys)
        + ". Set them in the environment before running this notebook."
    )

In [None]:
import tiktoken
from langchain_core.documents import Document


def count_tokens(text: str, encoding_name: str = "cl100k_base") -> int:
    """Return how many tokens ``text`` encodes to under a tiktoken encoding.

    Args:
        text: The string to tokenize.
        encoding_name: Name of the tiktoken encoding to look up.

    Returns:
        The length of the token sequence produced for ``text``.
    """
    encoder = tiktoken.get_encoding(encoding_name)
    token_ids = encoder.encode(text)
    return len(token_ids)


def format_documents(docs: list[Document]) -> str:
    """Join the ``page_content`` of every document into a single string.

    Args:
        docs: Documents to combine, in the order they should appear.

    Returns:
        The page contents separated by a blank line; an empty string when
        ``docs`` is empty.
    """
    page_texts = [document.page_content for document in docs]
    return "\n\n".join(page_texts)

1. Overview (a quick pass through steps 2–4)
2. Indexing
3. Retrieval
4. Generation

In [34]:
# imports
from langchain import hub
from langchain.prompts import ChatPromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

In [27]:
# Question put to the indexed paper; the retrieval and generation cells below reuse it.
query = "What is Attention is all you need?"

### Indexing

In [None]:
# Read the paper from disk with the PDF loader.
pdf_loader = PyPDFLoader("../documents/NIPS-2017-attention-is-all-you-need-Paper.pdf")
docs = pdf_loader.load()

# Cut the loaded documents into chunks of up to 1500 characters, overlapping by 200.
splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
split_docs = splitter.split_documents(docs)

# Measure the size of the whole corpus by tokenizing the concatenated chunks.
combined = format_documents(split_docs)
tokens = count_tokens(combined)

print(f"Total Tokens: {tokens}")

Total Tokens: 8952


### Retrieval

In [28]:
# Embed every chunk with OpenAI embeddings and index the vectors in Chroma.
vector_store = Chroma.from_documents(documents=split_docs, embedding=OpenAIEmbeddings())
# k=1: ask the retriever for a single chunk per query.
retriever = vector_store.as_retriever(search_kwargs={"k": 1})

# Retrievers are Runnables, so call `invoke` just like the chains in this
# notebook; `get_relevant_documents` is deprecated in current LangChain releases.
# NOTE: this rebinds `docs` (previously the loaded PDF pages) to the retrieved chunks.
docs = retriever.invoke(query)

print(len(docs))

1


### Generation

In [32]:
# Pull the "rlm/rag-prompt" prompt from the LangChain hub and print it for inspection.
# The chains further down are built from a locally defined template, not this object.
prompt_from_hub = hub.pull("rlm/rag-prompt")
print(prompt_from_hub)

input_variables=['context', 'question'] input_types={} partial_variables={} metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'} messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})]


In [35]:
# Prompt text with two placeholders, {context} and {question}, assembled line by line.
template = (
    "Answer the question based on the following context:\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
)

# Wrap the raw text in a chat prompt template and show the resulting object.
prompt = ChatPromptTemplate.from_template(template)
print(prompt)

input_variables=['context', 'question'] input_types={} partial_variables={} messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Answer the question based on the following context:\n{context}\n\nQuestion: {question}\n'), additional_kwargs={})]


In [37]:
# Chat model used by the chains below: OpenAI "gpt-5", requested with temperature=0.
llm = ChatOpenAI(model="gpt-5", temperature=0)

In [36]:
# Minimal chain: pipe the filled-in prompt straight into the chat model
# (no retriever and no output parser in this version).
chain = prompt | llm

In [39]:
# Supply the retrieved chunks as plain text via `format_documents`, the same
# helper the retriever-backed chain uses. Passing the raw list of Document
# objects would interpolate the list's repr (metadata included) into the prompt.
res = chain.invoke({"context": format_documents(docs), "question": query})
# `chain` ends at the chat model, so print the text held in the result's `content`.
print(res.content)

“Attention Is All You Need” is a 2017 NeurIPS paper by Vaswani et al. that introduced the Transformer: an encoder–decoder neural network built entirely on attention (no recurrence or convolutions). It showed superior machine translation quality, greater parallelism, and faster training than prior RNN/CNN-based models, achieving state-of-the-art BLEU scores on WMT14 English–German and English–French.


In [40]:
# Inputs for the prompt: "context" is the retriever piped into format_documents,
# "question" is the incoming value passed through unchanged.
rag_inputs = {
    "context": retriever | format_documents,
    "question": RunnablePassthrough(),
}

# Full pipeline: build the inputs, fill the prompt, call the model, parse to a string.
rag_chain = rag_inputs | prompt | llm | StrOutputParser()

response = rag_chain.invoke(query)

print(response)

“Attention Is All You Need” is a 2017 research paper by Vaswani et al. that introduces the Transformer, a sequence-to-sequence neural network architecture based entirely on attention (no recurrence or convolutions). It showed superior translation quality, better parallelism, and faster training, achieving state-of-the-art BLEU scores on WMT’14 English–German and English–French tasks.
