# RAG with FAISS - Proper Metadata Extraction

This notebook builds a FAISS-backed RAG pipeline over a research paper and adds a dedicated metadata document so that queries about the paper's authors, title, and publication year are answered correctly.

In [None]:
# Install required packages
!pip install -q python-dotenv langchain langchain-openai langchain-community faiss-cpu pypdf requests langgraph

In [None]:
import os
import re
import requests
from dotenv import load_dotenv
from pypdf import PdfReader
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langgraph.graph import START, StateGraph
from typing_extensions import TypedDict, List

# Load environment variables from .env file
load_dotenv()

# Verify API keys are loaded (only their presence is reported, never their values)
has_openai_key = "OPENAI_API_KEY" in os.environ
has_langsmith_key = "LANGSMITH_API_KEY" in os.environ
print("OPENAI_API_KEY loaded:", has_openai_key)
print("LANGSMITH_API_KEY loaded:", has_langsmith_key)

# Enable LangSmith tracing only when a LangSmith key is available; turning it on
# without credentials cannot upload any traces.
if has_langsmith_key:
    os.environ["LANGSMITH_TRACING"] = "true"
else:
    print("LangSmith tracing left disabled (no LANGSMITH_API_KEY found)")

In [None]:
# Download the research paper PDF
url = "https://arxiv.org/pdf/2507.13334.pdf"
pdf_file = "agent_research_paper.pdf"

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error (404, 429, 5xx, ...) into an exception
# instead of letting the error page be saved as if it were the paper.
response = requests.get(url, timeout=60)
response.raise_for_status()

# Check the PDF signature before writing, so a non-PDF body (e.g. an HTML
# challenge page returned with status 200) never ends up on disk as the "PDF".
if b"%PDF-" not in response.content[:1024]:
    raise ValueError(
        f"Expected a PDF from {url}, got Content-Type "
        f"{response.headers.get('Content-Type')!r}"
    )

with open(pdf_file, "wb") as f:
    f.write(response.content)
print(f"Downloaded PDF: {pdf_file}")

In [None]:
# Extract metadata from PDF
reader = PdfReader(pdf_file)
if len(reader.pages) == 0:
    # Fail with a clear message instead of an IndexError on pages[0]
    raise ValueError(f"{pdf_file} contains no pages; cannot read the first page")

# Extract text from first page (contains title, authors, date)
# `or ""` keeps the value a string even if the extractor yields nothing.
first_page_text = reader.pages[0].extract_text() or ""
if not first_page_text.strip():
    print(f"Warning: no extractable text on the first page of {pdf_file}")

# Create a metadata document with the known information.
# NOTE: every field in this literal is hand-entered for the paper at `url`;
# nothing here is parsed from the PDF, so it must be edited by hand if a
# different paper is downloaded. The "PAPER METADATA" heading is the marker
# that the system prompt and `retrieve` look for. The literal ends with an
# end-of-metadata marker followed by two newlines (the `\n\n` escapes).
metadata_content = """PAPER METADATA:
Title: A Survey of Context Engineering for Large Language Models

Authors: Lingrui Mei, Jiayu Yao, Yuyao Ge, Yiwei Wang, Baolong Bi, Yujun Cai, 
Jiazhi Liu, Mingyu Li, Zhong-Zhi Li, Duzhen Zhang, Chenlin Zhou, Jiayi Mao, 
Tianze Xia, Jiafeng Guo, Shenghua Liu

Institutions: Institute of Computing Technology (Chinese Academy of Sciences), 
University of California Merced, The University of Queensland, Peking University, 
Tsinghua University, University of Chinese Academy of Sciences

Publication Date: July 17, 2025
ArXiv ID: arXiv:2507.13334v1 [cs.CL]
Submission Date: 17 Jul 2025

Keywords: Context Engineering, Large Language Models, LLM Agent, Multi-Agent Systems

Abstract: Context Engineering is a formal discipline that transcends simple prompt design 
to encompass the systematic optimization of information payloads for LLMs.

--- END OF METADATA ---\n\n"""

# Append the raw first-page text (capped at 3000 characters) under its own
# heading, after the metadata block
metadata_content = "".join(
    [metadata_content, "FIRST PAGE CONTENT:\n", first_page_text[:3000]]
)

print("Extracted metadata successfully")

In [None]:
# Load all pages and create documents
loader = PyPDFLoader(pdf_file)
docs = loader.load()
print(f"Loaded {len(docs)} pages from PDF")

# Create special metadata document; the `type` tag distinguishes it from
# ordinary page chunks when retrieved documents are printed.
metadata_doc = Document(
    page_content=metadata_content,
    metadata={"source": pdf_file, "page": "metadata", "type": "paper_metadata"}
)

# Split the page documents into overlapping chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=150,
    separators=["\n\n", "\n", ". ", " ", ""]
)
content_splits = text_splitter.split_documents(docs)

# Add the metadata document once, ahead of the content chunks. Its position in
# the list has no effect on similarity search, and a second copy would only be
# returned twice for the same query, spending two context slots on identical text.
all_splits = [metadata_doc, *content_splits]

print(f"Total chunks: {len(all_splits)} (including metadata)")

In [None]:
# Embed every chunk with OpenAI embeddings and index the vectors with FAISS
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vector_store = FAISS.from_documents(all_splits, embeddings)
print("FAISS vector store created successfully")

In [None]:
# Initialize LLM and prompt
llm = ChatOpenAI(model="gpt-4o-mini")

# System instructions: answer from the supplied context only, and look at the
# 'PAPER METADATA' section for title / author / date questions.
system_message = (
    "You are a research assistant analyzing an academic paper. "
    "Use the provided CONTEXT to answer questions accurately. "
    "Pay special attention to sections marked as 'PAPER METADATA' for questions about "
    "title, authors, publication date, etc. "
    "For publication year questions, look for 'Publication Date' or 'Submission Date' in the metadata. "
    "If the answer is in the context, provide it. If not, say you cannot find it."
)

# Human turn: the retrieved context followed by the user's question
human_message = "CONTEXT:\n{context}\n\nQUESTION: {question}"

prompt = ChatPromptTemplate.from_messages(
    [("system", system_message), ("human", human_message)]
)

In [None]:
# Define RAG pipeline
class State(TypedDict):
    """State dictionary passed between the nodes of the RAG graph."""
    # The user's question; read by both `retrieve` and `generate`
    question: str
    # Documents returned by `retrieve` and consumed by `generate`
    context: List[Document]
    # Text of the LLM response, returned by `generate`
    answer: str

# Metadata terms must start at a word boundary. A plain substring test also
# fires on "update"/"validate" (date), "whenever" (when) or "entitled" (title).
_METADATA_QUESTION_RE = re.compile(
    r"\b(?:(?:author|title|year|published|wrote|date)\w*|when)\b"
)


def retrieve(state: State):
    """Enhanced retrieval that prioritizes metadata for certain questions.

    Questions containing a metadata term (author, title, year, published,
    wrote, when, date) are answered from a fixed metadata-oriented query, and
    any returned chunk carrying the "PAPER METADATA" marker is moved to the
    front. All other questions are searched with the question text itself.

    Args:
        state: Pipeline state; only ``state["question"]`` is read.

    Returns:
        ``{"context": docs}`` with at most 8 documents.
    """
    question_lower = state["question"].lower()

    if _METADATA_QUESTION_RE.search(question_lower):
        # Search specifically for metadata (the question text is not used here)
        candidates = vector_store.similarity_search(
            "PAPER METADATA authors title publication date", k=15
        )

        # Put metadata documents first, keeping a single copy of each distinct
        # text so an index holding duplicates does not repeat it in the prompt.
        metadata_docs = []
        other_docs = []
        seen_metadata = set()
        for doc in candidates:
            if "PAPER METADATA" in doc.page_content:
                if doc.page_content not in seen_metadata:
                    seen_metadata.add(doc.page_content)
                    metadata_docs.append(doc)
            else:
                other_docs.append(doc)
        docs = metadata_docs + other_docs[:5]  # Ensure metadata docs come first
    else:
        docs = vector_store.similarity_search(state["question"], k=6)

    return {"context": docs[:8]}

def generate(state: State):
    """Answer the question with the LLM, using the retrieved chunks as context.

    Prints a 300-character preview of every chunk in ``state["context"]``,
    joins the full chunk texts with blank lines, fills the prompt and returns
    the model's reply as ``{"answer": ...}``.
    """
    retrieved = state["context"]

    print("\n--- Retrieved Context Chunks ---\n")
    for number, chunk in enumerate(retrieved, start=1):
        # Flatten newlines so each preview stays on one line
        preview = chunk.page_content[:300].replace("\n", " ")
        kind = chunk.metadata.get('type', 'content')
        print(f"[Chunk {number} - Type: {kind}]\n{preview}...\n---\n")

    context_text = "\n\n".join([chunk.page_content for chunk in retrieved])
    llm_input = prompt.invoke(
        {"question": state["question"], "context": context_text}
    )
    reply = llm.invoke(llm_input)
    return {"answer": reply.content}

# Build the graph: retrieve -> generate, entered at `retrieve`
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")

graph = graph_builder.compile()
print("RAG pipeline ready")

In [None]:
# Test with key questions
test_questions = [
    "What is the title of this paper?",
    "Who are the authors of this paper?",
    "In which year was this paper published?",
    "When was this paper submitted?",
    "What institutions are the authors from?",
    "What are the main keywords of this paper?",
]

# Banner line drawn above and below each question
separator = "=" * 60

for question in test_questions:
    print(f"\n{separator}\nQuestion: {question}\n{separator}")

    result = graph.invoke({"question": question})
    print(f"\nAnswer: {result['answer']}")

In [None]:
# Interactive query
try:
    user_question = input("Enter your question about the document: ").strip()
except (EOFError, NotImplementedError):
    # No stdin available: a closed stdin raises EOFError, and a Jupyter frontend
    # without input support is expected to raise IPython's
    # StdinNotImplementedError, a NotImplementedError subclass. Skip the query
    # so an unattended "Run All" does not abort here.
    user_question = ""

if user_question:
    result = graph.invoke({"question": user_question})
    print(f"\nQuestion: {user_question}")
    print(f"\nAnswer: {result['answer']}")
else:
    # Do not spend an LLM call on a blank question
    print("No question entered; skipping interactive query.")