In [1]:
%pip install --quiet --upgrade langchain-text-splitters langchain-community langgraph

Note: you may need to restart the kernel to use updated packages.


In [7]:

import os

os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_API_KEY"] = ""


In [8]:
pip install -qU "langchain[openai]"

Note: you may need to restart the kernel to use updated packages.


In [10]:
import getpass
import os
from langchain.embeddings import OpenAIEmbeddings

if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

from langchain.chat_models import init_chat_model

llm = init_chat_model("gpt-4o-mini", model_provider="openai")
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")



  embeddings = OpenAIEmbeddings(model="text-embedding-3-large")


In [3]:
pip install -qU langchain-openai

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp312-cp312-macosx_14_0_arm64.whl.metadata (5.0 kB)
Downloading faiss_cpu-1.11.0.post1-cp312-cp312-macosx_14_0_arm64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0.post1
Note: you may need to restart the kernel to use updated packages.


In [11]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

embedding_dim = len(embeddings.embed_query("hello world"))
index = faiss.IndexFlatL2(embedding_dim)

vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [49]:

# === Ensure pypdf is installed ===
# Run this cell first to install the PDF parsing dependency:
!pip install pypdf

import os, requests
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain_core.documents import Document
from langgraph.graph import START, StateGraph
from typing_extensions import TypedDict, List

# 1. Download the research PDF
url = "https://arxiv.org/pdf/2507.13334.pdf"
response = requests.get(url)
pdf_file = "agent_research_paper.pdf"
with open(pdf_file, "wb") as f:
    f.write(response.content)

# 2. Load PDF with PyPDFLoader
loader = PyPDFLoader(pdf_file)
docs = loader.load()

# 3. Split into semantically coherent chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=150)
all_splits = text_splitter.split_documents(docs)

# 4. Build FAISS vector store with OpenAI embeddings
os.environ["OPENAI_API_KEY"] = ""  # replace with your key
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vector_store = FAISS.from_documents(all_splits, embeddings)

# 5. Initialize the LLM and strict RAG prompt
llm = ChatOpenAI(model_name="gpt-4o")
prompt = ChatPromptTemplate.from_messages([
    (
        "system",
        "You are a research assistant. ONLY use the provided CONTEXT to answer the QUESTION. "
        "If the answer cannot be found in the CONTEXT, reply: 'I could not find the answer in the document.'"
    ),
    ("human", "CONTEXT:\n{context}\n\nQUESTION: {question}")
])

# 6. Define the LangGraph state
class State(TypedDict):
    question: str
    context: List[Document]
    answer: str

# 7. Retrieval node
def retrieve(state: State):
    docs = vector_store.similarity_search(state["question"], k=6)
    return {"context": docs}

# 8. Generation node (with context logging)
def generate(state: State):
    print("\n--- Retrieved Context Chunks ---\n")
    for i, doc in enumerate(state["context"]):
        print(f"[Chunk {i+1}]\n{doc.page_content[:300]}...\n---\n")
    context_text = "\n\n".join(doc.page_content for doc in state["context"])
    messages = prompt.invoke({"question": state["question"], "context": context_text})
    response = llm.invoke(messages)
    return {"answer": response.content}

# 9. Compile and test the LangGraph
graph_builder = StateGraph(State).add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

# 10. Example query
input_state = {"question": "What is the title of this research paper?"}
result = graph.invoke(input_state)
print("\n--- Answer ---\n", result["answer"])




--- Retrieved Context Chunks ---

[Chunk 1]
& Challenges (§7)
Foundational
Research (§7.1)
e.g.,Theoretical Foundations [1132], Scaling Laws [731], O(n2) Computational Challenges [295],
Multi-modal Integration [476], Compositional Understanding [835], Context Optimization [663],
Frameworks for Multi-agent Coordination [128], Information-theor...
---

[Chunk 2]
[906] Alex Roxin and Stefano Fusi. Efficient partitioning of memory systems and its importance for memory
consolidation. PLoS Comput. Biol., 2013.
[907] Kaushik Roy, Yuxin Zi, Vignesh Narayanan, Manas Gaur, and Amit P. Sheth. Knowledge-infused self
attention transformers, arXiv preprint arXiv:2306.1...
---

[Chunk 3]
[562] L. Krupp, Daniel Geissler, P. Lukowicz, and Jakob Karolus. Towards sustainable web agents: A plea
for transparency and dedicated metrics for energy consumption, arXiv preprint arXiv:2502.17903,
2025. URLhttps://arxiv.org/abs/2502.17903v1.
[563] M. Kuhail, Jose Berengueres, Fatma Taher, Sana Z....
---

[Chunk 4

In [50]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# 1. Check embedding dimensions for a few sample texts
sample_texts = [
    "The agent uses a planning loop for decision making.",
    "This is an unrelated sentence about bananas."
]
# embed_documents returns a list of vectors
sample_vecs = embeddings.embed_documents(sample_texts)
print("Number of vectors:", len(sample_vecs))
print("Dim of each vector:", len(sample_vecs[0]))  # e.g. 1536

# 2. Check L2 norms (if normalized, these should be ~1.0)
for i, vec in enumerate(sample_vecs):
    norm = np.linalg.norm(vec)
    print(f"Vector {i+1} L2 norm: {norm:.4f}")

# 3. Cosine similarity: similar vs. dissimilar
a = embeddings.embed_query("planning loop in agent architecture")
b = embeddings.embed_query("multi-step reasoning mechanism")
c = embeddings.embed_query("the color of a banana")

sim_ab = cosine_similarity([a], [b])[0][0]
sim_ac = cosine_similarity([a], [c])[0][0]
print(f"\nCosine(similar pair): {sim_ab:.3f}")
print(f"Cosine(dissimilar pair): {sim_ac:.3f}")

# 4. Sanity-check retrieval on your real index
query = "What dataset is used in the experiments?"
hits = vector_store.similarity_search(query, k=4)
print("\nTop 4 chunks for your query:")
for i, doc in enumerate(hits, start=1):
    snippet = doc.page_content[:200].replace("\n", " ")
    print(f"\n— Hit {i} —\n{snippet}…")

# If Hit #1–4 actually mention the dataset as in the paper, 
# your embeddings + FAISS retrieval are working correctly.


Number of vectors: 2
Dim of each vector: 1536
Vector 1 L2 norm: 1.0000
Vector 2 L2 norm: 1.0000

Cosine(similar pair): 0.298
Cosine(dissimilar pair): 0.009

Top 4 chunks for your query:

— Hit 1 —
[124] Daniel Casanueva-Morato, A. Ayuso-Martinez, J. P. Dominguez-Morales, A. Jiménez-Fernandez, and G. Jiménez-Moreno. A bio-inspired implementation of a sparse-learning spike-based hippocampus memor…

— Hit 2 —
survey of methods and datasets, arXiv preprint arXiv:2504.20119, 2025. URLhttps://arxiv. org/abs/2504.20119v2. [96] R. Breil, D. Delahaye, Laurent Lapasset, and E. Feron. Multi-agent systems to help m…

— Hit 3 —
AutoGen [1158], MetaGPT [408], CAMEL [600], CrewAI [184], Swarm Agent [808], 3S orchestrator [893], SagaLLM [128], Communication Protocols [1210], Orchestration [894], Coordination Strategies [625], A…

— Hit 4 —
Release Date Open Source Method / Model Success Rate (%) Source 2025-02 × IBM CUGA 61.7 [747] 2025-01 × OpenAI Operator 58.1 [807] 2024-08 × Jace.AI 57.1 [470] 2024

In [57]:

# === Install dependencies ===
# Run once to install PDF parsing and embedding dependencies:


import os
import requests
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain_core.documents import Document
from langgraph.graph import START, StateGraph
from typing_extensions import TypedDict, List

# === 1. Download the research paper PDF ===
url = "https://arxiv.org/pdf/2507.13334.pdf"
response = requests.get(url)
pdf_file = "agent_research_paper.pdf"
with open(pdf_file, "wb") as f:
    f.write(response.content)

# === 2. Load PDF with PyPDFLoader ===
loader = PyPDFLoader(pdf_file)
docs = loader.load()

# === 3. Split into semantically coherent chunks ===
text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=150)
all_splits = text_splitter.split_documents(docs)

# === 4. Build FAISS vector store with All-MiniLM-L6-v2 embeddings ===
# Local sentence-transformer model via HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vector_store = FAISS.from_documents(all_splits, embeddings)

# === 5. Initialize the LLM and strict RAG prompt ===
# Ensure OPENAI_API_KEY is set for ChatOpenAI
llm = ChatOpenAI(model_name="gpt-4o")
prompt = ChatPromptTemplate.from_messages([
    (
        "system",
        "You are a research assistant. ONLY use the provided CONTEXT to answer the QUESTION. "
        "If the answer cannot be found in the CONTEXT, reply: 'I could not find the answer in the document.'"
    ),
    ("human", "CONTEXT:\n{context}\n\nQUESTION: {question}")
])

# === 6. Define the LangGraph state ===
class State(TypedDict):
    question: str
    context: List[Document]
    answer: str

# === 7. Retrieval node ===
def retrieve(state: State):
    docs = vector_store.similarity_search(state["question"], k=6)
    return {"context": docs}

# === 8. Generation node with context logging ===
def generate(state: State):
    print("\n--- Retrieved Context Chunks ---\n")
    for i, doc in enumerate(state["context"]):
        print(f"[Chunk {i+1}]\n{doc.page_content[:300]}...\n---\n")
    context_text = "\n\n".join(doc.page_content for doc in state["context"])
    messages = prompt.invoke({"question": state["question"], "context": context_text})
    response = llm.invoke(messages)
    return {"answer": response.content}

# === 9. Compile and test the LangGraph ===
graph_builder = StateGraph(State).add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

# === 10. Example query ===
input_state = {"question": "What dataset is used in the experiments?"}
result = graph.invoke(input_state)
print("\n--- Answer ---\n", result["answer"])





ValueError: Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

In [4]:



import os
# Disable TensorFlow in Hugging Face Transformers to avoid tf-keras issues
os.environ["TRANSFORMERS_NO_TF"] = "1"
os.environ["USE_TF"] = "0"
# Set your OpenAI API key for ChatOpenAI
os.environ["OPENAI_API_KEY"] = ""  # Replace with your actual OpenAI key

import requests
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain_core.documents import Document
from langgraph.graph import START, StateGraph
from typing_extensions import TypedDict, List

# === 1. Download the research paper PDF ===
url = "https://arxiv.org/pdf/2507.13334.pdf"
response = requests.get(url)
pdf_file = "agent_research_paper.pdf"
with open(pdf_file, "wb") as f:
    f.write(response.content)

# === 2. Load PDF with PyPDFLoader ===
loader = PyPDFLoader(pdf_file)
docs = loader.load()

# === 3. Split into semantically coherent chunks ===
text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=150)
all_splits = text_splitter.split_documents(docs)

# === 4. Build FAISS vector store with All-MiniLM-L6-v2 embeddings ===
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vector_store = FAISS.from_documents(all_splits, embeddings)

# === 5. Initialize the LLM and strict RAG prompt ===
# ChatOpenAI will read OPENAI_API_KEY from environment
llm = ChatOpenAI(model_name="gpt-4o")
prompt = ChatPromptTemplate.from_messages([
    (
        "system",
        "You are a research assistant. ONLY use the provided CONTEXT to answer the QUESTION. "
        "If the answer cannot be found in the CONTEXT, reply: 'I could not find the answer in the document.'"
    ),
    ("human", "CONTEXT:\n{context}\n\nQUESTION: {question}")
])

# === 6. Define the LangGraph state ===
class State(TypedDict):
    question: str
    context: List[Document]
    answer: str

# === 7. Retrieval node ===
def retrieve(state: State):
    docs = vector_store.similarity_search(state["question"], k=6)
    return {"context": docs}

# === 8. Generation node with context logging ===
def generate(state: State):
    print("\n--- Retrieved Context Chunks ---\n")
    for i, doc in enumerate(state["context"]):
        print(f"[Chunk {i+1}]\n{doc.page_content[:300]}...\n---\n")
    context_text = "\n\n".join(doc.page_content for doc in state["context"])
    messages = prompt.invoke({"question": state["question"], "context": context_text})
    response = llm.invoke(messages)
    return {"answer": response.content}

# === 9. Compile and test the LangGraph ===
graph_builder = StateGraph(State).add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

# === 10. Example query ===
input_state = {"question": "What is the title of this research paper?"}
result = graph.invoke(input_state)
print("\n--- Answer ---\n", result["answer"])




--- Retrieved Context Chunks ---

[Chunk 1]
Organizing your related work using bibsonomy & llms.Conference on Human Information Interaction
and Retrieval, 2024.
[1068] D. Walton. Using argumentation schemes to find motives and intentions of a rational agent.Argument
Comput., 2020.
[1069] Hanlong Wan, Jian Zhang, Yan Chen, Weili Xu, and Fan Fe...
---

[Chunk 2]
2503.01203v1.
[275] ChrisanthaFernando, DylanBanarse, H.Michalewski, SimonOsindero, andTimRocktäschel. Prompt-
breeder: Self-referential self-improvement via prompt evolution.International Conference on Machine
Learning, 2023.
79...
---

[Chunk 3]
tool-augmented llms. 2023. URLhttps://arxiv.org/abs/2304.08244.
[613] Michelle M. Li, Ben Y. Reis, Adam Rodman, Tianxi Cai, Noa Dagan, Ran D. Balicer, Joseph Loscalzo,
Isaac S. Kohane, and M. Zitnik. One patient, many contexts: Scaling medical ai through contextual
intelligence, arXiv preprint arXiv...
---

[Chunk 4]
2023.
[794] L. Nadel, Jessica D. Payne, and W. J. Jacobs. The relatio

In [11]:

import os
# Disable TensorFlow in HF Transformers
os.environ["TRANSFORMERS_NO_TF"] = "1"
os.environ["USE_TF"] = "0"
# Set your OpenAI API key
os.environ["OPENAI_API_KEY"] = ""  # Replace with your actual key

import requests
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain_core.documents import Document
from langgraph.graph import START, StateGraph
from typing_extensions import TypedDict, List

# === 1. Download the research paper PDF ===
url = "https://arxiv.org/pdf/2507.13334.pdf"
response = requests.get(url)
pdf_file = "agent_research_paper.pdf"
with open(pdf_file, "wb") as f:
    f.write(response.content)

# === 2. Load PDF with UnstructuredPDFLoader (elements mode) ===
loader = UnstructuredPDFLoader(pdf_file, mode="elements")
docs = loader.load()
print(f"Loaded {len(docs)} document elements from PDF.")

# === 3. Split into semantically coherent chunks ===
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=150,
    separators=["\n\n", "\n", " "]
)
all_splits = text_splitter.split_documents(docs)
print(f"Split into {len(all_splits)} chunks.")

# === DEBUG: Confirm Experiments Section Extraction ===
experiment_chunks = [doc for doc in all_splits if "experiment" in doc.page_content.lower()]
print(f"Found {len(experiment_chunks)} chunks containing 'experiment'.")
for i, doc in enumerate(experiment_chunks[:3], 1):
    snippet = doc.page_content[:300].replace("\n", " ")
    print(f"--- Experiment Chunk {i} ---\n{snippet}...\n")

# === 4. Build FAISS vector store with All-MiniLM-L6-v2 embeddings ===
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vector_store = FAISS.from_documents(all_splits, embeddings)
print("FAISS index built.")

# === 5. Initialize the LLM and strict RAG prompt ===
llm = ChatOpenAI(model_name="gpt-4o")
prompt = ChatPromptTemplate.from_messages([
    (
        "system",
        "You are a research assistant. ONLY use the provided CONTEXT to answer the QUESTION. "
        "If the answer cannot be found in the CONTEXT, reply: 'I could not find the answer in the document.'"
    ),
    ("human", "CONTEXT:\n{context}\n\nQUESTION: {question}")
])

# === 6. Define the LangGraph state ===
class State(TypedDict):
    question: str
    context: List[Document]
    answer: str

# === 7. Retrieval node ===
def retrieve(state: State):
    docs = vector_store.similarity_search(state["question"], k=6)
    print(f"Retrieved {len(docs)} docs for question: {state['question']}")
    return {"context": docs}

# === 8. Generation node with context logging ===
def generate(state: State):
    print("
--- Retrieved Context Chunks ---
")
    for i, doc in enumerate(state["context"], 1):
        # get first 200 chars and replace newlines
        snippet = doc.page_content[:200].replace("
", " ")
        print(f"[Chunk {i}]
{snippet}...
---
")
    # Join full content for prompt
    context_text = "

".join(doc.page_content for doc in state["context"])
    messages = prompt.invoke({"question": state["question"], "context": context_text})
    response = llm.invoke(messages)
    return {"answer": response.content}

# === 9. Compile and test the LangGraph === Compile and test the LangGraph ===
graph_builder = StateGraph(State).add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

# === 10. Example query ===
input_state = {"question": "What dataset is used in the experiments?"}
result = graph.invoke(input_state)
print("\n--- Final Answer ---\n", result["answer"])



SyntaxError: unterminated string literal (detected at line 77) (2520075735.py, line 77)

In [7]:


import os
# Disable TensorFlow in HF Transformers
os.environ["TRANSFORMERS_NO_TF"] = "1"
os.environ["USE_TF"] = "0"
# Set your OpenAI API key
os.environ["OPENAI_API_KEY"] = ""  # Replace with your actual key

import requests
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain_core.documents import Document
from langgraph.graph import START, StateGraph
from typing_extensions import TypedDict, List

# === 1. Download the research paper PDF ===
url = "https://arxiv.org/pdf/2507.13334.pdf"
response = requests.get(url)
pdf_file = "agent_research_paper.pdf"
with open(pdf_file, "wb") as f:
    f.write(response.content)

# === 2. Load PDF with UnstructuredPDFLoader (elements mode) ===
loader = UnstructuredPDFLoader(pdf_file, mode="elements")
docs = loader.load()
print(f"Loaded {len(docs)} document elements from PDF.")

# === 3. Split into semantically coherent chunks ===
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=150,
    separators=["\n\n", "\n", " "]
)
all_splits = text_splitter.split_documents(docs)
print(f"Split into {len(all_splits)} chunks.")

# === DEBUG: Confirm Experiments Section Extraction ===
experiment_chunks = [doc for doc in all_splits if "experiment" in doc.page_content.lower()]
print(f"Found {len(experiment_chunks)} chunks containing 'experiment'.")
for i, doc in enumerate(experiment_chunks[:3], 1):
    snippet = doc.page_content[:300].replace("\n", " ")
    print(f"--- Experiment Chunk {i} ---\n{snippet}...\n")

# === 4. Build FAISS vector store with All-MiniLM-L6-v2 embeddings ===
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vector_store = FAISS.from_documents(all_splits, embeddings)
print("FAISS index built.")

# === 5. Initialize the LLM and strict RAG prompt ===
llm = ChatOpenAI(model_name="gpt-4o")
prompt = ChatPromptTemplate.from_messages([
    (
        "system",
        "You are a research assistant. ONLY use the provided CONTEXT to answer the QUESTION. "
        "If the answer cannot be found in the CONTEXT, reply: 'I could not find the answer in the document.'"
    ),
    ("human", "CONTEXT:\n{context}\n\nQUESTION: {question}")
])

# === 6. Define the LangGraph state ===
class State(TypedDict):
    question: str
    context: List[Document]
    answer: str

# === 7. Retrieval node ===
def retrieve(state: State):
    docs = vector_store.similarity_search(state["question"], k=6)
    print(f"Retrieved {len(docs)} docs for question: {state['question']}")
    return {"context": docs}

# === 8. Generation node with context logging ===
def generate(state: State):
    # Debug: indicate start of context output
    print("\n--- Retrieved Context Chunks ---\n")
    
    # Print each retrieved chunk snippet
    for i, doc in enumerate(state["context"], 1):
        # replace literal newlines in the snippet
        snippet = doc.page_content[:200].replace("\n", " ")
        print(f"[Chunk {i}]\n{snippet}...\n---\n")
    
    # Join full content for prompt with double newlines between chunks
    context_text = "\n\n".join(doc.page_content for doc in state["context"])
    
    # Invoke prompt with context and question
    messages = prompt.invoke({"question": state["question"], "context": context_text})
    response = llm.invoke(messages)
    return {"answer": response.content}

# === 9. Compile and test the LangGraph === Compile and test the LangGraph === Compile and test the LangGraph ===
graph_builder = StateGraph(State).add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

# === 10. Example query ===
input_state = {"question": "What dataset is used in the experiments?"}
result = graph.invoke(input_state)
print("\n--- Final Answer ---\n", result["answer"])



  if dtype.type == np.bool:


TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [26]:
import os
from pypdf import PdfReader
import requests
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain_core.documents import Document
from langgraph.graph import START, StateGraph
from typing_extensions import TypedDict, List

import numpy as np
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores.faiss import FAISS

# === 0. Set environment variables ===
os.environ["TRANSFORMERS_NO_TF"] = "1"
os.environ["USE_TF"] = "0"
os.environ["OPENAI_API_KEY"] = ""

# === 1. Download the PDF ===
url = "https://arxiv.org/pdf/2506.02153"
response = requests.get(url)
pdf_file = "SLM_research_paper.pdf"
with open(pdf_file, "wb") as f:
    f.write(response.content)

# === 2. Extract PDF metadata ===
reader = PdfReader(pdf_file)
meta = reader.metadata
title = meta.get('/Title', '') or ''
authors = meta.get('/Author', '') or ''

# === 3. Load PDF pages ===
loader = PyPDFLoader(pdf_file)
docs = loader.load()
print(f"Loaded {len(docs)} pages from PDF")

# === 4. Attach metadata to pages ===
for doc in docs:
    doc.metadata["title"] = title
    doc.metadata["authors"] = authors
    doc.metadata["type"] = "Text"

# === 5. Split documents ===
splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=150)
splits = splitter.split_documents(docs)
print(f"Prepared {len(splits)} chunks")

if not splits:
    raise ValueError("No document chunks found to embed. Check PDF content or splitting logic.")

# === 6. Build FAISS vector store manually with OpenAI embeddings ===
embed_model = OpenAIEmbeddings(model="text-embedding-ada-002")
texts = [doc.page_content for doc in splits]
metadatas = [doc.metadata for doc in splits]

# 6a) get raw embeddings
raw_embeddings = embed_model.embed_documents(texts)

# 6b) unwrap any dict wrappers and collect pure vectors
emb_list = []
for emb in raw_embeddings:
    if isinstance(emb, dict) and "embedding" in emb:
        emb_list.append(emb["embedding"])
    elif isinstance(emb, (list, tuple, np.ndarray)):
        emb_list.append(emb)
    else:
        raise ValueError(f"Unexpected embedding format: {type(emb)}")

# 6c) stack into a 2-D float32 array & force C‐contiguity
embeddings_np = np.stack(emb_list).astype(np.float32)
embeddings_np = np.ascontiguousarray(embeddings_np, dtype=np.float32)

# (optional) sanity‐check:
print(
    "type:", type(embeddings_np),
    "shape:", embeddings_np.shape,
    "dtype:", embeddings_np.dtype,
    "C_CONTIGUOUS:", embeddings_np.flags["C_CONTIGUOUS"]
)

# 6d) now build & add to FAISS
d = embeddings_np.shape[1]
index = faiss.IndexFlatL2(d)
index.add(embeddings_np)

# Build an in-memory docstore mapping string IDs to Document objects
doc_ids = [str(i) for i in range(len(splits))]
docstore = InMemoryDocstore({doc_id: doc for doc_id, doc in zip(doc_ids, splits)})

# Construct LangChain FAISS wrapper
vector_store = FAISS(
    index,
    docstore,
    index_to_docstore_id={i: doc_id for i, doc_id in enumerate(doc_ids)},
    embedding=embed_model
)
print("Built FAISS index manually with OpenAI embeddings and metadata")

# === 7. Setup retrieval-augmented generation ===
llm = ChatOpenAI(model_name="gpt-4o")
prompt = ChatPromptTemplate.from_messages([
    ("system",
     "You are a research assistant. ONLY use the provided CONTEXT to answer the QUESTION. "
     "If the answer cannot be found in the CONTEXT, reply: 'I could not find the answer in the document.'"),
    ("human", "CONTEXT:\n{context}\n\nQUESTION: {question}")
])

class State(TypedDict):
    question: str
    context: List[Document]
    answer: str

# Retrieval node
def retrieve(state: State):
    hits = vector_store.similarity_search(state['question'], k=6)
    return {'context': hits}

# Generation node
def generate(state: State):
    print("\n--- Retrieved Context Chunks ---\n")
    for idx, doc in enumerate(state['context'], 1):
        snippet = doc.page_content[:200].replace("\n", " ")
        print(f"[Chunk {idx}] (type={doc.metadata.get('type', '')})\n{snippet}...\n---\n")
    context_text = "\n\n".join(d.page_content for d in state['context'])
    messages = prompt.invoke({
        'question': state['question'],
        'context': context_text
    })
    resp = llm.invoke(messages)
    return {'answer': resp.content}

# === 8. Compile and run the LangGraph ===
builder = StateGraph(State).add_sequence([retrieve, generate])
builder.add_edge(START, 'retrieve')
graph = builder.compile()

# Example query
test = {'question': 'What dataset is used in the experiments?'}
out = graph.invoke(test)
print("\n--- Final Answer ---\n", out['answer'])


Loaded 17 pages from PDF
Prepared 94 chunks
type: <class 'numpy.ndarray'> shape: (94, 1536) dtype: float32 C_CONTIGUOUS: True


ValueError: input not a numpy array

In [24]:
import os
from pypdf import PdfReader
import requests
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain_core.documents import Document
from langgraph.graph import START, StateGraph
from typing_extensions import TypedDict, List

# For manual FAISS handling
import numpy as np
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores.faiss import FAISS

# === 0. Set environment variables ===
os.environ["TRANSFORMERS_NO_TF"] = "1"
os.environ["USE_TF"] = "0"
os.environ["OPENAI_API_KEY"] = ""

# === 1. Download the PDF ===
url = "https://arxiv.org/pdf/2506.02153"
response = requests.get(url)
pdf_file = "SLM_research_paper.pdf"
with open(pdf_file, "wb") as f:
    f.write(response.content)

# === 2. Extract PDF metadata ===
reader = PdfReader(pdf_file)
meta = reader.metadata
title = meta.get('/Title', '') or ''
authors = meta.get('/Author', '') or ''

# === 3. Load PDF pages ===
loader = PyPDFLoader(pdf_file)
docs = loader.load()
print(f"Loaded {len(docs)} pages from PDF")

# === 4. Attach metadata to pages ===
for doc in docs:
    doc.metadata["title"] = title
    doc.metadata["authors"] = authors
    doc.metadata["type"] = "Text"

# === 5. Split documents ===
splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=150)
splits = splitter.split_documents(docs)
print(f"Prepared {len(splits)} chunks")

if not splits:
    raise ValueError("No document chunks found to embed. Check PDF content or splitting logic.")

# === 6. Build FAISS vector store manually with OpenAI embeddings ===
embed_model = OpenAIEmbeddings(model="text-embedding-ada-002")
texts = [doc.page_content for doc in splits]
metadatas = [doc.metadata for doc in splits]
# Generate embeddings for each chunk
embeddings = embed_model.embed_documents(texts)
# Convert to numpy array of shape (n_chunks, dim)
embeddings_np = np.array(embeddings, dtype=np.float32)

# Create FAISS index (L2 distance)
d = embeddings_np.shape[1]
index = faiss.IndexFlatL2(d)
index.add(embeddings_np)

# Build an in-memory docstore mapping string IDs to Document objects
doc_ids = [str(i) for i in range(len(splits))]
docstore = InMemoryDocstore({doc_id: doc for doc_id, doc in zip(doc_ids, splits)})

# Construct LangChain FAISS wrapper
vector_store = FAISS(
    index, 
    docstore, 
    index_to_docstore_id={i: doc_id for i, doc_id in enumerate(doc_ids)},
    embedding=embed_model
)
print("Built FAISS index manually with OpenAI embeddings and metadata")

# === 7. Setup retrieval-augmented generation ===
llm = ChatOpenAI(model_name="gpt-4o")
prompt = ChatPromptTemplate.from_messages([
    ("system", 
     "You are a research assistant. ONLY use the provided CONTEXT to answer the QUESTION. "
     "If the answer cannot be found in the CONTEXT, reply: 'I could not find the answer in the document.'"),
    ("human", "CONTEXT:\n{context}\n\nQUESTION: {question}")
])

class State(TypedDict):
    question: str
    context: List[Document]
    answer: str

# Retrieval node
def retrieve(state: State):
    hits = vector_store.similarity_search(state['question'], k=6)
    return {'context': hits}

# Generation node
def generate(state: State):
    print("\n--- Retrieved Context Chunks ---\n")
    for idx, doc in enumerate(state['context'], 1):
        snippet = doc.page_content[:200].replace("\n", " ")
        print(f"[Chunk {idx}] (type={doc.metadata.get('type', '')})\n{snippet}...\n---\n")
    context_text = "\n\n".join(d.page_content for d in state['context'])
    messages = prompt.invoke({
        'question': state['question'],
        'context': context_text
    })
    resp = llm.invoke(messages)
    return {'answer': resp.content}

# === 8. Compile and run the LangGraph ===
builder = StateGraph(State).add_sequence([retrieve, generate])
builder.add_edge(START, 'retrieve')
graph = builder.compile()

# Example query
test = {'question': 'What dataset is used in the experiments?'}
out = graph.invoke(test)
print("\n--- Final Answer ---\n", out['answer'])


Loaded 17 pages from PDF
Prepared 94 chunks


ValueError: input not a numpy array

In [29]:
import os
from pypdf import PdfReader
import requests
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain_core.documents import Document
from langgraph.graph import START, StateGraph
from typing_extensions import TypedDict, List

import numpy as np
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores.faiss import FAISS

# === 0. Env vars ===
os.environ["TRANSFORMERS_NO_TF"] = "1"
os.environ["USE_TF"] = "0"
os.environ["OPENAI_API_KEY"] = ""

# === 1. Download & save PDF ===
url = "https://arxiv.org/pdf/2506.02153"
pdf_file = "SLM_research_paper.pdf"
with open(pdf_file, "wb") as f:
    f.write(requests.get(url).content)

# === 2. Read metadata & pages ===
reader = PdfReader(pdf_file)
meta = reader.metadata
title, authors = meta.get('/Title','') or '', meta.get('/Author','') or ''

loader = PyPDFLoader(pdf_file)
docs = loader.load()
print(f"Loaded {len(docs)} pages")

# === 3. Tag & split ===
for d in docs:
    d.metadata.update({"title": title, "authors": authors, "type": "Text"})

splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=150)
splits = splitter.split_documents(docs)
print(f"→ {len(splits)} text chunks")

# === 4. Embed & build FAISS ===
embed_model = OpenAIEmbeddings(model="text-embedding-ada-002")
texts     = [d.page_content for d in splits]

# 4a) raw embeddings
raw_embeddings = embed_model.embed_documents(texts)

# 4b) unwrap any dicts
emb_list = []
for i, emb in enumerate(raw_embeddings):
    if isinstance(emb, dict) and "embedding" in emb:
        emb_list.append(emb["embedding"])
    elif isinstance(emb, (list, tuple, np.ndarray)):
        emb_list.append(emb)
    else:
        raise ValueError(f"[#{i}] Unexpected embedding type: {type(emb)}")

# 4c) stack + force C-contiguity
embeddings_np = np.stack(emb_list).astype(np.float32)
embeddings_np = np.ascontiguousarray(embeddings_np, dtype=np.float32)

# 4d) DEBUG: inspect exactly what FAISS will see
print(">>> embeddings_np type:", type(embeddings_np))
print(">>> shape:", embeddings_np.shape)
print(">>> dtype:", embeddings_np.dtype)
print(">>> C_CONTIGUOUS:", embeddings_np.flags["C_CONTIGUOUS"])

# 4e) build & add to FAISS
d     = embeddings_np.shape[1]
index = faiss.IndexFlatL2(d)
index.add(embeddings_np)    # should now accept a real numpy array

# 4f) wrap in LangChain FAISS
ids     = [str(i) for i in range(len(splits))]
docstore = InMemoryDocstore({i: doc for i, doc in zip(ids, splits)})
vector_store = FAISS(
    index,
    docstore,
    index_to_docstore_id={i: id for i, id in enumerate(ids)},
    embedding=embed_model
)
print("✅ FAISS index built")

# === 5. RAG setup & test ===
llm = ChatOpenAI(model_name="gpt-4o")
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a research assistant. Use ONLY the CONTEXT to answer."),
    ("human", "CONTEXT:\n{context}\n\nQUESTION: {question}")
])
class State(TypedDict):
    question: str
    context:   List[Document]
    answer:    str

def retrieve(state: State):
    return {"context": vector_store.similarity_search(state["question"], k=6)}

def generate(state: State):
    ctx = "\n\n".join(d.page_content for d in state["context"])
    msgs = prompt.invoke({"context": ctx, "question": state["question"]})
    return {"answer": llm.invoke(msgs).content}

from langgraph.graph import START
builder = StateGraph(State).add_sequence([retrieve, generate])
builder.add_edge(START, "retrieve")
graph = builder.compile()

# Quick test
out = graph.invoke({"question": "What dataset is used in the experiments?"})
print("\nFinal Answer:\n", out["answer"])


Loaded 17 pages
→ 94 text chunks
>>> embeddings_np type: <class 'numpy.ndarray'>
>>> shape: (94, 1536)
>>> dtype: float32
>>> C_CONTIGUOUS: True


ValueError: input not a numpy array

In [30]:
import os
import requests
from pypdf import PdfReader
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain_core.documents import Document
from langchain_community.vectorstores.faiss import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langgraph.graph import START, StateGraph
from typing_extensions import TypedDict, List

# === 0. Env vars ===
os.environ["TRANSFORMERS_NO_TF"] = "1"
os.environ["USE_TF"] = "0"
os.environ["OPENAI_API_KEY"] = ""

# === 1. Download & load PDF ===
url = "https://arxiv.org/pdf/2506.02153"
pdf_file = "SLM_research_paper.pdf"
with open(pdf_file, "wb") as f:
    f.write(requests.get(url).content)

reader = PdfReader(pdf_file)
meta = reader.metadata
title, authors = meta.get('/Title','') or '', meta.get('/Author','') or ''

loader = PyPDFLoader(pdf_file)
docs = loader.load()
print(f"Loaded {len(docs)} pages")

# === 2. Tag & split into chunks ===
for d in docs:
    d.metadata.update({"title": title, "authors": authors, "type": "Text"})

splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=150)
splits = splitter.split_documents(docs)
print(f"Prepared {len(splits)} chunks")

# === 3. Embed & build FAISS via LangChain helper ===
embed_model = OpenAIEmbeddings(model="text-embedding-ada-002")

# This one‐liner does everything: encoding, numpy‐array conversion, FAISS index + docstore wiring
vector_store = FAISS.from_documents(
    documents=splits,
    embedding=embed_model,
    metadatas=[d.metadata for d in splits],
    index_factory_str="Flat"          # “Flat” = IndexFlatL2
)

print("✅ Built FAISS index via LangChain")

# === 4. Setup RAG (retrieval‐augmented generation) ===
llm = ChatOpenAI(model_name="gpt-4o")
prompt = ChatPromptTemplate.from_messages([
    ("system",
     "You are a research assistant. ONLY use the provided CONTEXT to answer. "
     "If it’s not in the CONTEXT, reply: 'I could not find the answer in the document.'"),
    ("human", "CONTEXT:\n{context}\n\nQUESTION: {question}")
])

class State(TypedDict):
    question: str
    context: List[Document]
    answer: str

def retrieve(state: State):
    hits = vector_store.similarity_search(state["question"], k=6)
    return {"context": hits}

def generate(state: State):
    # print out what we retrieved
    print("\n--- Retrieved Context Chunks ---\n")
    for i, doc in enumerate(state["context"], 1):
        snippet = doc.page_content[:200].replace("\n", " ")
        print(f"[Chunk {i}] {snippet}...\n---\n")

    context_text = "\n\n".join(d.page_content for d in state["context"])
    messages = prompt.invoke({
        "context": context_text,
        "question": state["question"]
    })
    answer = llm.invoke(messages).content
    return {"answer": answer}

# === 5. Wire up the LangGraph and run a test query ===
builder = StateGraph(State).add_sequence([retrieve, generate])
builder.add_edge(START, "retrieve")
graph = builder.compile()

out = graph.invoke({"question": "What dataset is used in the experiments?"})
print("\n--- Final Answer ---\n", out["answer"])


Loaded 17 pages
Prepared 94 chunks


TypeError: langchain_community.vectorstores.faiss.FAISS.from_texts() got multiple values for keyword argument 'metadatas'

In [31]:
import os
import requests
from pypdf import PdfReader
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain_core.documents import Document
from langchain_community.vectorstores.faiss import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langgraph.graph import START, StateGraph
from typing_extensions import TypedDict, List

# === Setup ===
os.environ["TRANSFORMERS_NO_TF"] = "1"
os.environ["USE_TF"] = "0"
os.environ["OPENAI_API_KEY"] = ""

# 1. Download & load PDF
url = "https://arxiv.org/pdf/2506.02153"
pdf_file = "SLM_research_paper.pdf"
with open(pdf_file, "wb") as f:
    f.write(requests.get(url).content)

reader = PdfReader(pdf_file)
meta = reader.metadata
title, authors = meta.get('/Title','') or '', meta.get('/Author','') or ''

loader = PyPDFLoader(pdf_file)
docs = loader.load()
print(f"Loaded {len(docs)} pages")

# 2. Tag & split
for d in docs:
    d.metadata.update({"title": title, "authors": authors, "type": "Text"})

splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=150)
splits = splitter.split_documents(docs)
print(f"Prepared {len(splits)} chunks")

# 3. Build FAISS via from_texts
embed_model = OpenAIEmbeddings(model="text-embedding-ada-002")

vector_store = FAISS.from_texts(
    texts=[doc.page_content for doc in splits],
    embedding=embed_model,
    metadatas=[doc.metadata for doc in splits],
    ids=[str(i) for i in range(len(splits))],
    normalize_L2=False,
    distance_strategy=None,        # omit or set e.g. DistanceStrategy.EUCLIDEAN_DISTANCE
)
print("✅ Built FAISS index via from_texts")

# 4. RAG setup + test (same as before)…
llm = ChatOpenAI(model_name="gpt-4o")
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a research assistant. ONLY use the CONTEXT."),
    ("human", "CONTEXT:\n{context}\n\nQUESTION: {question}")
])
class State(TypedDict):
    question: str
    context:   List[Document]
    answer:    str

def retrieve(state: State):
    return {"context": vector_store.similarity_search(state["question"], k=6)}

def generate(state: State):
    ctx = "\n\n".join(d.page_content for d in state["context"])
    msgs = prompt.invoke({"context": ctx, "question": state["question"]})
    return {"answer": llm.invoke(msgs).content}

builder = StateGraph(State).add_sequence([retrieve, generate])
builder.add_edge(START, "retrieve")
graph = builder.compile()

out = graph.invoke({"question": "What dataset is used in the experiments?"})
print("\n--- Final Answer ---\n", out["answer"])


Loaded 17 pages
Prepared 94 chunks


ValueError: input not a numpy array