https://python.langchain.com/docs/tutorials/rag/

In [1]:
import os
from dotenv import load_dotenv
from langchain_google_vertexai import VertexAIEmbeddings
from langchain_google_community import BigQueryVectorStore
import csv
from langchain_community.document_loaders import WebBaseLoader
import bs4
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chat_models import init_chat_model
from langchain_core.tools import tool
from langgraph.graph import MessagesState, StateGraph, END
from langgraph.prebuilt import ToolNode, tools_condition
from IPython.display import Image
from langchain_core.messages import SystemMessage

In [2]:
load_dotenv()
assert "LANGSMITH_TRACING" in os.environ, "Please set the LANGSMITH_TRACING environment variable."
assert "LANGSMITH_API_KEY" in os.environ, "Please set the LANGSMITH_API_KEY environment variable."
assert "PROJECT_ID" in os.environ, "Please set the PROJECT_ID environment variable."
assert "LOCATION" in os.environ, "Please set the LOCATION environment variable."
assert "DATASET" in os.environ, "Please set the DATASET environment variable."
assert "TABLE" in os.environ, "Please set the TABLE environment variable."
PROJECT_ID = os.getenv("PROJECT_ID") 
LOCATION = os.getenv("LOCATION") 
DATASET = os.getenv("DATASET") 
TABLE = os.getenv("TABLE") 

In [3]:
PROJECT_ID = "llm-studies"
LOCATION = "us-central1"
DATASET = "blog_embeddings"
TABLE = "rag_embeddings_s"

In [4]:
embeddings = VertexAIEmbeddings(model="textembedding-gecko@latest")

In [5]:
vector_store = BigQueryVectorStore(
    project_id=PROJECT_ID,
    dataset_name=DATASET,
    table_name=TABLE,
    location=LOCATION,
    embedding=embeddings,
)

BigQuery table llm-studies.blog_embeddings.rag_embeddings_s initialized/validated as persistent storage. Access via BigQuery console:
 https://console.cloud.google.com/bigquery?project=llm-studies&ws=!1m5!1m4!4m3!1sllm-studies!2sblog_embeddings!3srag_embeddings_s


In [43]:
url_list = []
with open('.//..//..//files//20250207-posts_list-red.csv', 'r') as csvfile:
    spamreader = csv.reader(csvfile)
    for row in spamreader:
        url = str(row[0]).replace("http","https")
        url_list.append(url)
print(url_list[0])

https://www.reflexoesdofilosofo.blog.br/2025/02/o-problema-de-parmenides.html


In [10]:
urls = [
    "https://www.reflexoesdofilosofo.blog.br/2024/08/a-terceira-margem-do-rio.html",
#    "https://www.reflexoesdofilosofo.blog.br/2024/09/uma-teoria-da-mente.html",
#    "https://www.reflexoesdofilosofo.blog.br/2025/02/o-problema-de-parmenides.html",
]
print(urls[0])

https://www.reflexoesdofilosofo.blog.br/2024/08/a-terceira-margem-do-rio.html


In [11]:
loader = WebBaseLoader(
    web_paths=(urls),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post hentry uncustomized-post-template", "post-title entry-title", "post-header")
        )
    ),
)
docs = loader.load()
len(docs)

1

In [12]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
all_splits = text_splitter.split_documents(docs)
len(all_splits)

13

In [14]:
_ = vector_store.add_documents(documents=all_splits)

In [15]:
llm = init_chat_model("gemini-2.0-flash-001", model_provider="google_vertexai")

In [16]:
@tool(response_format="content_and_artifact")
def retrieve(query: str):
    """Retrieve information related to a query."""
    retrieved_docs = vector_store.similarity_search(query, k=2)
    serialized = "\n\n".join(
        (f"Source: {doc.metadata}\n" f"Content: {doc.page_content}")
        for doc in retrieved_docs
    )
    return serialized, retrieved_docs

In [17]:
# Step 1: Generate an AIMessage that may include a tool-call to be sent.
def query_or_respond(state: MessagesState):
    """Generate tool call for retrieval or respond."""
    llm_with_tools = llm.bind_tools([retrieve])
    response = llm_with_tools.invoke(state["messages"])
    # MessagesState appends messages to state instead of overwriting
    return {"messages": [response]}

In [18]:
# Step 3: Generate a response using the retrieved content.
def generate(state: MessagesState):
    """Generate answer."""
    # Get generated ToolMessages
    recent_tool_messages = []
    for message in reversed(state["messages"]):
        if message.type == "tool":
            recent_tool_messages.append(message)
        else:
            break
    tool_messages = recent_tool_messages[::-1]

    # Format into prompt
    docs_content = "\n\n".join(doc.content for doc in tool_messages)
    system_message_content = (
        "You are an assistant for question-answering tasks. "
        "Use the following pieces of retrieved context to answer "
        "the question. If you don't know the answer, say that you "
        "don't know. Use three sentences maximum and keep the "
        "answer concise."
        "\n\n"
        f"{docs_content}"
    )
    conversation_messages = [
        message
        for message in state["messages"]
        if message.type in ("human", "system")
        or (message.type == "ai" and not message.tool_calls)
    ]
    prompt = [SystemMessage(system_message_content)] + conversation_messages

    # Run
    response = llm.invoke(prompt)
    return {"messages": [response]}

In [19]:
tools = ToolNode([retrieve])

In [20]:
graph_builder = StateGraph(MessagesState)
graph_builder.add_node(query_or_respond)
graph_builder.add_node(tools)
graph_builder.add_node(generate)

graph_builder.set_entry_point("query_or_respond")
graph_builder.add_conditional_edges(
    "query_or_respond",
    tools_condition,
    {END: END, "tools": "tools"},
)
graph_builder.add_edge("tools", "generate")
graph_builder.add_edge("generate", END)

graph = graph_builder.compile()

In [21]:
input_message = "me fale sobre a mente"

for step in graph.stream(
    {"messages": [{"role": "user", "content": input_message}, {"role": "system", "content": "Você é um tutor de filosofia e deve trazer as respostas baseadas na teorias encontradas nos textos, sejam de autores ou termos"}]},
    stream_mode="values",
):
    step["messages"][-1].pretty_print()


Você é um tutor de filosofia e deve trazer as respostas baseadas na teorias encontradas nos textos, sejam de autores ou termos
Tool Calls:
  retrieve (f94c8c2b-8438-43ec-ae06-ae86477ba666)
 Call ID: f94c8c2b-8438-43ec-ae06-ae86477ba666
  Args:
    query: mente na filosofia
Name: retrieve

Source: {'doc_id': '31d1c41043134682beff20f04797b1f8', 'source': 'https://www.reflexoesdofilosofo.blog.br/2024/08/a-terceira-margem-do-rio.html', 'score': 0.6674564372885906}
Content: [i] Com base na Introdução de A
mente segundo Dennett, de, João de Fernandes Teixeira. São Paulo: Editora
Perspectiva, 2008.


[ii] “Ocorre
que a posição cética, ao duvidar das afirmações e das coisas, pode colocar
nossa existência em risco. Ora, como podemos viver duvidando de tudo? A
resposta cética parece ser a de uma atitude filosófica: aceitamos as coisas da
vida ordinária e vivemos nos baseando nela, porém dentro de uma atitude
filosófica mantemos a dúvida.”. Em https://www.reflexoesdofilosofo.blog.br/2024/04/pesq

In [12]:
# Define prompt for question-answering
prompt = hub.pull("rlm/rag-prompt")