In [None]:
# Obtain the API credential through the project-local `utils` helper rather than
# hardcoding it in the notebook.
# NOTE(review): API_KEY is not referenced by any other cell visible here;
# presumably load_api_key() also exports the key to the environment so the
# model clients can pick it up — confirm against utils.py.
from utils import load_api_key
API_KEY = load_api_key()

In [None]:
# Third-party packages imported by this notebook. Uncomment to install them;
# %pip (rather than !pip) installs into the environment of the running kernel.
# %pip install -q langchain langchain_community langchain_chroma langchain_ollama beautifulsoup4
# %pip install -q langchain-openai langgraph pandas tabulate

In [None]:
# Choose the model provider. Every later cell relies on the two names defined
# here: `llm` (chat model) and `embeddings` (embedding model).
model_family = "openai"
# model_family = "ollama"

# The provider package is imported inside its branch so that only the selected
# provider has to be installed.
if model_family == "ollama":
    from langchain_ollama import ChatOllama, OllamaEmbeddings
    llm = ChatOllama(model="llama3.2")
    embeddings = OllamaEmbeddings(model="nomic-embed-text")
elif model_family == "openai":
    from langchain_openai import ChatOpenAI, OpenAIEmbeddings
    llm = ChatOpenAI(model="gpt-4o-mini")
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
else:
    # Fail right here instead of with a NameError on `llm` several cells later.
    raise ValueError(
        f"Unsupported model_family {model_family!r}; expected 'ollama' or 'openai'."
    )

In [None]:
import bs4
from langchain.tools.retriever import create_retriever_tool
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader, TextLoader
#from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter

# RAG Agent with Web-Based Information

In [None]:
# Alternative source (disabled): read a local .txt file instead of a web page.
# document_loader = TextLoader("data.txt", encoding="utf-8")
# documents = document_loader.load()

# Web source: download the blog post and parse only the elements whose CSS
# class is one of the three names below.
post_filter = bs4.SoupStrainer(class_=("post-content", "post-title", "post-header"))
loader = WebBaseLoader(
    web_paths=["https://lilianweng.github.io/posts/2023-06-23-agent/"],
    bs_kwargs={"parse_only": post_filter},
)
docs = loader.load()

In [None]:
# Preview the first 600 characters of the loaded article to check what the
# loader captured. The slice starts at 0 so the first character is not dropped.
print(docs[0].page_content[:600])

In [None]:
# Split the loaded article into chunks of up to 1000 characters that overlap by
# 200, so each embedded chunk is bounded in size.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
split_docs = text_splitter.split_documents(docs)

# Chroma.from_documents adds to whatever is already persisted in the directory,
# so re-running this cell used to store every chunk again (the database had to
# be deleted by hand first). Dropping the existing collection before indexing
# makes the cell safe to re-run.
blog_persist_dir = "./../../data/4_rag_agent_llm_chroma"
Chroma(embedding_function=embeddings, persist_directory=blog_persist_dir).delete_collection()
vectordb = Chroma.from_documents(documents=split_docs, embedding=embeddings, persist_directory=blog_persist_dir)

In [None]:
# Debug helpers (disabled): fetch the stored embeddings and look at the first
# vector, then at its length.
# NOTE(review): both lines reach into the private `_collection` attribute of the
# vector store — confirm it is still available in the installed version.
#vectordb._collection.get(include=['embeddings'])["embeddings"][0]
#len(vectordb._collection.get(include=['embeddings'])["embeddings"][0])

In [None]:
# Similarity-search retriever over the blog vector store, configured with k=3.
retriever = vectordb.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=3),
)
retriever  # last expression of the cell: show the configured retriever

In [None]:
# Wrap the blog retriever as a named tool; the description is the text the
# agent sees when deciding whether to call it.
tool_blog = create_retriever_tool(
    retriever,
    "blog_article_retriever",
    "Searches and returns excerpts about LLM Agents from the blog article",
)
tools = [tool_blog]

In [None]:
from langgraph.prebuilt import create_react_agent

# Baseline agent: its only tool is the blog-article retriever.
# The tool list is written out instead of reading the notebook-level `tools`
# variable because `tools` is rebound further down to also contain the Lumon
# retriever; re-running this cell after that point would otherwise silently
# give the baseline agent the Lumon tool and invalidate the later comparison.
agent_executor = create_react_agent(
    llm,
    tools=[tool_blog],
)

In [None]:
# Ask the blog-only agent a question about the article, then print every
# message contained in the returned state.
response = agent_executor.invoke(
    {"messages": "¿Qué es un agente LLM? Dame la respuesta en español"}
)
for step in response["messages"]:
    step.pretty_print()

# Añadir datos desde archivo local

In [None]:
# Put a Lumon question to the blog-only agent: at this point it has no tool
# that searches the Lumon documentation. Print every returned message.
response = agent_executor.invoke(
    {"messages": "¿Cuántos días de baja por enfermedad tienen derecho los empleados de Lumon Industries anualmente?"}
)
for step in response["messages"]:
    step.pretty_print()

In [None]:
# Load the local Lumon text file and chunk it with the splitter configured for
# the blog article.
document_loader = TextLoader("../../data/lumon_data.txt", encoding="utf-8")
docs_txt = document_loader.load()

# NOTE: `split_docs` and `vectordb` are rebound here to the Lumon data. The
# `retriever` built earlier keeps pointing at the blog store, but re-running
# that earlier retriever cell after this one would pick up the Lumon store.
split_docs = text_splitter.split_documents(docs_txt)

# Chroma.from_documents adds to whatever is already persisted in the directory,
# so drop the existing collection first; otherwise every re-run of this cell
# stores the same chunks again.
lumon_persist_dir = "./../../data/4_rag_agent_lumon_chroma"
Chroma(embedding_function=embeddings, persist_directory=lumon_persist_dir).delete_collection()
vectordb = Chroma.from_documents(documents=split_docs, embedding=embeddings, persist_directory=lumon_persist_dir)

In [None]:
# Similarity-search retriever (k=3) over the vector store currently bound to
# `vectordb`, which the preceding cell built from the Lumon document.
# The bare `retriever_lumon` expression that used to follow was removed: it was
# not the last expression of the cell, so it never displayed anything.
retriever_lumon = vectordb.as_retriever(search_type="similarity", search_kwargs={"k": 3})

# Wrap the Lumon retriever as a second tool; the description is the text the
# agent sees when deciding whether to call it.
tool_lumon = create_retriever_tool(
    retriever=retriever_lumon,
    name="Lumon_documentation",
    description="Searches and returns excerpts from the Lumon Industries internal documentation",
)
# Tool list for the RAG agent: blog retriever plus Lumon retriever.
tools = [tool_blog, tool_lumon]

In [None]:
from langgraph.prebuilt import create_react_agent

# Second agent, built from the current `tools` list (blog + Lumon retrievers).
agent_executor_rag = create_react_agent(llm, tools=tools)

In [None]:
# Same Lumon sick-leave question as before, now sent to the agent that also has
# the Lumon documentation tool.
lumon_question = "¿Cuántos días de baja por enfermedad tienen derecho los empleados de Lumon Industries anualmente?"
response = agent_executor_rag.invoke({"messages": lumon_question})
# Printing of the full message list is disabled for this run:
#for message in response['messages']:
#    message.pretty_print()

In [None]:
# Content of the last message in the returned list (displayed as the cell output).
response["messages"][-1].content

In [None]:
# Questions about Lumon Industries that are put to both agents.
queries = [
    "¿Cuántos días de baja por enfermedad tienen derecho los empleados de Lumon Industries anualmente?",
    "¿Cuál es la contraseña de la red de invitados de Lumon Industries?",
    "¿Cuánto dura la garantía de los productos de Lumon Industries?",
    "¿A quién debo contactar si tengo problemas con los productos de Lumon Industries?",
    "¿Cuál es el horario de atention al cliente de Lumon Industries? Devuelve solamente el dato, nada más.".replace("atention", "atención"),
]

# For each question, collect the last message of each agent's run: first the
# agent with the Lumon tool, then the blog-only agent. One dict per question.
results = []
for query in queries:
    rag_state = agent_executor_rag.invoke({"messages": query})
    rag_response = rag_state["messages"][-1].content

    baseline_state = agent_executor.invoke({"messages": query})
    gpt4o_response = baseline_state["messages"][-1].content

    row = {
        "Query": query,
        "RAG Response": rag_response,
        "GPT-4o Response": gpt4o_response,
    }
    results.append(row)

In [None]:
import textwrap
import pandas as pd
from tabulate import tabulate

# One row per query, one column per key of the collected result dicts.
df = pd.DataFrame(data=results)

def wrap_dataframe_text(df: pd.DataFrame, width: int = 40) -> pd.DataFrame:
    """Return a copy of ``df`` whose text cells are wrapped to ``width`` characters.

    Every ``str`` value found in an object-dtype or string-dtype column is
    re-flowed with ``textwrap.wrap`` and its lines are joined with ``"\\n"``.
    Non-string values in those columns, and all other columns, are returned
    unchanged. The input frame is not modified.

    Args:
        df: Frame to process.
        width: Maximum line length passed to ``textwrap.wrap``.

    Returns:
        A new DataFrame with the same columns and index as ``df``.
    """
    wrapped_df = df.copy()
    # "string" is requested alongside "object" because text columns are not
    # always object-dtype: a column created with dtype="string" (pandas'
    # dedicated string dtype) is not matched by "object" alone and would be
    # left unwrapped.
    for col in wrapped_df.select_dtypes(include=["object", "string"]).columns:
        wrapped_df[col] = wrapped_df[col].apply(
            lambda cell: "\n".join(textwrap.wrap(cell, width)) if isinstance(cell, str) else cell
        )
    return wrapped_df

# Wrap text cells to 30 characters so the grid stays readable.
df_wrapped = wrap_dataframe_text(df, width=30)

# Render the wrapped frame as a grid table, using the column names as headers.
comparison_table = tabulate(df_wrapped, headers="keys", tablefmt="grid")
print(comparison_table)

In [None]:
# Ask the two-tool agent the original blog question, then print every message
# contained in the returned state.
response = agent_executor_rag.invoke(
    {"messages": "¿Qué es un agente LLM? Dame la respuesta en español"}
)
for step in response["messages"]:
    step.pretty_print()

Proceso similar para Excel, CSV, PDF, etc.