In [6]:
import getpass
import os
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI

# Configuration cell: environment variables and the shared chat model.

# Enable LangChain tracing for the calls made in this notebook.
os.environ["LANGCHAIN_TRACING_V2"] = "true"

# Load variables from a local .env file, if one exists.
load_dotenv()

# The web loader used in the next cell logs
# "USER_AGENT environment variable not set ..." when this variable is missing.
# setdefault keeps any value already provided by the shell or the .env file;
# replace the fallback with a string that identifies you or your project.
os.environ.setdefault("USER_AGENT", "langchain-rag-notebook")

# Prompt interactively for any credential that is still missing, so keys are
# never hard-coded in the notebook.
if "OPENAI_API_KEY" not in os.environ:
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")
if "LANGCHAIN_API_KEY" not in os.environ:
    os.environ["LANGCHAIN_API_KEY"] = getpass.getpass("Enter your LangChain API key: ")

# Chat model shared by the rest of the notebook.
llm = ChatOpenAI(model="gpt-4o-mini")

In [None]:
# 1: Load: load data with document loaders (using WebBaseLoader, which uses BeautifulSoup)
import bs4
from langchain_community.document_loaders import WebBaseLoader

# Parse only the elements carrying the post's title, header and content classes.
bs4_strainer = bs4.SoupStrainer(
    class_=("post-title", "post-header", "post-content"),
)

# One-page loader; the strainer is handed over through bs_kwargs so that only
# the selected HTML elements are loaded.
loader = WebBaseLoader(
    bs_kwargs=dict(parse_only=bs4_strainer),
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
)

# Fetch the page and turn it into documents.
docs = loader.load()

# Sanity check: number of documents, character count of the first one, and a
# 500-character preview of its text.
print(
    "loaded", len(docs),
    "document with length", len(docs[0].page_content),
)
print(docs[0].page_content[:500])


USER_AGENT environment variable not set, consider setting it to identify your requests.


loaded 1 document with length 43047


      LLM Powered Autonomous Agents
    
Date: June 23, 2023  |  Estimated Reading Time: 31 min  |  Author: Lilian Weng


Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.
Agent System Overview#
In


In [None]:
# 2: Split: split documents into smaller chunks.
#    Smaller chunks fit into the model's context window and are easier to
#    search over and pass to the LLM; they are also the units that get
#    embedded and stored in the vector store.

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter settings:
#   chunk_size      - size of each chunk
#   chunk_overlap   - amount shared between neighbouring chunks, so information
#                     that straddles a boundary keeps its context
#   add_start_index - record each chunk's start index in its metadata
text_splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    chunk_overlap=200,
    chunk_size=1000,
)

# Break the loaded documents into chunks.
all_splits = text_splitter.split_documents(docs)

# Sanity check: number of chunks and character count of the first chunk,
# followed by the metadata of one chunk further into the document.
print(
    "split into", len(all_splits),
    "chunks with length", len(all_splits[0].page_content),
)
print(all_splits[50].metadata)

split into 63 chunks with length 969
{'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/', 'start_index': 34990}
