In [32]:
# Install the packages this notebook imports into the running kernel's environment.
# NOTE(review): no versions are pinned and --upgrade is set, so a later re-run may
# pull releases that differ from the ones these outputs were produced with — consider pinning.
%pip install --upgrade --quiet  langchain langchain-community langchainhub langchain-openai chromadb bs4


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [33]:
import bs4

from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

## Load

In [52]:
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Blog index page; every post is linked from here.
url = "http://sonnet.io/blog"

# Bound the wait and surface HTTP errors: without these a hung connection
# blocks the kernel, and an error page would silently parse to zero links.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# Post links on the index page carry this CSS class.
article_links = soup.find_all("a", class_="listing__post-link")
# Keep only site-relative links; .get() tolerates an anchor with no href
# instead of raising KeyError.
relative_urls = [
    link["href"] for link in article_links if link.get("href", "").startswith("/")
]

# Resolve against the index URL so the host is defined in exactly one place.
urls = [urljoin(url, relative_url) for relative_url in relative_urls]
urls

['http://sonnet.io/posts/face/',
 'http://sonnet.io/posts/hummingbirds/',
 'http://sonnet.io/posts/wip/',
 'http://sonnet.io/posts/sit/',
 'http://sonnet.io/posts/emotive-conjugation/',
 'http://sonnet.io/posts/use-rainbow/',
 'http://sonnet.io/posts/starfish/',
 'http://sonnet.io/posts/hot-air-balloon/',
 'http://sonnet.io/posts/code-sober-debug-drunk/',
 'http://sonnet.io/posts/snakes/',
 'http://sonnet.io/posts/reactive-hole/',
 'http://sonnet.io/posts/hi/',
 'http://sonnet.io/posts/ulysses/']

In [53]:
# Load every post URL, parsing only the elements whose CSS class is
# "post__content" or "post__title".
loader = WebBaseLoader(
    web_paths=urls,
    bs_kwargs={
        "parse_only": bs4.SoupStrainer(class_=("post__content", "post__title")),
    },
)
docs = loader.load()

# Quick sanity check: how many documents came back, and what the first looks like.
print(f"{len(docs)} documents loaded")
print(docs[0])


13 documents loaded
page_content="***I remember the first time I drew a face. It looked like this:I was sitting on the slope of a hill overlooking my house, together with my older cousin. It was a late summer afternoon: still warm, with the grass still golden, but slowly turning red, the air smelling like honey and beeswax. He drew first. I followed.That's how I draw faces now:I also remember the first letter I ever wrote! It looked like this:I drew it on a wooden plank with a flat red carpenter's pencil. The shape of the lead and the texture of the wood made it hard to draw curves. You had to go in straight lines and press just a little bit so the lead didn't sink into the pulp.I drew it in the room where all of us slept and watched TV. The light was crisp (with the shadows freshly chiseled) and the air smelled of spring and tobacco. My dad wrote first. Then I followed. This is how I write now:Hello! says the impostor syndrome (nihilism kicks in): I'm none of the things I wanted to be

## Split

In [54]:
# Break the loaded documents into chunks (chunk_size=1000) that overlap their
# neighbours (chunk_overlap=200).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)

## Store

In [55]:

# Build a Chroma vector store from the split chunks, using OpenAIEmbeddings
# as the embedding function.
# NOTE(review): OpenAIEmbeddings() is constructed with no arguments, so it
# presumably picks up the API key from the environment — confirm before running.
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

## Retrieve

In [56]:
# Expose the vector store as a retriever (default settings); the chain built
# in the Generate section uses it to fetch the relevant snippets of the blog.
retriever = vectorstore.as_retriever()

## Generate

In [57]:
# Prompt template fetched from the LangChain Hub under the name "rlm/rag-prompt".
prompt = hub.pull("rlm/rag-prompt")

# Chat model used to generate the answer; temperature is fixed at 0.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0,
)


def format_docs(docs):
    """Concatenate the ``page_content`` of each document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        A single string with the contents joined by ``"\\n\\n"``; an empty
        string when ``docs`` is empty.
    """
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)


# RAG pipeline, read top to bottom:
#   1. The input is fanned out into a dict: "question" is the input passed
#      through unchanged; "context" is the retriever's results joined into
#      one string by format_docs.
#   2. That dict is piped into the prompt.
#   3. The prompt output is piped into the chat model.
#   4. StrOutputParser turns the model's reply into a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [69]:
# Run the chain on a single question string; the cell displays the returned answer.
# NOTE(review): the recorded answer is 12 words long, so the "under 10 words"
# instruction in the question was not honored by the model.
rag_chain.invoke("Why make toys? Respond under 10 words.")

'Toys serve various purposes, such as improving cognitive skills and managing stress.'

## Cleanup

In [26]:
# Delete the collection backing `vectorstore`.
# NOTE(review): the retriever and rag_chain are built on this store, so they
# presumably stop working until the Store cell is re-run — confirm.
vectorstore.delete_collection()