### Query web data

In [2]:
%pip install bs4

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting beautifulsoup4 (from bs4)
  Downloading beautifulsoup4-4.13.3-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4->bs4)
  Downloading soupsieve-2.6-py3-none-any.whl.metadata (4.6 kB)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Downloading beautifulsoup4-4.13.3-py3-none-any.whl (186 kB)
Downloading soupsieve-2.6-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4, bs4
Successfully installed beautifulsoup4-4.13.3 bs4-0.0.2 soupsieve-2.6
Note: you may need to restart the kernel to use updated packages.


## Import Libraries and API keys

In [3]:
import bs4
from langchain import hub
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from dotenv import load_dotenv

# Load API keys from .env file 
load_dotenv(override=True)

USER_AGENT environment variable not set, consider setting it to identify your requests.


True

## Step 1.00 Load news article content, split into chunks, and index them.

In [None]:
# Load news article content, split into chunks, and index them.

url = "https://www.bbc.com/news/business-68092814"
loader = WebBaseLoader(
    web_paths=(url,),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            "div",
            attrs={"class": ["article-body fs-article fs-premium fs-responsive-text current-article font-body color-body bg-base font-accent article-subtype__masthead",
                             "header-content-container masthead-header__container"]},
        )
    ),
)
docs = loader.load()

docs = loader.load()
print(f"Number of documents: {len(docs)}")
docs[0].page_content[:500]

Number of documents: 1


''

In [33]:
# Load the contents of the news article, split it into chunks, and index it.
loader = WebBaseLoader(
    web_paths=("https://www.bbc.com/news/business-68092814",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            "main",
            attrs={"id": ["main-content"]},
        )
    ),
)
docs = loader.load()
print(f"Number of documents: {len(docs)}")
docs[0].page_content[:500]

Number of documents: 1


'Could AI \'trading bots\' transform the world of investing?1 February 2024ShareSaveJonty BloomBusiness reporterShareSaveGetty ImagesIt is hard for both humans and computers to predict stock market movementsSearch for "AI investing" online, and you\'ll be flooded with endless offers to let artificial intelligence manage your money.I recently spent half an hour finding out what so-called AI "trading bots" could apparently do with my investments.Many prominently suggest that they can give me lucrative'

In [34]:
print(docs[0].page_content[:1000])
print(docs[0].metadata)
print(docs[0].metadata["source"])


Could AI 'trading bots' transform the world of investing?1 February 2024ShareSaveJonty BloomBusiness reporterShareSaveGetty ImagesIt is hard for both humans and computers to predict stock market movementsSearch for "AI investing" online, and you'll be flooded with endless offers to let artificial intelligence manage your money.I recently spent half an hour finding out what so-called AI "trading bots" could apparently do with my investments.Many prominently suggest that they can give me lucrative returns. Yet as every reputable financial firm warns - your capital may be at risk.Or putting it more simply - you could lose your money - whether it is a human or a computer that is making stock market decisions on your behalf.Yet such has been the hype about the ability of AI over the past few years, that almost one in three investors would be happy to let a trading bot make all the decisions for them, according to one 2023 survey in the US.John Allan says investors should be more cautious ab

In [24]:
# Step 1: Load Documents
# Load the contents of news articles, split them into chunks, and index them.
loader = WebBaseLoader(
    web_paths=("https://www.bbc.com/news/business-68092814",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            "main",
            attrs={"id": ["main-content"]},
        )
    ),
)
docs = loader.load()


# Step 2: Split Documents
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)

splits = text_splitter.split_documents(docs)

# Step 3: Embedding & Create Vectorstore
vectorstore = FAISS.from_documents(documents=splits, embedding=OpenAIEmbeddings())

# Step 4: retriever
# Retrieve and generate information contained in the news.
retriever = vectorstore.as_retriever()

# Step 5: Create Prompt
prompt = hub.pull("rlm/rag-prompt")

# Step 6: Create LLM
# Generate the language model (LLM).
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)


def format_docs(docs):
    # Combine the retrieved document results into a single paragraph.
    return "\n\n".join(doc.page_content for doc in docs)


# Create Chain
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Step 8: Run Chain
# Input queries about the documents and output answers.
question = "Why did OpenAI and Scarlett Johansson have a conflict?"
response = rag_chain.invoke(question)

# output the results.
print(f"URL: {url}")
print(f"Number of documents: {len(docs)}")
print("===" * 20)
print(f"[HUMAN]\n{question}\n")
print(f"[AI]\n{response}")

URL: https://www.forbes.com/sites/rashishrivastava/2024/05/21/the-prompt-scarlett-johansson-vs-openai/
Number of documents: 1
[HUMAN]
Why did OpenAI and Scarlett Johansson have a conflict?

[AI]
The provided context does not mention any specific conflict between OpenAI and Scarlett Johansson. Therefore, I don't know the answer to the question.
