In [1]:
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain.schema import Document
from dotenv import load_dotenv
import requests
import openai 
import shutil
import os

os.environ['OPENAI_API_KEY'] = 'key'
CHROMA_PATH = "chroma_news"

In [2]:
api_key = 'key'
api_url = f'https://newsapi.org/v2/everything?q=technology&sortBy=publishedAt&apiKey={api_key}'

def fetch_news_articles(api_url):
    response = requests.get(api_url)
    response.raise_for_status()
    return response.json()['articles']

articles = fetch_news_articles(api_url)

In [3]:
len(articles[15]['description'])

160

In [4]:
documents = [Document(page_content=article['content'], metadata={'source': article['url'], 'title': article['title']}) for article in articles if article['content']]


In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=50,
    length_function=len,
    add_start_index=True,
)
chunks = text_splitter.split_documents(documents)
print(f'Split {len(documents)} documents into {len(chunks)} chunks.')

Split 100 documents into 199 chunks.


In [6]:
document = chunks[10]
print(document.page_content)
print(document.metadata)

railway safety and the data-related market, … [+4595 chars]
{'source': 'https://www.marketscreener.com/quote/stock/RAIL-VISION-LTD-135899306/news/Rail-Vision-Successfully-Installs-AI-Driven-System-for-Class-1-USA-Operator-46939340/', 'title': 'Rail Vision Successfully Installs AI-Driven System for Class 1 USA Operator', 'start_index': 155}


In [7]:
# Create a new DB from the documents.
embedding_function = OpenAIEmbeddings()
db = Chroma.from_documents(
    chunks, embedding_function, persist_directory=CHROMA_PATH
)

print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")   

Saved 199 chunks to chroma_news.


In [8]:
# Prepare the DB.
db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

In [19]:
query_text = 'Are there news about any country?'

In [20]:
# Search the DB.
results = db.similarity_search_with_relevance_scores(query_text, k=3)

In [21]:
if len(results) == 0 or results[0][1] < 0.7:
    print(f'Unable to find matching results.')    

In [22]:
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

In [23]:
context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text, question=query_text)

print(prompt)

model = ChatOpenAI()
response_text = model.invoke(prompt)

sources = [doc.metadata.get("source", None) for doc, _score in results]
formatted_response = f"Response: {response_text}\nSources: {sources}"
print(formatted_response)


Human: 
Answer the question based only on the following context:

News HomeMentioned in this article

---

News HomeMentioned in this article

---

Every morning, find a selection of analysts' recommendations covering North America and the UK. The list only includes companies that … [+236 chars]

---

Answer the question based on the above context: Are there news about any country?

Response: content='Based on the context provided, there are news mentioned about North America and the UK.' response_metadata={'token_usage': {'completion_tokens': 17, 'prompt_tokens': 85, 'total_tokens': 102}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None} id='run-86f6cd42-218a-406a-8446-15a5c8cc5611-0' usage_metadata={'input_tokens': 85, 'output_tokens': 17, 'total_tokens': 102}
Sources: ['https://www.investorsobserver.com/news/qm-pr/4589254380217143', 'https://www.investorsobserver.com/news/qm-pr/4589254380217143', 'https://www.marketscreener.com/new