In [None]:
import requests
import os
import dotenv
import json
import re
from bs4 import BeautifulSoup

dotenv.load_dotenv()

In [None]:
# Search request parameters. The API key and engine id come from the
# GSE_API_KEY / GSE_CX environment variables (presumably a Google
# Programmable Search setup — confirm); the remaining fields restrict results
# to gl="br", lr="lang_pt" and cap the result count at 5.
payload = dict(
    key=os.getenv("GSE_API_KEY"),
    cx=os.getenv("GSE_CX"),
    gl="br",
    lr="lang_pt",
    q="google news enchentes rio grande do sul clicrbs",
    num=5,
)

### Scrape the Google News RSS feed for the latest news

In [131]:
# Query the Google News RSS search endpoint for flood news in Rio Grande do Sul.
# The query goes through `params` so that requests URL-encodes the spaces;
# writing "%25" between the words would send a literal percent sign instead.
r = requests.get(
    "https://news.google.com/rss/search",
    params={
        "q": "enchentes rio grande do sul",
        "ceid": "BR:pt-419",
        "hl": "pt-BR",
        "gl": "BR",
    },
    timeout=10,  # do not let an unresponsive server hang the notebook
)

In [144]:
# given the response, we can parse the XML and get the items
from xml.etree import ElementTree
from datetime import datetime

def parse_date(date:str) -> str:
    """Convert an RSS-style date string to ``dd/mm/YYYY``.

    The first substring shaped like ``"Mon, 27 May 2024"`` (abbreviated weekday,
    two-digit day, abbreviated month, four-digit year) is extracted; anything
    after it, such as the time and timezone, is ignored.

    Args:
        date: Text containing the date, e.g. ``"Mon, 27 May 2024 14:30:00 GMT"``.

    Returns:
        The date formatted as ``"27/05/2024"``.

    Raises:
        ValueError: If no such date substring is present, or if the weekday or
            month abbreviation cannot be parsed by ``datetime.strptime``.
    """
    match = re.search(r"[a-zA-Z]{3},\s\d\d\s[a-zA-Z]{3}\s\d{4}", date)
    if match is None:
        raise ValueError(f"no date like 'Mon, 27 May 2024' found in {date!r}")
    # %a / %b are parsed using the current locale's abbreviations.
    parsed = datetime.strptime(match.group(0), "%a, %d %b %Y")
    return parsed.strftime("%d/%m/%Y")

def scrap_rss_news_feed(rss_feed:requests.Response) -> list[dict]:
    """Turn an RSS feed response into a list of news records.

    The response body is decoded as UTF-8, stripped of newline characters and
    parsed as XML. Every ``./channel/item`` element yields one dict.

    Args:
        rss_feed: HTTP response whose body is the RSS XML document.

    Returns:
        One dict per item with the keys ``"title"``, ``"link"``,
        ``"description"`` (element texts) and ``"pubDate"`` (the element text
        passed through ``parse_date``).

    Raises:
        AttributeError: If an item lacks one of the four child elements.
    """
    xml_text = rss_feed.content.decode("utf-8").replace("\n", "")
    channel_items = ElementTree.fromstring(xml_text).findall("./channel/item")
    return [
        {
            "title": entry.find("title").text,
            "link": entry.find("link").text,
            "description": entry.find("description").text,
            "pubDate": parse_date(entry.find("pubDate").text),
        }
        for entry in channel_items
    ]

### Read each news article and extract the text

In [None]:
def remove_isolated_special_chars(text):
    """Delete special characters that are not attached to an ASCII letter or digit.

    A "special character" is anything that is neither a word character nor
    whitespace. It is removed when it is not immediately preceded by
    ``[a-zA-Z0-9]``, or when it follows another special character and is not
    immediately followed by ``[a-zA-Z0-9]``. A special character directly after
    an ASCII letter or digit (e.g. the comma in ``"hello, world"``) is kept.

    Args:
        text: The string to clean.

    Returns:
        The string with the matching characters removed; surrounding whitespace
        is left untouched.
    """
    isolated_symbol = re.compile(
        r'(?<![a-zA-Z0-9])[^\w\s]|(?<=[^\w\s])[^\w\s](?![a-zA-Z0-9])'
    )
    return isolated_symbol.sub('', text)

In [None]:
def get_news_content(scrapped_news_sources:list[dict], timeout:float = 10) -> list[str]:
    """Download each news article and collect its longer text paragraphs.

    Args:
        scrapped_news_sources: Records holding the article URL under ``"link"``.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        The cleaned text of every ``<p>`` element, across all articles, that
        contains more than 10 whitespace-separated words.

    Raises:
        requests.RequestException: If a download fails or times out.
    """
    paragraphs_extracted = []
    for news_source in scrapped_news_sources:
        response = requests.get(news_source["link"], timeout=timeout)
        # Pass the raw bytes so BeautifulSoup detects the page encoding itself;
        # forcing UTF-8 raises UnicodeDecodeError on pages in other encodings.
        soup = BeautifulSoup(response.content, "html.parser")
        # filter to only collect paragraphs
        for p in soup.find_all("p"):
            p_text = remove_isolated_special_chars(p.text.strip())
            # split() with no argument collapses runs of whitespace, so the
            # gaps left by removed characters are not counted as extra words.
            if len(p_text.split()) > 10:
                paragraphs_extracted.append(p_text)
    return paragraphs_extracted

#### Reading with HTML2Text

In [None]:
import html2text

# Converter configured to drop links, images, emphasis markers and tables so
# that mostly plain article text remains.
h = html2text.HTML2Text()
h.ignore_links = True
h.ignore_images = True
h.ignore_emphasis = True
h.ignore_tables = True
h.ignore_anchors = True
h.ignore_backrefs = True

# Fetch the HTML explicitly: `hmtl` only exists as a local variable inside
# get_news_content, so referencing it here fails on a fresh kernel.
news_items = scrap_rss_news_feed(r)
article_html = requests.get(news_items[0]["link"], timeout=10).text  # first feed entry as sample

print(h.handle(article_html))

### Try filtering the data by the context

In [None]:
from langchain_community.llms import HuggingFaceHub
from langchain_community.chat_models.huggingface import ChatHuggingFace


# The Hugging Face token is read from the environment (e.g. the .env file
# loaded by dotenv) so that no credential is stored in the notebook. Any token
# that was ever committed in this cell should be revoked.
llm = HuggingFaceHub(
    huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
    repo_id="HuggingFaceH4/zephyr-7b-beta",
    task="text-generation",
    model_kwargs={
        "max_new_tokens": 512,
        "top_k": 30,
        "temperature": 0.0001,
        "repetition_penalty": 1.03,
    },
)

# Chat wrapper around the text-generation endpoint.
chat_model = ChatHuggingFace(llm=llm)

In [None]:
# Smoke test: send one plain-text prompt through the chat model.
# NOTE(review): a per-call temperature is passed here; whether it overrides
# the model_kwargs temperature is not visible in this notebook — confirm.
chat_model.invoke("Can you tell me a joke?", temperature=0.01)

In [None]:
from langchain.prompts import ChatPromptTemplate

# System message: frames the model as a reviewer that filters flood news.
context_definition = "You are a news reviewer. You have been tasked to filter news about floods in Rio Grande do Sul and its associated impacts. Your task is filter the context for the user."

# Human message: the instruction followed by a hard-coded sample context.
# NOTE(review): this is an f-string without placeholders, and the text is then
# used as a prompt template — scraped text containing braces would need
# escaping if it were interpolated here.
user_prompt = f"""
The context below is a news article about floods in Rio Grande do Sul coming from a scraped news website. Please filter the context to only include the paragraphs related to the floods in Rio Grande do Sul and its impacts.

<<CONTEXT>>
The floods in Rio Grande do Sul have caused significant damage to the region. The floods have affected many people and have caused a lot of destruction. The floods have also caused many people to lose their homes and have caused a lot of damage to the infrastructure of the region.  
"""

# Two-message chat prompt: system framing plus the user request.
template = ChatPromptTemplate.from_messages([
            ("system", context_definition),
            ("human", user_prompt),
        ])

# No template variables are supplied, so the messages are rendered as written.
messages = template.format_messages()

# Send the rendered messages to the chat model with a per-call temperature.
response = chat_model.invoke(messages, temperature=0.0001)

In [None]:
# Show only the third "</s>"-delimited segment of the returned text.
# NOTE(review): presumably the first two segments are the echoed system and
# user turns — confirm. Raises IndexError when fewer than three segments exist.
response.content.split("</s>")[2]

### Use ChromaDB to store the chunks

In [None]:
import chromadb
from chromadb.utils import embedding_functions

# In-memory Chroma client; its contents live only as long as the kernel.
chroma_client = chromadb.Client()

# Sentence-transformers model used to embed both documents and queries.
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="multi-qa-mpnet-base-cos-v1")

# get_or_create_collection keeps this cell re-runnable: create_collection
# raises when "rs_floods_qa" already exists in the running client.
qa_collection = chroma_client.get_or_create_collection(
    name="rs_floods_qa",
    embedding_function=sentence_transformer_ef,
    metadata={"hnsw:space": "cosine"}  # rank neighbours by cosine distance
)

In [130]:
# Ask the collection for the two stored chunks nearest to the question.
# NOTE(review): no visible cell adds documents to qa_collection, so the
# results depend on state created elsewhere — confirm before Run All.
qa_collection.query(query_texts="Qual foi o número de mortos pelas enchentes no mês de julho?", n_results=2)

{'ids': [['id2', 'id0']],
 'distances': [[0.35033607482910156, 0.355965793132782]],
 'metadatas': [[None, None]],
 'embeddings': None,
 'documents': [['Ao todo, 478 municípios foram atingidos e a população afetada chega a 2.398.255 pessoas. Os municípios com o maior número de mortes até agora são Canoas 31 óbitos) Roca Sales 13) e Cruzeiro do Sul 12)',
   'Aumentou para 176 o número de mortes confirmadas no Rio Grande do Sul em decorrência das enchentes que atingiram o estado. Um corpo não identificado foi encontrado em Venâncio Aires, na região do Vale do Rio Pardo, no centro do estado.']],
 'uris': None,
 'data': None}

In [1]:
import dateparser
from dateparser.search import search_dates

import spacy
# NOTE(review): `nlp` is not used in the visible cells, and the model name
# suggests an English pipeline while the text below is Portuguese — confirm
# that it is needed.
nlp = spacy.load("en_core_web_sm")

# Not the last expression of the cell, so this result is not displayed.
dateparser.parse("July of 2021")
# Search a Portuguese sentence for date expressions; the cell displays the result.
search_dates("Eu me mudei em Maio do ano passado. Ela se mudou em outubro.", languages=["pt"])

[('em Maio', datetime.datetime(2024, 5, 27, 0, 0)),
 ('ano passado', datetime.datetime(2023, 5, 27, 0, 0)),
 ('em outubro', datetime.datetime(2024, 10, 27, 0, 0))]