In [None]:
import requests
import os
import dotenv
import json
import re
from bs4 import BeautifulSoup
import uuid

dotenv.load_dotenv()

In [None]:
# Query parameters whose credentials come from environment variables
# (dotenv.load_dotenv() is called in the first cell).
# NOTE(review): key/cx/gl/lr/num look like Google Custom Search JSON API
# parameters — confirm. `payload` is not used by any cell visible in this notebook.
payload = {
    "key": os.getenv("GSE_API_KEY"),
    "cx": os.getenv("GSE_CX"),
    "gl": "br",
    "lr": "lang_pt",
    "q": "google news enchentes rio grande do sul clicrbs",
    "num": 5
}

### Scrape the Google News RSS feed for the latest news

In [None]:
# Google News RSS search endpoint and the locale parameters shared by every query.
GOOGLE_NEWS_RSS_URL = "https://news.google.com/rss/search"
GOOGLE_NEWS_LOCALE_PARAMS = {"ceid": "BR:pt-419", "hl": "pt-BR", "gl": "BR"}
RSS_TIMEOUT_SECONDS = 10


def fetch_google_news_rss(query: str) -> requests.Response:
    """Request the Google News RSS search feed for ``query``.

    The query is passed through ``params`` so that requests URL-encodes it.
    The previous hand-written URLs separated words with ``%25``, which is the
    encoding of a literal percent sign rather than of a space, so the search
    terms were sent as ``word%word%word``.

    Args:
        query: Plain-text search terms (spaces allowed).

    Returns:
        The raw HTTP response; the body is expected to be RSS XML.

    Raises:
        requests.exceptions.RequestException: on network failure or timeout.
    """
    return requests.get(
        GOOGLE_NEWS_RSS_URL,
        params={"q": query, **GOOGLE_NEWS_LOCALE_PARAMS},
        timeout=RSS_TIMEOUT_SECONDS,  # avoid hanging the notebook on a stalled connection
    )


r1 = fetch_google_news_rss("enchentes rio grande do sul")
r2 = fetch_google_news_rss("cidade atingidas pelas enchentes no RS")
r3 = fetch_google_news_rss("chuvas no rio grande do sul")

In [None]:
# given the response, we can parse the XML and get the items
from xml.etree import ElementTree
from datetime import datetime

def parse_date(date:str) -> str:
    """Convert an RSS ``pubDate`` string to ``DD/MM/YYYY``.

    Only the leading ``"Ddd, DD Mmm YYYY"`` fragment is used; any time of day
    or timezone that follows it is ignored.

    Args:
        date: Date text such as ``"Mon, 24 Jun 2024 18:53:00 GMT"``.

    Returns:
        The calendar date formatted as ``DD/MM/YYYY``.

    Raises:
        IndexError: if ``date`` contains no ``"Ddd, DD Mmm YYYY"`` fragment.
        ValueError: if the fragment cannot be parsed as a date.
    """
    my_date = re.findall(r"[a-zA-Z]{3},\s\d\d\s[a-zA-Z]{3}\s\d{4}", date)[0]
    # %a and %b are locale-dependent; the feed is expected to use English names.
    parsed = datetime.strptime(my_date, "%a, %d %b %Y")
    # The original computed this value but never returned it, so callers got None.
    return parsed.strftime("%d/%m/%Y")

def scrap_rss_news_feed(rss_feeds:list[requests.Response]) -> list[dict]:
    """Collect the news items found in a list of RSS responses.

    Each response body is decoded as UTF-8, stripped of newline characters and
    parsed as XML. At most the first 50 ``./channel/item`` elements of every
    feed are turned into dictionaries.

    Args:
        rss_feeds: HTTP responses whose ``content`` holds an RSS document.

    Returns:
        One dict per item with the keys ``title``, ``link``, ``description``
        and ``pubDate`` (the latter passed through ``parse_date``).
    """
    def _to_record(node) -> dict:
        # Build the record for a single <item> element.
        return {
            "title": node.find("title").text,
            "link": node.find("link").text,
            "description": node.find("description").text,
            "pubDate": parse_date(node.find("pubDate").text),
        }

    collected: list[dict] = []
    for response in rss_feeds:
        xml_text = response.content.decode("utf-8").replace("\n", "")
        channel_items = ElementTree.fromstring(xml_text).findall("./channel/item")
        for node in channel_items[:50]:
            collected.append(_to_record(node))
    return collected

### Read each news article and extract the text

In [None]:
def cleanText(text):
    """Strip stray punctuation and collapse whitespace in ``text``.

    A non-word, non-space character is removed when it is not preceded by an
    ASCII letter or digit, or when it follows another such character and is not
    followed by an ASCII letter or digit. Afterwards every run of two or more
    whitespace characters becomes a single space and the ends are trimmed.
    """
    isolated_special_chars = re.compile(
        r'(?<![a-zA-Z0-9])[^\w\s]|(?<=[^\w\s])[^\w\s](?![a-zA-Z0-9])'
    )
    whitespace_run = re.compile(r'\s\s+')

    without_specials = isolated_special_chars.sub('', text)
    return whitespace_run.sub(' ', without_specials).strip()

In [None]:
# Quick check: the whitespace run between ':' and the date collapses to one space,
# while ':' and '/' are kept because each directly follows a letter or digit.
cleanText('Publicação: \n\r\n            24/06/2024 às 18h53min')

In [None]:
def get_news_content(scrapped_news_sources:list[dict]) -> list[str]:
    """Download every news article and return its substantial paragraphs.

    Each source's ``link`` is fetched with a 5 second timeout and parsed as
    HTML. The text of every ``<p>`` element is cleaned with ``cleanText`` and
    kept only when it has more than 10 space-separated words. Sources that
    cannot be downloaded, decoded or parsed are reported and skipped.

    Args:
        scrapped_news_sources: Dicts that each provide a ``"link"`` URL.

    Returns:
        The retained paragraph texts, across all sources, in encounter order.
    """
    paragraphs_extracted = []
    for index, news_source in enumerate(scrapped_news_sources):
        print(f"Scraping {index+1}/{len(scrapped_news_sources)}")
        try:
            r = requests.get(news_source["link"], timeout=5)
            page_html = r.content.decode("utf-8")
            soup = BeautifulSoup(page_html, "html.parser")
        except requests.exceptions.Timeout:
            print(f"Timeout error on {news_source['link']}")
            continue
        except requests.exceptions.ConnectionError:
            # The builtin ConnectionError is unrelated to the exception requests
            # raises, so catching the builtin never matched network failures.
            print(f"Connection error on {news_source['link']}")
            continue
        except Exception as e:
            # Best effort: any other failure (e.g. a non-UTF-8 body) skips the article.
            print(f"An error occurred: {e}")
            continue
        # filter to only collect paragraphs
        paragraphs = soup.find_all("p")
        for p in paragraphs:
            p_text = cleanText(p.text)
            p_text_words = p_text.split(" ")
            # Short paragraphs are dropped by the word-count threshold.
            if p_text and len(p_text_words) > 10:
                paragraphs_extracted.append(p_text)
    return paragraphs_extracted

#### Reading with HTML2Text

In [None]:
import html2text
# Configure an html2text converter with its ignore_* options enabled for links,
# images, emphasis, tables, anchors and back-references.
h = html2text.HTML2Text()
h.ignore_links = True
h.ignore_images = True
h.ignore_emphasis = True
h.ignore_tables = True
h.ignore_anchors = True
h.ignore_backrefs = True

# NOTE(review): `hmtl` is not assigned at notebook scope in any visible cell, so this
# line raises NameError on a fresh kernel — confirm where the HTML was meant to come
# from. The `.decode("utf-8")` call implies a bytes value is expected here.
print(h.handle(hmtl.decode("utf-8")))

### Try filtering the data by the context

In [None]:
from langchain_community.llms import HuggingFaceHub
from langchain_community.chat_models.huggingface import ChatHuggingFace


# Read the Hugging Face token from the environment (dotenv.load_dotenv() runs in
# the first cell) instead of embedding a credential literal in the notebook.
llm = HuggingFaceHub(
    huggingfacehub_api_token=os.getenv("HUGGINGFACEHUB_API_TOKEN"),
    repo_id="HuggingFaceH4/zephyr-7b-beta",
    task="text-generation",
    model_kwargs={
        "max_new_tokens": 512,
        "top_k": 30,
        # Near-zero temperature keeps generations as repeatable as possible.
        "temperature": 0.0001,
        "repetition_penalty": 1.03,
    },
)

# Chat-style wrapper around the text-generation endpoint configured above.
chat_model = ChatHuggingFace(llm=llm)

In [None]:
# Smoke test: send a trivial prompt to check that the chat model responds.
chat_model.invoke("Can you tell me a joke?", temperature=0.01)

In [None]:
from langchain.prompts import ChatPromptTemplate

# System message: frames the model as a reviewer that filters flood-related news.
context_definition = "You are a news reviewer. You have been tasked to filter news about floods in Rio Grande do Sul and its associated impacts. Your task is filter the context for the user."

# Human message. The f-string has no placeholders, so the <<CONTEXT>> section is
# fixed sample text rather than a scraped article.
user_prompt = f"""
The context below is a news article about floods in Rio Grande do Sul coming from a scraped news website. Please filter the context to only include the paragraphs related to the floods in Rio Grande do Sul and its impacts.

<<CONTEXT>>
The floods in Rio Grande do Sul have caused significant damage to the region. The floods have affected many people and have caused a lot of destruction. The floods have also caused many people to lose their homes and have caused a lot of damage to the infrastructure of the region.  
"""

# Two-message chat template: system instructions followed by the user request.
template = ChatPromptTemplate.from_messages([
            ("system", context_definition),
            ("human", user_prompt),
        ])

# No template variables are supplied, so the messages are rendered as written.
messages = template.format_messages()

# Invoke with a near-zero temperature, matching the value in the model configuration.
response = chat_model.invoke(messages, temperature=0.0001)

In [None]:
# Take the third "</s>"-delimited segment of the raw output.
# NOTE(review): presumably this is the assistant's reply after the echoed system and
# user turns — confirm; raises IndexError if fewer than three segments come back.
response.content.split("</s>")[2]

### Use ChromaDB to store the chunks

In [None]:
import chromadb
from chromadb.utils import embedding_functions
# Chroma client created with default settings.
chroma_client = chromadb.Client()

# Sentence-transformers model used to embed both the documents and the queries.
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="multi-qa-mpnet-base-cos-v1")

# get_or_create_collection keeps this cell re-runnable: create_collection raises
# when a collection named "rs_floods_qa" already exists in the client.
qa_collection = chroma_client.get_or_create_collection(
    name="rs_floods_qa",
    embedding_function=sentence_transformer_ef,
    metadata={"hnsw:space": "cosine"}  # compare embeddings by cosine distance
)

In [None]:
# Run the scraping pipeline: parse the three RSS responses into news items, then
# download each linked article and keep its cleaned paragraphs.
news_sources = scrap_rss_news_feed([r1, r2, r3])
documents = get_news_content(news_sources)

In [None]:
# One id per paragraph. The full UUID4 string is used: truncating it to its last
# 8 hex characters left only 32 random bits, which makes duplicate ids needlessly
# likely as the number of stored paragraphs grows.
ids = [str(uuid.uuid4()) for _ in documents]

# Store the paragraphs; embeddings are computed by the collection's embedding function.
qa_collection.add(
    ids=ids,
    documents=documents,
)

In [None]:
# Retrieve the 10 stored paragraphs closest to a sample question (in Portuguese:
# "How many people were killed by the floods?").
qa_collection.query(query_texts="Qual foi o número de pessoas mortas pelas enchentes?", n_results=10)

In [None]:
import dateparser
from dateparser.search import search_dates

import spacy
# Load spaCy's small Portuguese pipeline, used by the entity experiments below.
nlp = spacy.load("pt_core_news_sm")

# Only the last expression of a cell is displayed, so the result of this parse
# call is discarded; the search_dates result is what the cell shows.
dateparser.parse("July of 2021")
search_dates("Eu me mudei em Maio do ano passado. Ela se mudou em outubro.", languages=["pt"])

In [None]:
# Experiment: label pairs of consecutive proper nouns as a custom entity.

# search for consecutive PROPN entities

# NOTE(review): add_pipe is called on the shared `nlp` object, so re-running this
# cell without reloading the pipeline likely fails on a duplicate component — confirm.
ruler = nlp.add_pipe("entity_ruler")
patterns = [{"label": "JONES", "pattern": [{"POS": "PROPN"}, {"POS": "PROPN"}]}]
ruler.add_patterns(patterns)
# TODO: this has to replace the pipeline's built-in NER component

doc = nlp("Abalada por três vezes pela enchente do Rio Taquari desde setembro do ano passado, a cidade de Roca Sales, no Vale do Taquari, prioriza a reconstrução a cidade por duas frentes: habitação e infraestrutura. De acordo com o prefeito de Roca Sales, Amilton Fontana, por conta das inundações há a necessidade de 400 residências e a desobstrução de estradas e vias públicas.")

In [None]:
# Iterating a Doc yields tokens: print each one with its coarse part-of-speech tag.
for token in doc:
    print(token.text, token.pos_)

In [None]:
# Print every entity span found in `doc` together with its label.
for ent in doc.ents:
    print(ent.text, ent.label_)

In [None]:
import spacy

# Reload the pipeline so the entity_ruler is added to a fresh `nlp` object.
nlp = spacy.load("pt_core_news_sm")

text = "João mora na Bahia, 22/11/1985, seu cpf é 111.222.333-11"
ruler = nlp.add_pipe("entity_ruler")
patterns = [
    # CPF number, matched as two adjacent tokens by shape.
    # NOTE(review): this assumes the tokenizer splits "111.222.333-11" into
    # "111.222." and "333-11" shaped tokens — confirm against the tokenizer output.
    {"label": "JONES", "pattern": [
            {"SHAPE": "ddd.ddd."},
            {"SHAPE": "ddd-dd"},
    ]},
    # Any single proper-noun token.
    {"label": "NOME", "pattern": [
            {"POS": "PROPN"},
    ]},
    # A date written as dd/dd/dddd. The pattern previously listed this token
    # twice, which demands two consecutive dates and so could never match the
    # single date in `text`.
    {"label": "DATA", "pattern": [
            {"SHAPE": "dd/dd/dddd"},
    ]},
]

ruler.add_patterns(patterns)
doc = nlp(text)

#extract entities
for ent in doc.ents:
    print (ent.text, ent.label_)