In [185]:
import requests
import os
import dotenv
import json
import re
from bs4 import BeautifulSoup
import uuid
import pandas as pd
from tqdm.notebook import tqdm
import numpy as np

dotenv.load_dotenv()

True

In [None]:
# payload = {
#     "key": os.getenv("GSE_API_KEY"),
#     "cx": os.getenv("GSE_CX"),
#     "gl": "br",
#     "lr": "lang_pt",
#     "q": "google news enchentes rio grande do sul clicrbs",
#     "num": 5
# }

### Scrap the Google News RSS feed for the latest news

In [186]:
searchStrings = [
    "enchentes rio grande do sul",
    "cidade atingidas pelas enchentes no RS",
    "chuvas no rio grande do sul",
    "danos nas cidades do vale do taquari",
    "chuvas no vale do taquari",
    "chuvas em lajeado",
    "maior enchente em porto alegre",
    "chuvas em porto alegre",
    "abrigos de animais em porto alegre",
    "abrigos pós enchentes em porto alegre",
]

In [187]:
# This method will receive phrases like a user search and return the list of request objects
def requestFactory(searchStrings:list[str]):
    requestsList = []
    for searchString in searchStrings:
        searchStringFormatted = re.sub(r"\s+", "%25", searchString)
        r = requests.get(f"https://news.google.com/rss/search?q={searchStringFormatted}&ceid=BR:pt-419&hl=pt-BR&gl=BR")
        requestsList.append(r)
    return requestsList

In [188]:
# given the response, we can parse the XML and get the items
from xml.etree import ElementTree
from datetime import datetime

def parse_date(date:str) -> str:
    my_date = re.findall(r"[a-zA-Z]{3},\s\d\d\s[a-zA-Z]{3}\s\d{4}", date)[0]
    my_date = datetime.strptime(my_date, "%a, %d %b %Y")
    my_date = my_date.strftime("%d/%m/%Y")
    return my_date

def scrap_rss_news_feed(rss_feeds:list[requests.Response]) -> list[dict]:
    scrapped_news:list[dict] = []
    for rss_feed in rss_feeds:
        parsed = rss_feed.content.decode("utf-8").replace("\n", "")
        root = ElementTree.fromstring(parsed)
        result_items = root.findall("./channel/item")
        for item in result_items[:50]:
            scrapped_news.append({
                "title": item.find("title").text,
                "link": item.find("link").text,
                "description": item.find("description").text,
                "pubDate": parse_date(item.find("pubDate").text)
            })
    return scrapped_news

### Read each news article and extract the text

In [189]:
def cleanText(text):
    # Regex pattern to match isolated special characters
    isolatedSpecialCharacters = r'(?<![a-zA-Z0-9])[^\w\s]|(?<=[^\w\s])[^\w\s](?![a-zA-Z0-9])'
    
    
    cleaned_text = re.sub(isolatedSpecialCharacters, '', text)
    cleaned_text = re.sub(r'\s\s+', ' ', cleaned_text)
    cleaned_text = cleaned_text.strip()

    return cleaned_text

cleanText('Publicação: \n\r\n            24/06/2024 às 18h53min')

'Publicação: 24/06/2024 às 18h53min'

In [211]:
def get_news_content(scrapped_news_sources:list[dict]) -> list[str]:
    rs_cities = open("rs_cities.txt", "r").read().split("\n")
    rs_cities = [city.lower() for city in rs_cities]
    for _, news_source in tqdm(list(enumerate(scrapped_news_sources))):
        try:
            r = requests.get(news_source["link"], timeout=5)
            hmtl = r.content.decode("utf-8")
            soup = BeautifulSoup(hmtl, "html.parser")
        except requests.exceptions.Timeout:
            print(f"Timeout error on {news_source['link']}")
            continue
        except ConnectionError:
            print(f"Connection error on {news_source['link']}")
            continue
        except Exception as e:
            print(f"An error occurred: {e}")
            continue
        # filter to only collect paragraphs
        paragraphs = soup.find_all("p")
        full_paragraphs = " ".join([p.text for p in paragraphs])
        full_paragraphs = cleanText(full_paragraphs)
        full_paragraphs = full_paragraphs.encode("utf-8").decode("utf-8")
        rule_identifier = getNLPWithRuler()
        identified_cities = set()
        for entity in rule_identifier(full_paragraphs).ents:
            if entity.label_ == "CITY" and entity.text.lower() in rs_cities:
                identified_cities.add(entity.text)
        for p in paragraphs:
            p_text = cleanText(p.text)
            p_text_words = p_text.split(" ")
            if p_text and len(p_text_words) > 10:
                if "docs" not in news_source:
                    news_source["docs"] = []
                p_text = p_text.encode("utf-8").decode("utf-8")
                news_source["docs"].append(p_text)
                news_source["cities"] = list(identified_cities)
                # paragraphs_extracted.append(p_text)
    return scrapped_news_sources

#### Reading with HTML2Text

In [None]:
import html2text
h = html2text.HTML2Text()
h.ignore_links = True
h.ignore_images = True
h.ignore_emphasis = True
h.ignore_tables = True
h.ignore_anchors = True
h.ignore_backrefs = True

print(h.handle(hmtl.decode("utf-8")))

### Try filtering the data by the context

In [None]:
from langchain_community.llms import HuggingFaceHub
from langchain_community.chat_models.huggingface import ChatHuggingFace


llm = HuggingFaceHub(huggingfacehub_api_token ='',
    repo_id="HuggingFaceH4/zephyr-7b-beta",
    task="text-generation",
    model_kwargs={
        "max_new_tokens": 512,
        "top_k": 30,
        "temperature": 0.0001,
        "repetition_penalty": 1.03,
    },
)

chat_model = ChatHuggingFace(llm=llm)

In [None]:
chat_model.invoke("Can you tell me a joke?", temperature=0.01)

In [None]:
from langchain.prompts import ChatPromptTemplate

context_definition = "You are a news reviewer. You have been tasked to filter news about floods in Rio Grande do Sul and its associated impacts. Your task is filter the context for the user."

user_prompt = f"""
The context below is a news article about floods in Rio Grande do Sul coming from a scraped news website. Please filter the context to only include the paragraphs related to the floods in Rio Grande do Sul and its impacts.

<<CONTEXT>>
The floods in Rio Grande do Sul have caused significant damage to the region. The floods have affected many people and have caused a lot of destruction. The floods have also caused many people to lose their homes and have caused a lot of damage to the infrastructure of the region.  
"""

template = ChatPromptTemplate.from_messages([
            ("system", context_definition),
            ("human", user_prompt),
        ])

messages = template.format_messages()

response = chat_model.invoke(messages, temperature=0.0001)

In [None]:
response.content.split("</s>")[2]

### Use ChoromaDB to store the chunks

In [212]:
news_sources = scrap_rss_news_feed(requestFactory(searchStrings))
news_sources = list({v['link']:v for v in news_sources}.values())

documents = get_news_content(news_sources)
# with open("news_sources.json", "w") as f:
#     json.dump(documents, f)

  0%|          | 0/468 [00:00<?, ?it/s]

An error occurred: HTTPSConnectionPool(host='www.estado.rs.gov.br', port=443): Max retries exceeded with url: /defesa-civil-atualiza-balanco-das-enchentes-no-rs-2-7-18h (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1006)')))
An error occurred: 'utf-8' codec can't decode byte 0xfa in position 87: invalid start byte
Timeout error on https://news.google.com/rss/articles/CBMijgFodHRwczovL2FnZW5jaWFicmFzaWwuZWJjLmNvbS5ici9yYWRpb2FnZW5jaWEtbmFjaW9uYWwvZ2VyYWwvYXVkaW8vMjAyNC0wNy9zb2JlLXBhcmEtMTgwLW51bWVyby1kZS1tb3J0b3MtcGVsYXMtZW5jaGVudGVzLW5vLXJpby1ncmFuZGUtZG8tc3Vs0gEA?oc=5
Timeout error on https://news.google.com/rss/articles/CBMidWh0dHBzOi8vYWdlbmNpYWJyYXNpbC5lYmMuY29tLmJyL2dlcmFsL25vdGljaWEvMjAyNC0wNS9xdWFzZS05MC1kYXMtY2lkYWRlcy1kby1ycy1mb3JhbS1hdGluZ2lkYXMtcGVsYXMtZm9ydGVzLWNodXZhc9IBAA?oc=5
An error occurred: HTTPSConnectionPool(host='www.defesacivil.rs.gov.br', port=443): Max 

In [259]:
def docsPostProcessor(docs=None):
    if not docs:
        docs = json.loads(open("news_sources.json", "r").read())
    docs_df = pd.DataFrame(docs)
    docs_df = docs_df.explode("docs").dropna(subset=["docs"])
    docs_df = docs_df.drop_duplicates(subset=["docs"])
    docs_df["cities"] = docs_df["cities"].apply(lambda x: " ".join([city.replace(" ", "_") for city in x]))
    docs_df = docs_df.reset_index(drop=True)
    return docs_df

docs_df = docsPostProcessor(documents)
docs_df["hash"] = docs_df["docs"].apply(lambda x: uuid.uuid4().hex)
# docs_df.to_json("news_sources_db_ready.json", orient="records")
docs_df

Unnamed: 0,title,link,description,pubDate,docs,cities,hash
0,Governo do RS inaugura primeira 'cidade provis...,https://news.google.com/rss/articles/CBMinAFod...,"<a href=""https://news.google.com/rss/articles/...",04/07/2024,"Centro Humanitário de Acolhimento, o Recomeço,...",Canoas,5c8310daa6b34ddfae47e88084159949
1,Governo do RS inaugura primeira 'cidade provis...,https://news.google.com/rss/articles/CBMinAFod...,"<a href=""https://news.google.com/rss/articles/...",04/07/2024,"A primeira cidade provisória"" do Rio Grande do...",Canoas,318d632cd2894d9388142136b430cf03
2,Governo do RS inaugura primeira 'cidade provis...,https://news.google.com/rss/articles/CBMinAFod...,"<a href=""https://news.google.com/rss/articles/...",04/07/2024,"Inicialmente chamado de cidade provisória"" pel...",Canoas,348fb856b02f4592984482b989af4932
3,Governo do RS inaugura primeira 'cidade provis...,https://news.google.com/rss/articles/CBMinAFod...,"<a href=""https://news.google.com/rss/articles/...",04/07/2024,Confira imagens do Centro Humanitário de Acolh...,Canoas,44218f7a8436434ba262799afa0c4e3f
4,Governo do RS inaugura primeira 'cidade provis...,https://news.google.com/rss/articles/CBMinAFod...,"<a href=""https://news.google.com/rss/articles/...",04/07/2024,"De acordo com o governo do RS, a estrutura foi...",Canoas,54780965be584e1fa759f193356dbee4
...,...,...,...,...,...,...,...
5030,Porto Alegre terá abrigo exclusivo para mulher...,https://news.google.com/rss/articles/CBMirgFod...,"<a href=""https://news.google.com/rss/articles/...",10/05/2024,Lauro Jardim: Quem pediu a Elon Musk ajuda par...,Canoas colorado Porto_Alegre Viamão,f4f6d9e72c8e42bebb12f83c0d4caa0f
5031,Porto Alegre terá abrigo exclusivo para mulher...,https://news.google.com/rss/articles/CBMirgFod...,"<a href=""https://news.google.com/rss/articles/...",10/05/2024,Cinco ocorrências de estupro foram registradas...,Canoas colorado Porto_Alegre Viamão,4f77ffeaa6604f71a772e278efeb5133
5032,Porto Alegre terá abrigo exclusivo para mulher...,https://news.google.com/rss/articles/CBMirgFod...,"<a href=""https://news.google.com/rss/articles/...",10/05/2024,Escolhas políticas estão na origem de eventos ...,Canoas colorado Porto_Alegre Viamão,bdb12f9d2da5415ba6fd67048a549cbd
5033,Porto Alegre terá abrigo exclusivo para mulher...,https://news.google.com/rss/articles/CBMirgFod...,"<a href=""https://news.google.com/rss/articles/...",10/05/2024,Os abrigos não contam com a presença permanent...,Canoas colorado Porto_Alegre Viamão,2487d0b76d6946058354fcb290be9441


In [226]:
import chromadb
from chromadb.utils import embedding_functions
chroma_client = chromadb.Client()

sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="multi-qa-mpnet-base-cos-v1")

qa_collection = chroma_client.get_or_create_collection(
    name="rs_floods_qa",
    embedding_function=sentence_transformer_ef,
    metadata={"hnsw:space": "cosine"}
)

In [227]:
docs = pd.read_json("news_sources_db_ready.json", orient="records")

docs.apply(lambda x: qa_collection.add(
    ids=x["hash"],
    documents=x["docs"],
    metadatas={
        "title": x["title"],
        "link": x["link"],
        "pubDate": x["pubDate"],
        "cities": x["cities"]
    }),
    axis=1)

ValueError: Expected metadata value to be a str, int, float or bool, got ['Canoas'] which is a list

In [None]:
qa_collection.query(query_texts="Qual foi o número de pessoas mortas pelas enchentes?", n_results=10)

### Testing

In [137]:
import dateparser
from dateparser.search import search_dates

import spacy
nlp = spacy.load("pt_core_news_sm")

dateparser.parse("July of 2021")
search_dates("Eu me mudei em Maio do ano passado. Ela se mudou em outubro.", languages=["pt"])

[('em Maio', datetime.datetime(2024, 5, 4, 0, 0)),
 ('ano passado', datetime.datetime(2023, 5, 4, 0, 0)),
 ('em outubro', datetime.datetime(2024, 10, 4, 0, 0))]

In [183]:
from collections import Counter

def getCityPosPattern() -> list[str]:
    rs_cities = open("rs_cities.txt", "r").read().split("\n")
    list_of_patterns = []
    for city in rs_cities:
        city_doc = nlp(city)
        str_pattern = " ".join([ent.pos_ for ent in city_doc])
        list_of_patterns.append(str_pattern)

    list_of_patterns = list(filter(lambda x: x not in ["NOUN", "VERB", "PROPN", "ADJ"], list_of_patterns))
    # print(Counter(list_of_patterns).most_common(10))
    # print(sum([count for _, count in Counter(list_of_patterns).most_common(10)]))
    most_commons = [pattern.split(" ") for pattern, _ in Counter(list_of_patterns).most_common(10)]
    # print(most_commons)
    rulerPatterns = []
    for pattern in most_commons:
        rulerPatterns.append({"label": "CITY", "pattern": [{"POS": pos} for pos in pattern]})
    return rulerPatterns

In [184]:
import spacy

import warnings
warnings.filterwarnings("ignore")

def getNLPWithRuler():
    nlp = spacy.load("pt_core_news_sm")

    nlp.remove_pipe("ner")
    text = "João morou em São Jerônimo do Sul por 5 anos. Ele se mudou para Lajeado em 2019. Encantado com a cidade, ele decidiu ficar. Vale do Taquari é uma região muito bonita."
    ruler = nlp.add_pipe("entity_ruler")
    rs_cities = open("rs_cities.txt", "r").read().split("\n")
    rs_cities = [city.lower() for city in rs_cities]

    ruler.add_patterns(getCityPosPattern())
    ruler.add_patterns([{"label": "CITY", "pattern": [{"LOWER": {"IN": rs_cities}}]}])

    return nlp