In [4]:
!pip install beautifulsoup4==4.12.3
!pip install certifi==2024.8.30
!pip install charset-normalizer==3.3.2
!pip install filelock==3.16.1
!pip install fsspec==2024.9.0
!pip install huggingface-hub==0.25.0
!pip install idna==3.10
!pip install Jinja2==3.1.4
!pip install MarkupSafe==2.1.5
!pip install mpmath==1.3.0
!pip install networkx==3.3
!pip install numpy==2.1.1
!pip install packaging==24.1
!pip install PyYAML==6.0.2
!pip install regex==2024.9.11
!pip install requests==2.32.3
!pip install safetensors==0.4.5
!pip install setuptools==75.1.0
!pip install soupsieve==2.6
!pip install sympy==1.13.3
!pip install tokenizers==0.19.1
!pip install torch==2.4.1
!pip install tqdm==4.66.5
!pip install transformers==4.44.2
!pip install typing_extensions==4.12.2
!pip install urllib3==2.2.3
!pip install emoji==0.6.0

Collecting beautifulsoup4==4.12.3
  Using cached beautifulsoup4-4.12.3-py3-none-any.whl.metadata (3.8 kB)
Using cached beautifulsoup4-4.12.3-py3-none-any.whl (147 kB)
Installing collected packages: beautifulsoup4
  Attempting uninstall: beautifulsoup4
    Found existing installation: beautifulsoup4 4.12.2
    Uninstalling beautifulsoup4-4.12.2:
      Successfully uninstalled beautifulsoup4-4.12.2
Successfully installed beautifulsoup4-4.12.3
Collecting emoji==0.6.0
  Downloading emoji-0.6.0.tar.gz (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.0/51.0 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[1 lines of output][0m
  [31m   [0m ERROR: Can not execute `setup.py` since setuptools is not available i

In [3]:
import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
import hashlib


def fetch_news_valor(timeout=10):
    """Scrape headline titles and links from the Valor home page.

    Two regions of the page are read, in this order: every
    ``div.highlight__content`` (one headline anchor inside
    ``h2.highlight__title``) and every ``div.highlight__links`` (all anchors
    inside its ``ul``).

    Args:
        timeout: Seconds to wait for the HTTP response before giving up.
            Without a timeout ``requests.get`` can block indefinitely.

    Returns:
        A list of dicts with the keys ``id``, ``title``, ``link``,
        ``sentiment``, ``sentiment_score`` and ``scrapped_at`` (in that
        order). The list is empty when the request fails or the response
        status is not 200.
    """
    url = "https://valor.globo.com"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        # Network-level failures are reported the same way as a bad status.
        print(f"Falha ao acessar o site: {e}")
        return []

    if response.status_code != 200:
        print("Falha ao acessar o site")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    # Collect the anchors first so both page regions share one dict builder.
    anchors = []
    for content in soup.find_all('div', class_='highlight__content'):
        heading = content.find('h2', class_='highlight__title')
        # Some highlight blocks may lack the heading or its anchor; skip them
        # instead of failing on None.
        anchor = heading.find('a') if heading is not None else None
        if anchor is not None:
            anchors.append(anchor)

    for content in soup.find_all('div', class_='highlight__links'):
        link_list = content.find('ul')
        if link_list is not None:
            anchors.extend(link_list.find_all('a'))

    news_list = []
    for anchor in anchors:
        href = anchor.get('href')
        if not href:
            # An anchor without a target cannot be stored as a news link.
            continue

        title = anchor.get_text().strip()
        news_list.append({
            'id': generate_id(title),
            'title': title,
            'link': href,
            'sentiment': '',
            'sentiment_score': 0,
            'scrapped_at': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        })

    return news_list

def fetch_news_cnn(timeout=10):
    """Scrape headline titles and links from the CNN Brasil economy section.

    Headlines are the ``h2``/``h3`` elements carrying the class
    ``block__news__title`` or ``news-item-header__title``; the link is taken
    from the enclosing ``a`` element. Headings that are not wrapped in an
    anchor, or whose anchor has no ``href``, are skipped.

    Args:
        timeout: Seconds to wait for the HTTP response before giving up.
            Without a timeout ``requests.get`` can block indefinitely.

    Returns:
        A list of dicts with the keys ``id``, ``title``, ``link``,
        ``sentiment``, ``sentiment_score`` and ``scrapped_at`` (in that
        order). The list is empty when the request fails or the response
        status is not 200.
    """
    url = "https://www.cnnbrasil.com.br/economia/"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        # Network-level failures are reported the same way as a bad status.
        print(f"Falha ao acessar o site: {e}")
        return []

    if response.status_code != 200:
        print("Falha ao acessar o site")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    news_list = []
    headings = soup.find_all(['h3', 'h2'], class_=['block__news__title', 'news-item-header__title'])
    for heading in headings:
        link = heading.find_parent('a')
        href = link.get('href') if link is not None else None
        if not href:
            continue

        title = heading.get_text().strip()
        news_list.append({
            'id': generate_id(title),
            'title': title,
            'link': href,
            'sentiment': '',
            'sentiment_score': 0,
            'scrapped_at': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        })

    return news_list

def save_csv(lista, filename):
    """Write a list of dicts to a pipe-delimited UTF-8 CSV file.

    The header row is taken from the keys of the first dict, so every row is
    expected to share those keys.

    Args:
        lista: List of dicts to write. When it is empty nothing is written
            and no file is created.
        filename: Path of the CSV file to create (overwritten if it exists).

    Returns:
        None. Success and failure are reported via ``print``; I/O errors are
        caught and reported rather than raised.
    """
    if not lista:
        # Without at least one row there is no header to derive; indexing
        # lista[0] would raise IndexError.
        print("Nenhuma notícia para salvar.")
        return

    headers = list(lista[0].keys())

    try:
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=headers, delimiter='|')
            writer.writeheader()
            writer.writerows(lista)

        print(f"Arquivo '{filename}' criado com sucesso.")
    except IOError as e:
        print(f"Erro ao criar o arquivo: {e}")

def generate_id(title):
    """Return a 16-character hexadecimal identifier derived from ``title``.

    The identifier is the first 16 hex digits of the SHA-256 digest of the
    UTF-8 encoded title, so equal titles always map to the same id.
    """
    hasher = hashlib.sha256()
    hasher.update(title.encode('utf-8'))
    full_digest = hasher.hexdigest()
    return full_digest[:16]

def predict_sentiment(news_list):
    """Classify the sentiment of each news title and store it on the item.

    Loads the ``pysentimiento/bertweet-pt-sentiment`` tokenizer and model,
    runs every item's ``title`` through the model and writes the winning
    label and its softmax probability into the item's ``sentiment`` and
    ``sentiment_score`` keys. Progress is printed for each item.

    Args:
        news_list: List of dicts, each with at least a ``title`` key. The
            dicts are modified in place.

    Returns:
        The same ``news_list`` object that was passed in.
    """
    print("Predizendo sentimento...")
    model_name = "pysentimiento/bertweet-pt-sentiment"
    tokenizer = AutoTokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
    bert = AutoModelForSequenceClassification.from_pretrained(model_name)

    # Class index -> display label.
    # NOTE(review): assumes the model's output order is negative/neutral/positive — confirm.
    labels = {0: "Negativo", 1: "Neutro", 2: "Positivo"}

    # Inference only: gradients are not needed.
    with torch.no_grad():
        for item in news_list:
            headline = item['title']
            print(f"Sentimento da notícia: {headline}", end="... ")

            encoded = tokenizer(headline, return_tensors="pt", truncation=True)
            logits = bert(**encoded).logits
            probabilities = torch.softmax(logits, dim=1).cpu().numpy()

            sentiment_score = np.max(probabilities)
            sentiment = labels[np.argmax(probabilities)]

            item['sentiment'] = sentiment
            item['sentiment_score'] = sentiment_score

            print(f"Sentimento: {sentiment}, Score: {sentiment_score}")

    return news_list

def main():
    """Scrape both news sources, classify sentiment and save the result.

    News from CNN Brasil and Valor are concatenated (CNN first), passed
    through ``predict_sentiment`` and written to ``news_<YYYY-MM-DD_HH>.csv``
    in the current working directory. When neither source yields any item the
    function reports it and returns without loading the model or writing a
    file.
    """
    news = fetch_news_cnn()
    news += fetch_news_valor()

    if not news:
        # Both scrapers return [] on failure; there is nothing to classify
        # or save in that case.
        print("Nenhuma notícia encontrada; nada a salvar.")
        return

    news = predict_sentiment(news)
    save_csv(news, f'news_{datetime.now().strftime("%Y-%m-%d_%H")}.csv')
main()


emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


Predizendo sentimento...
Sentimento da notícia: Brasil pode atender demanda do setor de tecnologia com energia verde, dizem especialistas... Sentimento: Neutro, Score: 0.9177208542823792
Sentimento da notícia: Energia verde é caminho para barateamento e descarbonização de processos, dizem especialistas... Sentimento: Neutro, Score: 0.8592284917831421
Sentimento da notícia: Alta da Selic deve elevar pedidos de recuperação judicial, vê consultoria... Sentimento: Neutro, Score: 0.9382232427597046
Sentimento da notícia: Alta dos juros pode impactar projetos de infraestrutura... Sentimento: Neutro, Score: 0.952080488204956
Sentimento da notícia: Análise: postura mais agressiva do Fed é destoante e arriscada... Sentimento: Neutro, Score: 0.8147657513618469
Sentimento da notícia: Marca própria ou franquia? Como decidir a melhor alternativa ao empreender... Sentimento: Neutro, Score: 0.9317739605903625
Sentimento da notícia: Zuckerberg entra para o clube exclusivo de pessoas com mais de US$ 20