In [91]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [92]:
from news_summarizer.domain.base.nosql import _database

collection_link = _database.get_collection("link")
indexes = collection_link.find({"source": "https://www.r7.com/"})
indexes = list(indexes)

In [93]:
indexes

[{'_id': '98026507-e638-4159-8d06-a24438bca7e0',
  'title': 'www.r7.com',
  'url': 'https://www.r7.com/',
  'source': 'https://www.r7.com/',
  'published_at': None,
  'extracted_at': datetime.datetime(2024, 12, 5, 20, 11, 57, 667000)},
 {'_id': '88bb7a25-97aa-4cde-b96f-40b8c040afc6',
  'title': '',
  'url': 'https://www.playplus.com/',
  'source': 'https://www.r7.com/',
  'published_at': None,
  'extracted_at': datetime.datetime(2024, 12, 5, 20, 11, 57, 667000)},
 {'_id': '30824731-faff-443c-bc17-a475b768e937',
  'title': 'Abrace a ABADS 2024',
  'url': 'https://www.abads.org.br/',
  'source': 'https://www.r7.com/',
  'published_at': None,
  'extracted_at': datetime.datetime(2024, 12, 5, 20, 11, 57, 667000)},
 {'_id': '852d9c4c-e190-4c7b-a55c-d39ac7222940',
  'title': 'Site Oficial',
  'url': 'https://record.r7.com/acerte-ou-caia/',
  'source': 'https://www.r7.com/',
  'published_at': None,
  'extracted_at': datetime.datetime(2024, 12, 5, 20, 11, 57, 667000)},
 {'_id': 'e0275c4c-e07c-4

In [94]:
collection_link = _database.get_collection("article")
indexes = collection_link.find({"title": "Mauro Cid vai prestar novo depoimento à Polícia Federal nesta quinta-feira"})
indexes = list(indexes)

In [95]:
import locale
from news_summarizer.domain.base.nosql import NoSQLBaseLink
from news_summarizer.scraper.base import BaseSeleniumScraper

In [96]:
from datetime import datetime
from typing import Optional

from pydantic import AnyUrl, Field, field_serializer

class Link(NoSQLBaseLink):
    title: str = Field(..., description="The title of the link")
    url: AnyUrl = Field(description="The URL of the link")
    source: Optional[str] = Field(None, description="The source of the link")
    published_at: Optional[datetime] = Field(None, description="The publication date of the link")
    extracted_at: datetime = Field(
        default_factory=datetime.now,
        description="The timestamp when the link was extracted",
    )

    @field_serializer("url")
    def url_string(self, url: AnyUrl):
        return str(url)

    class Settings:
        name = "link"

from datetime import datetime

class Article(NoSQLBaseLink):
    title: str = Field(..., description="The title of the link")
    subtitle: str = Field(..., description="The subtitle of the link")
    author: str = Field(...,  description="The author")
    publication_date: Optional[datetime] = Field(None, description="The publication date of the link")
    content: str = Field(..., description="Content")
    url: AnyUrl = Field(description="The URL of the link")

    @field_serializer("url")
    def url_string(self, url: AnyUrl):
        return str(url)

    class Settings:
        name = "article"

In [None]:
import re
import time
import numpy as np
from bs4.element import Tag
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


class G1Scraper(BaseSeleniumScraper):
    model = Article

    def __init__(self) -> None:
        super().__init__()

    def extract(self, article_link: str, **kwargs) -> None:
        self.driver.get(article_link)
        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        self.soup = BeautifulSoup(self.driver.page_source, "html.parser")
        if len(self.soup) == 0:
            raise ValueError("No elements scraped from link %s", article_link)

        title = self._extract_title(self.soup)
        author = self._extract_author(self.soup)
        subtitle = self._extract_subtitle(self.soup)
        content = self._extract_content(self.soup)
        publication_date = self._extract_publication_date(self.soup)

        intance = self.model(
            title=title,
            subtitle=subtitle,
            author=author,
            content=content,
            publication_date=publication_date,
            url=article_link,
        )

        intance.save()
    def _extract_title(self, soup: BeautifulSoup):
        title = soup.find("h1", class_="content-head__title").text
        return title

    def _extract_author(self, soup: BeautifulSoup):
        author = soup.find("a", class_="multi_signatures").text
        return author

    def _extract_subtitle(self, soup: BeautifulSoup):
        subtitle = soup.find("h2", class_="content-head__subtitle").text
        return subtitle

    def _extract_content(self, soup: BeautifulSoup):
        paragraphs = soup.select("div.mc-article-body p")
        content = "\n".join([p.text for p in paragraphs if len(p) > 0])
        return content

    def _extract_publication_date(self, soup: BeautifulSoup):
        publication_date = soup.find("time", itemprop="datePublished")["datetime"]
        return publication_date


class R7Scraper(BaseSeleniumScraper):
    model = Article

    def __init__(self) -> None:
        super().__init__()

    def extract(self, article_link: str, **kwargs) -> None:
        self.driver.get(article_link)
        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        self.soup = BeautifulSoup(self.driver.page_source, "html.parser")
        if len(self.soup) == 0:
            raise ValueError("No elements scraped from link %s", article_link)
        
        title = self._extract_title(self.soup)
        author = self._extract_author(self.soup)
        subtitle = self._extract_subtitle(self.soup)
        content = self._extract_content(self.soup)
        publication_date = self._extract_publication_date(self.soup)

        intance = self.model(
            title=title,
            subtitle=subtitle,
            author=author,
            content=content,
            publication_date=publication_date,
            url=article_link,
        )

        intance.save()

    def _extract_title(self, soup: BeautifulSoup):
        title = soup.find(
            "h1",
            class_="base-font-primary dark:base-text-neutral-high-400 base-mb-xxxs base-text-xl base-font-semibold base-leading-xxl lg:base-leading-giant lg:base-text-xxl base-text-neutral-low-500",
        ).text
        return title

    def _extract_author(self, soup: BeautifulSoup):
        first_span = soup.find(
            "span",
            class_="article-text-editorial-color article-ml-quark article-mr-quark dark:!article-text-neutral-high-400",
        )
        author = first_span.find_next("span").text
        return author

    def _extract_subtitle(self, soup: BeautifulSoup):
        subtitle = soup.find(
            "h2",
            class_="base-font-primary dark:base-text-neutral-high-400 base-text-xxs base-font-bold base-leading-md sm:base-text-md sm:base-font-medium sm:base-leading-lg base-text-neutral-low-500",
        ).text
        return subtitle

    def _extract_content(self, soup: BeautifulSoup):
        article = soup.find("article", class_="b-article-body")
        paragraphs = article.find_all_next("span", class_="b-article-body__text")
        content = "\n".join([p.text for p in paragraphs if len(p) > 0])
        return content

    def _extract_publication_date(self, soup: BeautifulSoup):
        publication_date = soup.find("time", itemprop="datePublished")["datetime"]
        modified_date = soup.find("time", itemprop="dateModified")["datetime"]
        if modified_date is None:
            return publication_date
        return modified_date
    

class BandScraper(BaseSeleniumScraper):
    model = Article

    def __init__(self) -> None:
        super().__init__()

    def extract(self, article_link: str, **kwargs) -> None:
        self.driver.get(article_link)
        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        self.soup = BeautifulSoup(self.driver.page_source, "html.parser")
        if len(self.soup) == 0:
            raise ValueError("No elements scraped from link %s", article_link)
        
        title = self._extract_title(self.soup)
        author = self._extract_author(self.soup)
        subtitle = self._extract_subtitle(self.soup)
        content = self._extract_content(self.soup)
        publication_date = self._extract_publication_date(self.soup)

        intance = self.model(
            title=title,
            subtitle=subtitle,
            author=author,
            content=content,
            publication_date=publication_date,
            url=article_link,
        )

        intance.save()

    def _extract_title(self, soup: BeautifulSoup):
        h1_element = soup.find(
            "h1",
            class_="cs-entry__title",
        )
        title = h1_element.find('span').text
        return title

    def _extract_author(self, soup: BeautifulSoup):
        author = soup.find(
            "span",
            class_="cs-meta-author-name",
        ).text
        return author

    def _extract_subtitle(self, soup: BeautifulSoup):
        subtitle = soup.find(
            "div",
            class_="cs-entry__subtitle",
        ).text
        return subtitle

    def _extract_content(self, soup: BeautifulSoup):
        article = soup.find("div", class_="cs-entry__content-wrap")
        paragraphs = article.find_all("p")
        content = "\n".join([p.text for p in paragraphs if len(p) > 0])
        return content

    def _translate_months(self, date_str: str):
        months_dict = {
            "janeiro": "January", "fevereiro": "February", "março": "March",
            "abril": "April", "maio": "May", "junho": "June",
            "julho": "July", "agosto": "August", "setembro": "September",
            "outubro": "October", "novembro": "November", "dezembro": "December"
        }

        # Traduz o nome do mês para inglês
        for month_pt, month_en in months_dict.items():
            date_str = date_str.replace(month_pt, month_en)

        return date_str


    def _extract_publication_date(self, soup: BeautifulSoup):
        date_string = soup.find("div", class_='cs-meta-date').text
        date_string = self._translate_months(date_string)
        date_format = "%B %d, %Y"
        publication_date = datetime.strptime(date_string, date_format)
        return publication_date

In [98]:
scraper = R7Scraper()
scraper.extract(article_link='https://noticias.r7.com/brasilia/comissao-do-senado-sabatina-indicados-de-lula-ao-bc-na-proxima-terca-feira-05122024/')

INFO:WDM:Get LATEST edgedriver version for Edge 131.0.2903
INFO:WDM:Get LATEST edgedriver version for Edge 131.0.2903
INFO:WDM:Driver [/home/llya/.wdm/drivers/edgedriver/linux64/131.0.2903.87/msedgedriver] found in cache
INFO:news_summarizer.domain.base.nosql:Inserting: {'title': 'Comissão do Senado sabatina indicados de Lula ao BC na próxima terça-feira', 'subtitle': 'Se aprovados na comissão, o grupo segue para análise do plenário da Casa', 'author': 'Rute Moraes, do R7, em Brasília', 'publication_date': datetime.datetime(2024, 12, 5, 23, 23, 12, 728000, tzinfo=TzInfo(UTC)), 'content': 'O presidente da CAE (Comissão de Assuntos Econômicos) do Senado, Vanderlan Cardoso, marcou para a próxima terça-feira (10) a sabatina dos três indicados pelo presidente Luiz Inácio Lula da Silva para a diretoria do BC (Banco Central).\nEm 29 de novembro, o chefe do Executivo indicou os servidores de carreira do banco Izabela Correa e Gilneu Vivan e o banqueiro do Bradesco Nilton David aos postos. A sa

In [99]:
scraper = G1Scraper()
scraper.extract(article_link="https://g1.globo.com/saude/noticia/2024/12/04/o-que-acontece-com-o-corpo-quando-a-vitamina-d-esta-baixa.ghtml")

INFO:WDM:Get LATEST edgedriver version for Edge 131.0.2903
INFO:WDM:Get LATEST edgedriver version for Edge 131.0.2903
INFO:WDM:Driver [/home/llya/.wdm/drivers/edgedriver/linux64/131.0.2903.87/msedgedriver] found in cache
INFO:news_summarizer.domain.base.nosql:Inserting: {'title': 'O que acontece com o corpo quando a vitamina D está baixa?', 'subtitle': 'Vitamina é sintetizada no corpo durante a exposição ao sol. Baixos níveis podem causar problemas como osteoporose e raquitismo. ', 'author': 'Júlia Carvalho', 'publication_date': datetime.datetime(2024, 12, 4, 3, 30, 15, 687000, tzinfo=TzInfo(-03:00)), 'content': ' Exposição sol é responsável pela sintetização da vitamina D no corpo. — Foto: Freepik \n Mais do que ajudar a manter um aspecto saudável e corado, poucos minutos diários de sol são responsáveis por evitar um problema que tem se tornado cada vez mais comum na população: a deficiência de vitamina D. \n ☀️Isso porque a exposição solar desencadeia o processo de síntese do colecal

In [100]:
scraper = BandScraper()
scraper.extract(article_link='https://bandnewstv.uol.com.br/2024/12/05/mauro-cid-vai-prestar-novo-depoimento-a-policia-federal-nesta-quinta-feira/')



INFO:WDM:Get LATEST edgedriver version for Edge 131.0.2903
INFO:WDM:Get LATEST edgedriver version for Edge 131.0.2903
INFO:WDM:Driver [/home/llya/.wdm/drivers/edgedriver/linux64/131.0.2903.87/msedgedriver] found in cache
INFO:news_summarizer.domain.base.nosql:Inserting: {'title': 'Mauro Cid vai prestar novo depoimento à Polícia Federal nesta quinta-feira', 'subtitle': '\n\t\t\t\t\tIntimação não especifica sobre qual investigação o militar será ouvido\t\t\t\t', 'author': 'Redação BandNews', 'publication_date': datetime.datetime(2024, 12, 5, 0, 0), 'content': 'O tenente-coronel Mauro Cid, que foi ajudante de ordens do ex-presidente Jair Bolsonaro, vai prestar um novo depoimento à Polícia Federal nesta quinta-feira, às três horas da tarde. A intimação não especifica sobre qual investigação o militar será ouvido.\nTanto Mauro Cid quanto Bolsonaro foram indiciados pela Polícia Federal por suposta tentativa de golpe de Estado. Mauro Cid chegou a ser preso, mas foi solto depois de fechar um a