In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
from news_summarizer.domain.base.nosql import _database

# Handle to the "link" collection; later cells in this notebook reuse it.
collection_link = _database.get_collection("link")
# Materialise every link document whose source is the G1 home page.
indexes = list(collection_link.find({"source": "https://g1.globo.com/"}))

INFO:news_summarizer.database.mongo:Successfully connected to Mongo at: localhost


In [6]:
def search_garbage_links(collection):
    """Lazily yield the documents of ``collection`` that look like junk links.

    A document is selected when it has no ``published_at`` field, or when its
    ``url`` does not match the case-insensitive pattern ``noticia|news``.

    Args:
        collection: Object exposing ``find(filter)`` that returns an iterable
            of documents (a MongoDB collection in this notebook).

    Yields:
        Every matching document, in the order the cursor produces them. The
        query is only issued once iteration starts.
    """
    lacks_publication_date = {"published_at": {"$exists": False}}
    url_is_not_news = {"url": {"$not": {"$regex": "noticia|news", "$options": "i"}}}
    yield from collection.find({"$or": [lacks_publication_date, url_is_not_news]})

def drop_garbage_links(collection, garbage):
    """Delete from ``collection`` every document listed in ``garbage``.

    Args:
        collection: Object exposing ``delete_one(filter)``.
        garbage: Iterable of documents; only their ``"_id"`` value is used.

    Documents are removed one at a time, in iteration order, each with its
    own ``delete_one`` call.
    """
    for doc_id in (entry["_id"] for entry in garbage):
        collection.delete_one({"_id": doc_id})

# Destructive: deletes from the "link" collection every document matched by
# search_garbage_links.
garbage = search_garbage_links(collection=collection_link)
drop_garbage_links(collection=collection_link, garbage=garbage)

In [4]:
# Load every document of the "article" collection into memory (no filter).
collection_article = _database.get_collection("article")
indexes = list(collection_article.find())

In [None]:
def search_duplicates(collection, group_by=None):
    """Find groups of documents that share the same value for ``group_by``.

    Args:
        collection: Object exposing ``aggregate(pipeline, allowDiskUse=...)``
            (a MongoDB collection in this notebook).
        group_by: Name of the field to group on. Required despite the
            ``None`` default, which is kept for signature compatibility.

    Returns:
        A generator of aggregation results, one per value that occurs more
        than once. Each result has ``_id`` (the shared value), ``count`` and
        ``ids`` (the ``_id`` of every document in the group). The aggregation
        is only issued once iteration starts.

    Raises:
        ValueError: If ``group_by`` is empty or ``None``. Without this check
            the pipeline would group on the literal field path ``"$None"``,
            put every document in one bucket and report the whole collection
            as duplicates.
    """
    if not group_by:
        raise ValueError(
            "group_by must name the field to deduplicate on; grouping on a "
            "missing field would report every document as a duplicate"
        )

    pipeline = [
        {
            "$group": {
                "_id": f"${group_by}",
                "count": {"$sum": 1},
                "ids": {"$push": "$_id"},
            }
        },
        {"$match": {"count": {"$gt": 1}}},
    ]

    def _iter_groups():
        # Kept lazy so the validation above runs at call time while the query
        # itself is deferred until the caller iterates.
        yield from collection.aggregate(pipeline, allowDiskUse=True)

    return _iter_groups()


def drop_duplicates(collection, duplicates_iterator, sort_key=None, descending=True):
    """Delete all but one document from every duplicate group.

    Args:
        collection: Object exposing ``find(filter)`` and ``delete_many(filter)``.
        duplicates_iterator: Iterable of groups, each a mapping with an
            ``"ids"`` list (the shape produced by ``search_duplicates``).
        sort_key: Optional field used to choose the survivor. Documents are
            ordered by this field and the first one is kept.
        descending: Sort direction for ``sort_key``; ``True`` keeps the
            document with the largest value.

    Returns:
        The result of a single ``delete_many`` call covering every group, or
        ``None`` when there was nothing to delete.

    Every group is processed; an early ``return`` inside the loop used to stop
    after the first one. Documents missing ``sort_key`` (or holding ``None``)
    are ordered last, so they are never preferred over a document that has the
    key and never compared against real values.
    """
    ids_to_remove = []

    for duplicate in duplicates_iterator:
        ids = duplicate["ids"]

        # Fetch full documents for all IDs in this group
        docs = list(collection.find({"_id": {"$in": ids}}))
        if not docs:
            print(f"No documents found for IDs: {ids}")
            continue

        if sort_key:
            with_key = [doc for doc in docs if doc.get(sort_key) is not None]
            without_key = [doc for doc in docs if doc.get(sort_key) is None]
            with_key.sort(key=lambda doc: doc[sort_key], reverse=descending)
            docs = with_key + without_key

        # The first document survives; the rest are queued for deletion.
        ids_to_remove.extend(doc["_id"] for doc in docs[1:])

    if not ids_to_remove:
        return None

    # One delete for all groups keeps the single-result return type.
    return collection.delete_many({"_id": {"$in": ids_to_remove}})

# Look for links sharing the same URL and drop the redundant copies, ordering
# each group by extracted_at (descending by default, so the newest is kept).
duplicates = search_duplicates(collection=collection_link, group_by="url")
drop_duplicates(
    collection=collection_link,
    duplicates_iterator=duplicates,
    sort_key="extracted_at",
)

DeleteResult({'n': 2, 'ok': 1.0}, acknowledged=True)

In [None]:
from datetime import datetime
from typing import Optional
from news_summarizer.domain.base.nosql import NoSQLBaseLink
from news_summarizer.scraper.base import BaseSeleniumScraper

from pydantic import AnyUrl, Field, field_serializer

class Link(NoSQLBaseLink):
    """Document model describing a scraped link.

    ``title`` and ``url`` are required. ``source`` and ``published_at``
    default to ``None``; ``extracted_at`` defaults to the moment the instance
    is created.
    """

    title: str = Field(..., description="The title of the link")
    url: AnyUrl = Field(description="The URL of the link")
    source: Optional[str] = Field(None, description="The source of the link")
    published_at: Optional[datetime] = Field(None, description="The publication date of the link")
    # datetime.now() produces a naive timestamp (no tzinfo) in local time.
    extracted_at: datetime = Field(
        default_factory=datetime.now,
        description="The timestamp when the link was extracted",
    )

    # Serialise the AnyUrl object as a plain string.
    @field_serializer("url")
    def url_string(self, url: AnyUrl):
        return str(url)

    # NOTE(review): presumably read by NoSQLBaseLink to pick the collection
    # name — confirm against the base class.
    class Settings:
        name = "link"

from datetime import datetime

class Article(NoSQLBaseLink):
    """Document model describing a scraped article.

    Every field except ``publication_date`` is required.
    NOTE(review): several field descriptions still say "of the link"; they
    look copied from ``Link`` and describe article fields here.
    """

    title: str = Field(..., description="The title of the link")
    subtitle: str = Field(..., description="The subtitle of the link")
    author: str = Field(...,  description="The author")
    publication_date: Optional[datetime] = Field(None, description="The publication date of the link")
    content: str = Field(..., description="Content")
    url: AnyUrl = Field(description="The URL of the link")

    # Serialise the AnyUrl object as a plain string.
    @field_serializer("url")
    def url_string(self, url: AnyUrl):
        return str(url)

    # NOTE(review): presumably read by NoSQLBaseLink to pick the collection
    # name — confirm against the base class.
    class Settings:
        name = "article"

In [None]:
from bs4 import BeautifulSoup

class G1Scraper(BaseSeleniumScraper):
    """Scrape a G1 article page and store it as an ``Article``.

    The selectors target G1 article markup (for example
    ``h1.content-head__title``). ``self.driver`` is expected to come from
    ``BaseSeleniumScraper``, which is not visible here — TODO confirm.
    """

    model = Article

    def __init__(self) -> None:
        super().__init__()

    @staticmethod
    def _require(element, description: str):
        """Return ``element``, or raise ``ValueError`` naming what is missing."""
        if element is None:
            raise ValueError(f"Could not find {description} in the scraped page")
        return element

    def extract(self, article_link: str, **kwargs) -> None:
        """Load ``article_link``, parse the article fields and save them.

        Args:
            article_link: URL of the article to scrape.
            **kwargs: Accepted but not used.

        Raises:
            ValueError: If the parsed page is empty or an expected element is
                missing. Missing elements used to surface as an
                ``AttributeError``/``TypeError`` on ``None``.
        """
        self.driver.get(article_link)
        # Scroll to the bottom of the page, presumably to trigger lazily
        # loaded content — TODO confirm.
        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        self.soup = BeautifulSoup(self.driver.page_source, "html.parser")
        if len(self.soup) == 0:
            # Format the message here: exceptions do not interpolate
            # logging-style "%s" arguments.
            raise ValueError(f"No elements scraped from link {article_link}")

        title = self._extract_title(self.soup)
        author = self._extract_author(self.soup)
        subtitle = self._extract_subtitle(self.soup)
        content = self._extract_content(self.soup)
        publication_date = self._extract_publication_date(self.soup)

        instance = self.model(
            title=title,
            subtitle=subtitle,
            author=author,
            content=content,
            publication_date=publication_date,
            url=article_link,
        )

        instance.save()

    def _extract_title(self, soup: BeautifulSoup):
        """Return the text of ``h1.content-head__title``."""
        element = soup.find("h1", class_="content-head__title")
        return self._require(element, "the title (h1.content-head__title)").text

    def _extract_author(self, soup: BeautifulSoup):
        """Return the text of the first ``a.multi_signatures`` element."""
        element = soup.find("a", class_="multi_signatures")
        return self._require(element, "the author (a.multi_signatures)").text

    def _extract_subtitle(self, soup: BeautifulSoup):
        """Return the text of ``h2.content-head__subtitle``."""
        element = soup.find("h2", class_="content-head__subtitle")
        return self._require(element, "the subtitle (h2.content-head__subtitle)").text

    def _extract_content(self, soup: BeautifulSoup):
        """Join the text of every non-empty ``<p>`` under ``div.mc-article-body``."""
        paragraphs = soup.select("div.mc-article-body p")
        # len(tag) counts child nodes, so this skips paragraphs with no children.
        return "\n".join(p.text for p in paragraphs if len(p) > 0)

    def _extract_publication_date(self, soup: BeautifulSoup):
        """Return the ``datetime`` attribute of the ``datePublished`` ``<time>`` tag."""
        element = soup.find("time", itemprop="datePublished")
        return self._require(element, "the publication date (time[itemprop=datePublished])")["datetime"]


class R7Scraper(BaseSeleniumScraper):
    """Scrape an R7 article page and store it as an ``Article``.

    Elements are located by exact utility-class strings, so the selectors
    break whenever the site changes its class lists. ``self.driver`` is
    expected to come from ``BaseSeleniumScraper``, which is not visible
    here — TODO confirm.
    """

    model = Article

    def __init__(self) -> None:
        super().__init__()

    @staticmethod
    def _require(element, description: str):
        """Return ``element``, or raise ``ValueError`` naming what is missing."""
        if element is None:
            raise ValueError(f"Could not find {description} in the scraped page")
        return element

    def extract(self, article_link: str, **kwargs) -> None:
        """Load ``article_link``, parse the article fields and save them.

        Args:
            article_link: URL of the article to scrape.
            **kwargs: Accepted but not used.

        Raises:
            ValueError: If the parsed page is empty or an expected element is
                missing. Missing elements used to surface as an
                ``AttributeError``/``TypeError`` on ``None``.
        """
        self.driver.get(article_link)
        # Scroll to the bottom of the page, presumably to trigger lazily
        # loaded content — TODO confirm.
        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        self.soup = BeautifulSoup(self.driver.page_source, "html.parser")
        if len(self.soup) == 0:
            # Format the message here: exceptions do not interpolate
            # logging-style "%s" arguments.
            raise ValueError(f"No elements scraped from link {article_link}")

        title = self._extract_title(self.soup)
        author = self._extract_author(self.soup)
        subtitle = self._extract_subtitle(self.soup)
        content = self._extract_content(self.soup)
        publication_date = self._extract_publication_date(self.soup)

        instance = self.model(
            title=title,
            subtitle=subtitle,
            author=author,
            content=content,
            publication_date=publication_date,
            url=article_link,
        )

        instance.save()

    def _extract_title(self, soup: BeautifulSoup):
        """Return the text of the headline ``<h1>``."""
        element = soup.find(
            "h1",
            class_="base-font-primary dark:base-text-neutral-high-400 base-mb-xxxs base-text-xl base-font-semibold base-leading-xxl lg:base-leading-giant lg:base-text-xxl base-text-neutral-low-500",
        )
        return self._require(element, "the title <h1>").text

    def _extract_author(self, soup: BeautifulSoup):
        """Return the text of the ``<span>`` that follows the byline marker span."""
        first_span = soup.find(
            "span",
            class_="article-text-editorial-color article-ml-quark article-mr-quark dark:!article-text-neutral-high-400",
        )
        first_span = self._require(first_span, "the byline marker <span>")
        author_span = self._require(first_span.find_next("span"), "the author <span>")
        return author_span.text

    def _extract_subtitle(self, soup: BeautifulSoup):
        """Return the text of the subtitle ``<h2>``."""
        element = soup.find(
            "h2",
            class_="base-font-primary dark:base-text-neutral-high-400 base-text-xxs base-font-bold base-leading-md sm:base-text-md sm:base-font-medium sm:base-leading-lg base-text-neutral-low-500",
        )
        return self._require(element, "the subtitle <h2>").text

    def _extract_content(self, soup: BeautifulSoup):
        """Join the text of the ``span.b-article-body__text`` elements.

        ``find_all_next`` collects every matching span that appears after the
        ``article.b-article-body`` tag in document order, not only its
        descendants.
        """
        article = self._require(
            soup.find("article", class_="b-article-body"),
            "the article body (article.b-article-body)",
        )
        paragraphs = article.find_all_next("span", class_="b-article-body__text")
        # len(tag) counts child nodes, so this skips spans with no children.
        return "\n".join(p.text for p in paragraphs if len(p) > 0)

    def _extract_publication_date(self, soup: BeautifulSoup):
        """Return the modification date when present, else the publication date.

        The old ``is None`` test ran after subscripting the tag, so a page
        without a ``dateModified`` tag raised ``TypeError`` instead of falling
        back. The tags are now checked before their attribute is read.
        """
        modified = soup.find("time", itemprop="dateModified")
        if modified is not None and modified.get("datetime") is not None:
            return modified["datetime"]

        published = soup.find("time", itemprop="datePublished")
        return self._require(published, "the publication date (time[itemprop=datePublished])")["datetime"]
    

class BandScraper(BaseSeleniumScraper):
    """Scrape a Band News article page and store it as an ``Article``.

    The selectors target ``cs-*`` classes such as ``h1.cs-entry__title``.
    ``self.driver`` is expected to come from ``BaseSeleniumScraper``, which is
    not visible here — TODO confirm.
    """

    model = Article

    def __init__(self) -> None:
        super().__init__()

    @staticmethod
    def _require(element, description: str):
        """Return ``element``, or raise ``ValueError`` naming what is missing."""
        if element is None:
            raise ValueError(f"Could not find {description} in the scraped page")
        return element

    def extract(self, article_link: str, **kwargs) -> None:
        """Load ``article_link``, parse the article fields and save them.

        Args:
            article_link: URL of the article to scrape.
            **kwargs: Accepted but not used.

        Raises:
            ValueError: If the parsed page is empty, an expected element is
                missing, or the date text does not match the expected format.
                Missing elements used to surface as an ``AttributeError`` on
                ``None``.
        """
        self.driver.get(article_link)
        # Scroll to the bottom of the page, presumably to trigger lazily
        # loaded content — TODO confirm.
        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        self.soup = BeautifulSoup(self.driver.page_source, "html.parser")
        if len(self.soup) == 0:
            # Format the message here: exceptions do not interpolate
            # logging-style "%s" arguments.
            raise ValueError(f"No elements scraped from link {article_link}")

        title = self._extract_title(self.soup)
        author = self._extract_author(self.soup)
        subtitle = self._extract_subtitle(self.soup)
        content = self._extract_content(self.soup)
        publication_date = self._extract_publication_date(self.soup)

        instance = self.model(
            title=title,
            subtitle=subtitle,
            author=author,
            content=content,
            publication_date=publication_date,
            url=article_link,
        )

        instance.save()

    def _extract_title(self, soup: BeautifulSoup):
        """Return the text of the first ``<span>`` inside ``h1.cs-entry__title``."""
        h1_element = self._require(
            soup.find("h1", class_="cs-entry__title"),
            "the title (h1.cs-entry__title)",
        )
        return self._require(h1_element.find('span'), "the title <span>").text

    def _extract_author(self, soup: BeautifulSoup):
        """Return the text of ``span.cs-meta-author-name``."""
        element = soup.find("span", class_="cs-meta-author-name")
        return self._require(element, "the author (span.cs-meta-author-name)").text

    def _extract_subtitle(self, soup: BeautifulSoup):
        """Return the text of ``div.cs-entry__subtitle``."""
        element = soup.find("div", class_="cs-entry__subtitle")
        return self._require(element, "the subtitle (div.cs-entry__subtitle)").text

    def _extract_content(self, soup: BeautifulSoup):
        """Join the text of every non-empty ``<p>`` in ``div.cs-entry__content-wrap``."""
        article = self._require(
            soup.find("div", class_="cs-entry__content-wrap"),
            "the article body (div.cs-entry__content-wrap)",
        )
        paragraphs = article.find_all("p")
        # len(tag) counts child nodes, so this skips paragraphs with no children.
        return "\n".join(p.text for p in paragraphs if len(p) > 0)

    def _translate_months(self, date_str: str):
        """Replace Portuguese month names in ``date_str`` with English ones.

        Matching is case-insensitive, so "Dezembro" and "dezembro" both become
        "December". All other text is returned unchanged.
        """
        import re  # local import keeps this class independent of other cells

        months_dict = {
            "janeiro": "January", "fevereiro": "February", "março": "March",
            "abril": "April", "maio": "May", "junho": "June",
            "julho": "July", "agosto": "August", "setembro": "September",
            "outubro": "October", "novembro": "November", "dezembro": "December"
        }

        # Translate the month name to English
        for month_pt, month_en in months_dict.items():
            date_str = re.sub(re.escape(month_pt), month_en, date_str, flags=re.IGNORECASE)

        return date_str

    def _extract_publication_date(self, soup: BeautifulSoup):
        """Parse the text of ``div.cs-meta-date`` into a ``datetime``.

        The text is stripped first because surrounding whitespace makes
        ``strptime`` reject an otherwise valid date. After month translation
        it must match ``"%B %d, %Y"``.
        """
        date_element = self._require(
            soup.find("div", class_='cs-meta-date'),
            "the publication date (div.cs-meta-date)",
        )
        date_string = self._translate_months(date_element.text.strip())
        date_format = "%B %d, %Y"
        return datetime.strptime(date_string, date_format)

In [None]:
from news_summarizer.scraper import scraper_registry
from news_summarizer.scraper import ScraperExecutor

In [None]:
# Plain-string URLs of every stored Link (bulk_find is called with no filters).
links = [str(link.url) for link in Link.bulk_find()]

In [None]:
# Executor built from the project's scraper registry.
# NOTE(review): the ThreadPoolExecutor demo cell further down rebinds the name
# `executor`, so re-running cells out of order changes what this refers to.
executor = ScraperExecutor(scraper_registry)

In [None]:
# Scrape only the first 20 links; the value returned by run() is shown as the
# cell output.
executor.run(links[:20])

INFO:news_summarizer.scraper.executor:Semaphore acquired. Remaining slots: 1
INFO:news_summarizer.scraper.executor:Semaphore acquired. Remaining slots: 0
INFO:news_summarizer.scraper.executor:Starting scraper for link: https://g1.globo.com/mundo/noticia/2024/12/07/rebeldes-na-siria-avancam-cerco-para-20-km-de-damasco-diz-comandante-rebelde.ghtml
INFO:news_summarizer.scraper.executor:Starting scraper for link: https://g1.globo.com/mundo/noticia/2024/12/07/quem-controla-o-que-na-siria-com-o-avanco-repentino-dos-rebeldes.ghtml
INFO:WDM:Get LATEST edgedriver version for Edge 131.0.2903
INFO:WDM:Get LATEST edgedriver version for Edge 131.0.2903
INFO:WDM:Get LATEST edgedriver version for Edge 131.0.2903
INFO:WDM:Get LATEST edgedriver version for Edge 131.0.2903
INFO:WDM:Driver [/home/llya/.wdm/drivers/edgedriver/linux64/131.0.2903.87/msedgedriver] found in cache
INFO:WDM:Driver [/home/llya/.wdm/drivers/edgedriver/linux64/131.0.2903.87/msedgedriver] found in cache
INFO:news_summarizer.scraper

{'https://g1.globo.com/mundo/noticia/2024/12/07/rebeldes-na-siria-avancam-cerco-para-20-km-de-damasco-diz-comandante-rebelde.ghtml': True,
 'https://g1.globo.com/mundo/noticia/2024/12/07/quem-controla-o-que-na-siria-com-o-avanco-repentino-dos-rebeldes.ghtml': True,
 'https://g1.globo.com/carros/noticia/2024/12/07/renault-kwid-fica-mais-caro-e-apenas-4-carros-estao-abaixo-dos-r-80-mil-no-brasil-confira-a-lista.ghtml': True,
 'https://g1.globo.com/mundo/noticia/2024/12/07/papa-francisco-nomeia-21-novos-cardeais-da-igreja-catolica-dom-jaime-spengler-esta-entre-eles.ghtml': True,
 'https://g1.globo.com/mundo/noticia/2024/12/07/cerco-a-damasco-veja-a-cronologia-do-avanco-rebelde-na-siria.ghtml': True,
 'https://g1.globo.com/sp/sao-paulo/noticia/2024/12/07/sp-segue-com-menor-taxa-de-homicidios-do-pais-mas-piora-nos-rankings-nacionais-de-letalidade-policial-estupro-feminicidio-e-latrocinio.ghtml': True,
 'https://g1.globo.com/mundo/noticia/2024/12/07/trump-fara-reuniao-com-macron-e-zelenski-e

In [None]:
# Reload every document of the "article" collection after the scraping run.
collection_article = _database.get_collection("article")
indexes = list(collection_article.find())

In [None]:
# Displays the whole list of article documents held in `indexes`.
indexes

[{'_id': '875fca33-bc9b-4c95-9a47-2d39a9da5565',
  'title': 'Rebeldes, forças do governo, curdos e facções turcas: os grupos que disputam territórios na Síria',
  'subtitle': 'O país vive uma série de crises desde o início da guerra de 2011, quando opositores ao regime de Bashar al-Assad iniciaram protestos por democracia. Entenda como o país está dividido.',
  'author': 'Redação g1',
  'publication_date': datetime.datetime(2024, 12, 7, 15, 30, 57, 9000),
  'content': ' Rebeldes sírios avançam contra o regime de Bashar al-Assad — Foto: Omar Haj Kadour/ AFP \n Grupos rebeldes dominados por islamistas radicais do noroeste da Síria assumiram o controle de dezenas de localidades e duas das principais cidades do país em 10 dias, em uma ofensiva relâmpago que enfraqueceu ainda mais o regime de Bashar al-Assad. \n O avanço, facilitado pela retirada do Exército sírio em várias regiões, marca um ponto de virada na guerra da Síria, que começou em 2011 com a repressão de protestos pró-democracia.

In [None]:
sdfsdf

NameError: name 'sdfsdf' is not defined

In [None]:
sdfsfsdf  1:47

SyntaxError: invalid syntax (4280445439.py, line 1)

In [None]:

# SuperFastPython.com
# example of using a semaphore
from time import sleep
from random import random
from threading import Thread
from threading import Semaphore
 
# target function
def task(semaphore, number):
    """Sleep for a random fraction of a second while holding ``semaphore``.

    Args:
        semaphore: Semaphore acquired before the work and released afterwards,
            even if the work raises.
        number: Identifier included in the printed report.
    """
    semaphore.acquire()
    try:
        delay = random()
        sleep(delay)
        print(f'Thread {number} got {delay}')
    finally:
        semaphore.release()
 
# create a semaphore that lets at most 4 threads run the guarded section at once
semaphore = Semaphore(4)
# create and start a suite of threads
workers = [Thread(target=task, args=(semaphore, i)) for i in range(10)]
for worker in workers:
    worker.start()
# wait for all workers to complete (nothing joined the threads before)
for worker in workers:
    worker.join()

Thread 1 got 0.26979132276078077
Thread 4 got 0.2680321303617894
Thread 0 got 0.7349327955542244
Thread 3 got 0.773502219618814
Thread 2 got 0.8610218647067934
Thread 5 got 0.6249579095979747
Thread 8 got 0.36658455655124955
Thread 9 got 0.1666516001492402
Thread 6 got 0.787973507098034
Thread 7 got 0.7805049959260377


In [None]:
# SuperFastPython.com
# example of bounding the number of tasks submitted to the thread pool
from time import sleep
from random import random
from threading import Semaphore
from concurrent.futures import ThreadPoolExecutor
 
# mock task that sleeps for a moment
def work(identifier):
    """Mock task: sleep for a random fraction of a second, then report.

    Args:
        identifier: Value included in the printed "Done" line.

    Returns:
        Always ``True``.
    """
    pause = random()
    sleep(pause)
    print(f'Done: {identifier}')
    return True
 
# callback for completed tasks
def task_complete_callback(future):
    """Done-callback that frees one slot of the module-level ``semaphore``.

    Args:
        future: The completed future; it is not inspected.
    """
    # Reading a module-level name needs no ``global`` declaration.
    semaphore.release()
 
# proxy for submitting tasks that imposes a limit on the queue size
def submit_proxy(function, *args, **kwargs):
    """Submit ``function`` to the module-level ``executor`` with a bounded backlog.

    A slot of the module-level ``semaphore`` is acquired before submitting,
    blocking while none is free, and released by ``task_complete_callback``
    once the task is done.

    Args:
        function: Callable to run in the pool.
        *args: Positional arguments forwarded to ``function``.
        **kwargs: Keyword arguments forwarded to ``function``.

    Returns:
        The future returned by ``executor.submit``.

    Raises:
        Whatever ``executor.submit`` raises. The acquired slot is given back
        first, so a failed submission no longer leaks a permit.
    """
    print('registrando nova tarefa')
    # acquire the semaphore, blocks if occupied
    semaphore.acquire()
    try:
        # submit the task normally
        future = executor.submit(function, *args, **kwargs)
    except BaseException:
        # No task was queued, so no done-callback will ever release this slot.
        semaphore.release()
        raise
    # add the custom done callback
    future.add_done_callback(task_complete_callback)
    return future
 
# number of worker threads in the pool
n_workers = 2
# maximum number of tasks submitted but not yet finished (semaphore slots)
n_queue = 10
# semaphore that bounds the number of outstanding tasks
semaphore = Semaphore(n_queue)
# create the thread pool
# NOTE(review): this rebinds `executor`, which an earlier cell assigned to a
# ScraperExecutor instance.
with ThreadPoolExecutor(n_workers) as executor:
    # submit many tasks; submit_proxy blocks while every semaphore slot is taken
    futures = [submit_proxy(work, i) for i in range(50)]
    # leaving the `with` block waits for the remaining tasks to finish
    print('All tasks are submitted, waiting...')

registrando nova tarefa
registrando nova tarefa
registrando nova tarefa
registrando nova tarefa
registrando nova tarefa
registrando nova tarefa
registrando nova tarefa
registrando nova tarefa
registrando nova tarefa
registrando nova tarefa
registrando nova tarefa
Done: 1
registrando nova tarefa
Done: 2
registrando nova tarefa
Done: 0
registrando nova tarefa
Done: 3
registrando nova tarefa
Done: 5
registrando nova tarefa
Done: 4
registrando nova tarefa
Done: 6
registrando nova tarefa
Done: 7
registrando nova tarefa
Done: 8
registrando nova tarefa
Done: 9
registrando nova tarefa
Done: 11
registrando nova tarefa
Done: 10
registrando nova tarefa
Done: 12
registrando nova tarefa
Done: 14
registrando nova tarefa
Done: 13
registrando nova tarefa
Done: 15
registrando nova tarefa
Done: 16
registrando nova tarefa
Done: 18
registrando nova tarefa
Done: 17
registrando nova tarefa
Done: 20
registrando nova tarefa
Done: 21
registrando nova tarefa
Done: 19
registrando nova tarefa
Done: 23
registrando

In [None]:
from concurrent.futures import as_completed

In [None]:
# Print the result of every future in `futures`, in completion order.
for future in as_completed(futures):
    print(future.result())

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


In [None]:
""" new_scraper = scraper_registry.get('https://noticias.r7.com/brasilia/comissao-do-senado-sabatina-indicados-de-lula-ao-bc-na-proxima-terca-feira-05122024/') """

" new_scraper = scraper_registry.get('https://noticias.r7.com/brasilia/comissao-do-senado-sabatina-indicados-de-lula-ao-bc-na-proxima-terca-feira-05122024/') "

In [None]:
""" new_scraper.extract('https://noticias.r7.com/brasilia/comissao-do-senado-sabatina-indicados-de-lula-ao-bc-na-proxima-terca-feira-05122024/') """

" new_scraper.extract('https://noticias.r7.com/brasilia/comissao-do-senado-sabatina-indicados-de-lula-ao-bc-na-proxima-terca-feira-05122024/') "

In [None]:
""" scraper = R7Scraper()
scraper.extract(article_link='https://noticias.r7.com/brasilia/comissao-do-senado-sabatina-indicados-de-lula-ao-bc-na-proxima-terca-feira-05122024/') """

""" scraper = G1Scraper()
scraper.extract(article_link="https://g1.globo.com/saude/noticia/2024/12/04/o-que-acontece-com-o-corpo-quando-a-vitamina-d-esta-baixa.ghtml") """

""" scraper = BandScraper()
scraper.extract(article_link='https://bandnewstv.uol.com.br/2024/12/05/mauro-cid-vai-prestar-novo-depoimento-a-policia-federal-nesta-quinta-feira/') """

" scraper = BandScraper()\nscraper.extract(article_link='https://bandnewstv.uol.com.br/2024/12/05/mauro-cid-vai-prestar-novo-depoimento-a-policia-federal-nesta-quinta-feira/') "