In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import re
import time

import numpy as np
from bs4 import BeautifulSoup
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from news_summarizer.webdriver import WebDriverFactory, ShutilBrowserLocator


In [3]:
from abc import ABC, abstractmethod


class BaseCrawler(ABC):
    """Abstract interface shared by every crawler in this notebook."""

    @abstractmethod
    def search(self, link: str, **kwargs) -> None:
        """Crawl the page at ``link``.

        Concrete crawlers must override this method; the base implementation
        only signals that no behaviour is provided.

        Args:
            link: URL of the page to crawl.
            **kwargs: Crawler-specific options.

        Raises:
            NotImplementedError: Always, when reached through ``super()``.
        """
        raise NotImplementedError()


class BaseSeleniumCrawler(BaseCrawler, ABC):
    """Common state for crawlers that drive a browser through Selenium.

    Attributes set here:
        driver: webdriver obtained from ``WebDriverFactory``.
        scroll_limit: value passed by the caller, stored for subclasses.
        soup: placeholder for the parsed page; ``None`` until a crawl fills it.
    """

    def __init__(self, scroll_limit: int = 5) -> None:
        """Create the webdriver and initialise the shared attributes.

        Args:
            scroll_limit: Scroll cap stored on the instance for subclasses.
        """
        browser_locator = ShutilBrowserLocator()
        driver_factory = WebDriverFactory(browser_locator)
        self.driver = driver_factory.get_webdriver()
        self.scroll_limit = scroll_limit
        self.soup = None


class G1Crawler(BaseSeleniumCrawler):
    """Selenium crawler for G1 pages that paginate through a "load more" link.

    After ``search`` returns, ``soup`` holds the parsed page and ``links``
    holds every ``<a>`` tag that carries an ``href``. The browser is shut down
    at the end of ``search``, so one instance serves a single crawl.

    Note: pagination depth is governed by ``_MAX_PAGE_LOADS``; the stored
    ``scroll_limit`` is not consulted by this crawler.
    """

    # Number of page-number advances to observe before scrolling stops.
    _MAX_PAGE_LOADS = 6
    # Consecutive attempts without progress tolerated before giving up, so a
    # page whose "load more" link never shows up cannot hang the crawl.
    _MAX_FAILED_ATTEMPTS = 5

    def __init__(self, scroll_limit: int = 5) -> None:
        """Create the crawler.

        Args:
            scroll_limit: Forwarded to the base class and stored on the instance.
        """
        super().__init__(scroll_limit=scroll_limit)
        self.links = None

    def scroll_page(self) -> None:
        """Scroll down and follow the "load more" link to reveal more items.

        Each round scrolls to the bottom, pauses 2-4 seconds, waits up to 10
        seconds for the link to be clickable and reads the page number from
        its ``href``. The loop ends once ``_MAX_PAGE_LOADS`` advances of that
        number were seen, or after ``_MAX_FAILED_ATTEMPTS`` consecutive rounds
        that made no progress (link missing, not clickable, or page number
        absent / not advancing).
        """
        pages_seen = 0
        last_page_number = 0
        failed_attempts = 0

        while failed_attempts < self._MAX_FAILED_ATTEMPTS:
            self.driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);"
            )
            time.sleep(np.random.randint(2, 5))
            try:
                load_more_link = WebDriverWait(self.driver, 10).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, "div.load-more a"))
                )

                url = load_more_link.get_dom_attribute("href")
                page_number = self._extract_page_number(url)

                # A missing page number must not be compared with an int.
                if page_number is not None and page_number > last_page_number:
                    pages_seen += 1
                    last_page_number = page_number
                    failed_attempts = 0

                    if pages_seen >= self._MAX_PAGE_LOADS:
                        break
                else:
                    failed_attempts += 1
                load_more_link.click()
            except WebDriverException:
                # Covers wait timeouts as well as stale / intercepted clicks;
                # unrelated programming errors are allowed to propagate.
                failed_attempts += 1
                print("see more link not found yet, scrolling more...")

    def _extract_page_number(self, url):
        """Return the integer following ``pagina-`` in ``url``.

        Args:
            url: The link target; may be ``None`` or empty when the element
                has no ``href``.

        Returns:
            The page number as ``int``, or ``None`` when ``url`` is empty or
            does not contain a ``pagina-<digits>`` segment.
        """
        if not url:
            return None
        match = re.search(r"pagina-(\d+)", url)
        return int(match.group(1)) if match else None

    def search(self, link: str, **kwargs) -> None:
        """Open ``link``, expand the listing and collect the page's anchors.

        Args:
            link: URL to crawl.
            **kwargs: Accepted for interface compatibility; unused.
        """
        try:
            self.driver.get(link)
            time.sleep(5)
            self.scroll_page()
            self.soup = BeautifulSoup(self.driver.page_source, "html.parser")
            self.links = self.soup.find_all("a", href=True)
        finally:
            # quit() also stops the driver process and runs even when the
            # crawl fails, so no browser is left behind.
            self.driver.quit()


class BandCrawler(BaseSeleniumCrawler):
    """Selenium crawler for Band News pages that expose a "load more" button.

    After ``search`` returns, ``soup`` holds the parsed page and ``links``
    holds every ``<a>`` tag that carries an ``href``. The browser is shut down
    at the end of ``search``, so one instance serves a single crawl.

    Note: the number of loads is governed by ``_MAX_LOADS``; the stored
    ``scroll_limit`` is not consulted by this crawler.
    """

    # Number of observed page-height increases before scrolling stops.
    _MAX_LOADS = 6
    # Consecutive attempts without progress tolerated before giving up, so a
    # page whose button never shows up cannot hang the crawl.
    _MAX_FAILED_ATTEMPTS = 5

    def __init__(self, scroll_limit: int = 5) -> None:
        """Create the crawler.

        Args:
            scroll_limit: Forwarded to the base class and stored on the instance.
        """
        super().__init__(scroll_limit=scroll_limit)
        self.links = None

    def scroll_page(self) -> None:
        """Scroll down and click the "load more" button to reveal more items.

        Each round scrolls to the bottom, pauses 2-4 seconds, waits up to 10
        seconds for the button to be clickable, clicks it and re-reads the
        document height. The loop ends once ``_MAX_LOADS`` height increases
        were seen, or after ``_MAX_FAILED_ATTEMPTS`` consecutive rounds without
        progress (button missing / not clickable, or height unchanged).
        """
        load_more_count = 0
        failed_attempts = 0

        last_height = self.driver.execute_script("return document.body.scrollHeight")
        while failed_attempts < self._MAX_FAILED_ATTEMPTS:
            self.driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);"
            )
            time.sleep(np.random.randint(2, 5))

            try:
                load_more_link = WebDriverWait(self.driver, 10).until(
                    EC.element_to_be_clickable(
                        (By.CSS_SELECTOR, "div.jeg_block_loadmore a")
                    )
                )
                load_more_link.click()

                # Height right after the click; growth caused by this click
                # may only become visible on a later round.
                new_height = self.driver.execute_script(
                    "return document.body.scrollHeight"
                )
            except WebDriverException:
                # Covers wait timeouts as well as stale / intercepted clicks;
                # unrelated programming errors are allowed to propagate.
                failed_attempts += 1
                print("Veja mais link not found yet, scrolling more...")
                continue

            if new_height > last_height:
                load_more_count += 1
                last_height = new_height
                failed_attempts = 0
                if load_more_count >= self._MAX_LOADS:
                    break
            else:
                failed_attempts += 1

    def search(self, link: str, **kwargs) -> None:
        """Open ``link``, expand the listing and collect the page's anchors.

        Args:
            link: URL to crawl.
            **kwargs: Accepted for interface compatibility; unused.
        """
        try:
            self.driver.get(link)
            time.sleep(5)
            self.scroll_page()
            self.soup = BeautifulSoup(self.driver.page_source, "html.parser")
            self.links = self.soup.find_all("a", href=True)
        finally:
            # Release the browser and its driver process even when the crawl
            # fails; previously the session was never shut down.
            self.driver.quit()

class R7Crawler(BaseSeleniumCrawler):
    """Selenium crawler for R7 pages that grow as the user scrolls.

    After ``search`` returns, ``soup`` holds the parsed page and ``links``
    holds every ``<a>`` tag that carries an ``href``. The browser is shut down
    at the end of ``search``, so one instance serves a single crawl.
    """

    def __init__(self, scroll_limit: int = 5) -> None:
        """Create the crawler.

        Args:
            scroll_limit: Cap consulted by ``scroll_page``; a falsy value
                (``0`` / ``None``) disables the cap.
        """
        super().__init__(scroll_limit=scroll_limit)
        self.links = None

    def scroll_page(self) -> None:
        """Scroll to the bottom repeatedly so lazily loaded content appears.

        After every scroll the method waits 5 seconds and re-reads the
        document height. It stops when the height no longer changes, or when
        ``scroll_limit`` is truthy and the scroll counter has reached it.
        """
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        current_scroll = 0
        while True:
            self.driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);"
            )
            time.sleep(5)
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height or (
                self.scroll_limit and current_scroll >= self.scroll_limit
            ):
                break
            last_height = new_height
            current_scroll += 1

    def search(self, link: str, **kwargs) -> None:
        """Open ``link``, scroll the page and collect its anchors.

        Args:
            link: URL to crawl.
            **kwargs: Accepted for interface compatibility; unused.
        """
        try:
            self.driver.get(link)
            time.sleep(5)
            self.scroll_page()
            self.soup = BeautifulSoup(self.driver.page_source, "html.parser")
            self.links = self.soup.find_all("a", href=True)
        finally:
            # Release the browser and its driver process even when the crawl
            # fails; previously the session was never shut down.
            self.driver.quit()

In [4]:
# Crawl the G1 home page; search() fills g1_crawler.soup and g1_crawler.links.
g1_crawler = G1Crawler()
g1_crawler.search(link='https://g1.globo.com')

see more link not found yet, scrolling more...


In [5]:
# Spot-check one anchor of the parsed G1 page as a (text, href) tuple.
g1_crawler.soup.find_all('a')[25].text, g1_crawler.soup.find_all('a')[25].get('href')

("Lira critica indiciamento de deputados e diz ver com 'preocupação' ações da PF",
 'https://g1.globo.com/politica/noticia/2024/11/27/lira-critica-indiciamento-de-deputados-e-diz-ver-com-preocupacao-acoes-da-pf-contra-parlamentares.ghtml')

In [6]:
# Spot-check an entry of g1_crawler.links. That list only holds anchors with an
# href, so its indices need not line up with soup.find_all('a').
{'text': g1_crawler.links[20].text, 'link': g1_crawler.links[20].get('href')}

{'text': '',
 'link': 'https://g1.globo.com/politica/noticia/2024/11/27/congresso-espera-flavio-dino-liberar-emendas-para-destravar-votacao-do-orcamento-dizem-senadores.ghtml'}

In [7]:
# Crawl the Band News home page; search() fills band_crawler.soup and band_crawler.links.
band_crawler = BandCrawler()
band_crawler.search(link='https://bandnewstv.uol.com.br')

Veja mais link not found yet, scrolling more...
Veja mais link not found yet, scrolling more...


In [8]:
# Spot-check one anchor of the parsed Band page as a (text, href) tuple.
band_crawler.soup.find_all('a')[50].text, band_crawler.soup.find_all('a')[50].get('href')

('Notícias', 'https://bandnewstv.uol.com.br/category/noticias/')

In [20]:
# Spot-check a later anchor of the parsed Band page as a (text, href) tuple.
band_crawler.soup.find_all('a')[80].text, band_crawler.soup.find_all('a')[80].get('href')

('Botafogo, Palmeiras, Inter e Fortaleza fazem contas por título',
 'https://bandnewstv.uol.com.br/2024/11/27/botafogo-palmeiras-inter-e-fortaleza-fazem-contas-por-titulo/')

In [9]:
# Crawl the R7 home page; search() fills r7_crawler.soup and r7_crawler.links.
r7_crawler = R7Crawler()
r7_crawler.search(link='https://www.r7.com')

In [10]:
# NOTE(review): this cell follows the R7 crawl but inspects band_crawler, so it
# repeats the earlier Band spot-check — presumably r7_crawler was intended; confirm.
band_crawler.soup.find_all('a')[50].text, band_crawler.soup.find_all('a')[50].get('href')

('Notícias', 'https://bandnewstv.uol.com.br/category/noticias/')