In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import re
import time
from bs4 import BeautifulSoup
from news_summarizer.webdriver import WebDriverFactory, ShutilBrowserLocator
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

URL = 'https://g1.globo.com/ultimas-noticias/'


In [3]:
from abc import ABC, abstractmethod
import numpy as np


class BaseCrawler(ABC):
    @abstractmethod
    def extract(self, link: str, **kwargs) -> None:
        raise NotImplementedError

class BaseSeleniumCrawler(BaseCrawler, ABC):
    def __init__(self, scroll_limit: int = 5) -> None:
        self.driver = WebDriverFactory(ShutilBrowserLocator()).get_webdriver() 
        self.scroll_limit = scroll_limit
        self.soup = None

    def scroll_page(self) -> None:
        current_scroll = 0
        veja_mais_count = 0
        last_page_number = 0
        page_number = 0

        while True:
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(np.random.randint(2, 5))            
            # Wait for the "Veja mais" link to appear with the next page number
            try:
    
                see_more_button = WebDriverWait(self.driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, "/html/body/div[2]/main/div[3]/div[2]/div/div/div/div/div/div/div/div[3]/a"))
                )
                href = see_more_button.get_attribute('href')
                page_number = self._extract_page_number(href)

                print("Page number: ", page_number)

                print(f"see_more found {veja_mais_count} times: {href}")
                
                if page_number > last_page_number:
                    veja_mais_count += 1
                    last_page_number = page_number
                    
                    if veja_mais_count >= 6:
                        break
                see_more_button.click()
            except Exception as e:
                print("Veja mais link not found yet, scrolling more...")

            print("Current iteration: ", current_scroll)
            
            
    def _extract_page_number(self, url):
        match = re.search(r'pagina-(\d+)', url)
        if match:
            return int(match.group(1))
        return None

class G1Crawler(BaseSeleniumCrawler):
    def __init__(self, scroll_limit: int = 5) -> None:
        super().__init__(scroll_limit=scroll_limit)
        self.links = None
    
    def extract(self, link: str, **kwargs) -> None:
        self.driver.get(link)
        time.sleep(5)
        self.scroll_page()
        self.soup = BeautifulSoup(self.driver.page_source, "html.parser")
        self.links = self.soup.find_all('a', class_='feed-post-link gui-color-primary gui-color-hover', href=True) # self.soup.find_all('a', href=True)


In [4]:
my_crawler = G1Crawler()
my_crawler.extract(link=URL)

Creating the instance
Page number:  4
see_more found 0 times: https://g1.globo.com/ultimas-noticias/index/feed/pagina-4.ghtml
Veja mais link not found yet, scrolling more...
Current iteration:  0
Page number:  5
see_more found 1 times: https://g1.globo.com/ultimas-noticias/index/feed/pagina-5.ghtml
Current iteration:  0
Page number:  6
see_more found 2 times: https://g1.globo.com/ultimas-noticias/index/feed/pagina-6.ghtml
Current iteration:  0
Page number:  7
see_more found 3 times: https://g1.globo.com/ultimas-noticias/index/feed/pagina-7.ghtml
Current iteration:  0
Page number:  7
see_more found 4 times: https://g1.globo.com/ultimas-noticias/index/feed/pagina-7.ghtml
Current iteration:  0
Page number:  8
see_more found 4 times: https://g1.globo.com/ultimas-noticias/index/feed/pagina-8.ghtml
Current iteration:  0
Page number:  9
see_more found 5 times: https://g1.globo.com/ultimas-noticias/index/feed/pagina-9.ghtml


In [5]:
my_crawler.links

[<a class="feed-post-link gui-color-primary gui-color-hover" href="https://g1.globo.com/sp/sao-paulo/noticia/2024/11/25/2-em-cada-10-mulheres-ja-foram-ameacadas-de-morte-pelo-parceiro-namorado-ou-ex-diz-pesquisa.ghtml"><p elementtiming="text-csr">2 em cada 10 mulheres já foram ameaçadas de morte pelo parceiro, namorado ou ex, diz pesquisa</p></a>,
 <a class="feed-post-link gui-color-primary gui-color-hover" href="https://g1.globo.com/trabalho-e-carreira/noticia/2024/11/25/13o-salario-parcela-unica-ou-1a-parte-devem-ser-pagas-ate-sexta-feira.ghtml"><p elementtiming="text-csr">13º salário: parcela única ou 1ª parte devem ser pagas até sexta-feira; veja o que fazer se você não receber</p></a>,
 <a class="feed-post-link gui-color-primary gui-color-hover" href="https://g1.globo.com/politica/noticia/2024/11/25/inquerito-do-golpe-apos-receber-relatorio-da-pf-cabera-a-pgr-definir-destino-da-investigacao-contra-bolsonaro-e-aliados.ghtml"><p elementtiming="text-csr">Inquérito do golpe: após rece

In [6]:
""" driver.get(URL)
time.sleep(10)
current_scroll = 0
last_height = driver.execute_script("return document.body.scrollHeight")
veja_mais_count = 0
last_page_number = 0

while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)
    
    # Wait for the "Veja mais" link to appear with the next page number
    try:
        veja_mais_link = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, f"//div[@class='load-more gui-color-primary-bg']/a[contains(@href, 'pagina-')]"))
        )
        href = veja_mais_link.get_attribute('href')
        page_number = extract_page_number(href)
        print("Page number: ", page_number)
        print(f"Veja mais link found {veja_mais_count} times: {href}")
        
        if page_number > last_page_number:
            veja_mais_count += 1
            last_page_number = page_number
            
            if veja_mais_count >= 6:
                break
        veja_mais_link.click()
    except Exception as e:
        print("Veja mais link not found yet, scrolling more...")

    new_height = driver.execute_script("return document.body.scrollHeight")
    print(current_scroll, new_height)
    #if new_height == last_height or (5 and current_scroll >= 5):
    #    break
    last_height = new_height
    current_scroll += 1 """

' driver.get(URL)\ntime.sleep(10)\ncurrent_scroll = 0\nlast_height = driver.execute_script("return document.body.scrollHeight")\nveja_mais_count = 0\nlast_page_number = 0\n\nwhile True:\n    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")\n    time.sleep(2)\n    \n    # Wait for the "Veja mais" link to appear with the next page number\n    try:\n        veja_mais_link = WebDriverWait(driver, 10).until(\n            EC.presence_of_element_located((By.XPATH, f"//div[@class=\'load-more gui-color-primary-bg\']/a[contains(@href, \'pagina-\')]"))\n        )\n        href = veja_mais_link.get_attribute(\'href\')\n        page_number = extract_page_number(href)\n        print("Page number: ", page_number)\n        print(f"Veja mais link found {veja_mais_count} times: {href}")\n        \n        if page_number > last_page_number:\n            veja_mais_count += 1\n            last_page_number = page_number\n            \n            if veja_mais_count >= 6:\n           