In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from news_summarizer.domain.base.nosql import _database

# Fetch every document of the "link" collection whose source is the G1 home
# page and materialise the cursor right away so the result can be re-read.
collection_link = _database.get_collection("link")
indexes = list(collection_link.find({"source": "https://g1.globo.com/"}))

In [None]:
from news_summarizer.crawler.registry import crawler_registry


from typing import List
from news_summarizer.crawler.registry import CrawlerRegistry
from concurrent.futures import ThreadPoolExecutor


import logging
from concurrent.futures import ThreadPoolExecutor
from typing import List

# Configure the root logger at ERROR level. With this setting the info-level
# messages emitted through `logger` are not displayed; lower the level to
# logging.INFO to see them.
# NOTE(review): basicConfig does nothing if the root logger already has
# handlers (which may be the case in some notebook environments) — confirm.
logging.basicConfig(level=logging.ERROR)
# Module-level logger used by the code in this cell.
logger = logging.getLogger(__name__)


class CrawlerExecutor:
    """Run one crawler per link concurrently on a thread pool.

    For every link the registry is asked (via ``get``) for a crawler class;
    that class is instantiated without arguments and its ``search`` method is
    called with the link.
    """

    # Upper bound on the pool size so that a long list of links does not
    # spawn one thread per link; extra links simply wait in the pool's queue.
    MAX_WORKERS = 32

    # The annotation is quoted: it is only needed by type checkers.
    def __init__(self, crawler_registry: "CrawlerRegistry"):
        """Store the registry used to look up a crawler class for each link.

        Args:
            crawler_registry: Object whose ``get(link)`` returns a crawler
                class, or a falsy value when no crawler is registered.
        """
        self.crawler_registry = crawler_registry

    def run(self, links: List[str]):
        """Crawl every link in ``links`` and wait for all of them to finish.

        An exception raised by one crawler is logged and does not prevent the
        remaining crawlers from being awaited.

        Args:
            links: Links to crawl. Any iterable is accepted; an empty one is
                a no-op.
        """
        links = list(links)
        if not links:
            # ThreadPoolExecutor rejects max_workers=0 with ValueError, so
            # there is nothing to do (and nothing to build) for empty input.
            return

        pool_size = min(len(links), self.MAX_WORKERS)
        with ThreadPoolExecutor(max_workers=pool_size) as executor:
            # Map each link to its corresponding crawler using the registry
            futures = [executor.submit(self._run_crawler, link) for link in links]
            for future in futures:
                try:
                    future.result()  # Wait for the crawler to complete
                except Exception as e:
                    # Boundary of the worker: report and keep waiting on the
                    # other crawlers instead of aborting the whole run.
                    logger.error(f"Error occurred during crawling: {e}")

    def _run_crawler(self, link: str):
        """Look up, instantiate and run the crawler registered for ``link``.

        Logs an error and returns without crawling when the registry has no
        crawler for the link.
        """
        logger.info(f"Starting crawler for link: {link}")

        # Use the registry to select the appropriate crawler based on the link
        crawler_cls = self.crawler_registry.get(link)
        if not crawler_cls:
            logger.error(f"No crawler registered for link: {link}")
            return

        crawler = crawler_cls()
        crawler.search(link)
        logger.info(f"Finished crawling for link: {link}")


if __name__ == "__main__":
    # Home pages to crawl; each one is resolved to a crawler class through
    # the registry by CrawlerExecutor.
    links = ["https://www.r7.com/", "https://g1.globo.com/", "https://bandnewstv.uol.com.br/"]

    # Build the executor around the shared registry and crawl all links.
    executor = CrawlerExecutor(crawler_registry=crawler_registry)
    executor.run(links=links)