In [7]:
# main.py
import json
from typing import Optional, Sequence

from dotenv import load_dotenv

# Internal imports
from indexer_ze import ZeroEntropyArticleIndexer
from search_ze import ZeroEntropyArticleSearcher
from utils_ze import ZeroEntropyUtils
from logger import getLogger

# Load environment variables
# (presumably supplies the ZeroEntropy credentials read by the internal
# modules imported above — TODO confirm which variables are required)
load_dotenv()

# Configure logger to display log messages
# (module-level logger used by ZeroEntropyArticleManager in the cells below)
logger = getLogger()



In [8]:
class ZeroEntropyArticleManager:
    """
    Main class that orchestrates RSS scraping, indexing, and searching using ZeroEntropy.

    The manager owns one indexer, one searcher and one utils helper, all bound
    to the same collection name, and exposes three coroutines:

    * ``scrape_and_index`` - fetch RSS feeds, back them up to JSON, index them.
    * ``search_articles``  - run one of the supported search modes and display
      the results.
    * ``manage_collections`` - list / delete / inspect collections.
    """

    # Feeds scraped when ``scrape_and_index`` is called without ``urls``.
    RSS_PUBLIC_URLS = (
        "https://www.public.fr/feed",
        "https://www.public.fr/people/feed",
        "https://www.public.fr/tele/feed",
        "https://www.public.fr/mode/feed",
        "https://www.public.fr/people/familles-royales/feed",
    )

    RSS_VSD_URLS = (
        "https://vsd.fr/actu-people/feed/",
        "https://vsd.fr/tele/feed/",
        "https://vsd.fr/societe/feed/",
        "https://vsd.fr/culture/feed/",
        "https://vsd.fr/loisirs/feed/",
    )

    # Values accepted for ``search_articles(search_type=...)``.
    SEARCH_TYPES = ("documents", "snippets", "pages", "advanced")

    def __init__(self, collection_name: str = "articles"):
        """Create the indexer, searcher and utils helpers for ``collection_name``."""
        self.collection_name = collection_name
        self.indexer = ZeroEntropyArticleIndexer(collection_name)
        self.searcher = ZeroEntropyArticleSearcher(collection_name)
        self.utils = ZeroEntropyUtils(collection_name)

    async def scrape_and_index(
        self,
        urls: Optional[Sequence[str]] = None,
        backup_path: str = "articles.json",
    ):
        """Scrape RSS feeds and index articles.

        Args:
            urls: Feed URLs to scrape. When ``None`` (the default) the built-in
                ``RSS_PUBLIC_URLS`` followed by ``RSS_VSD_URLS`` are used.
            backup_path: File the scraped articles are dumped to as JSON before
                indexing. Defaults to ``"articles.json"``.

        Returns:
            None. Progress and failures are reported through the module logger.
        """
        if urls is None:
            urls = self.RSS_PUBLIC_URLS + self.RSS_VSD_URLS

        # Initialize collection
        await self.indexer.initialize_collection()

        # Extract content from RSS feeds; a feed whose content comes back
        # empty/falsy is logged as a failure and skipped.
        articles = []
        for url in urls:
            content = self.indexer.get_rss_feed_content(url)
            if content:
                articles.extend(content)
                logger.info("Successfully extracted content from %s", url)
            else:
                logger.warning("Failed to extract content from %s", url)

        # Save all content to a JSON file for backup (written even when empty,
        # so the file always reflects the latest run).
        with open(backup_path, "w", encoding="utf-8") as f:
            json.dump(articles, f, ensure_ascii=False, indent=4)

        logger.info("Extracted %d articles total", len(articles))

        # Index articles in ZeroEntropy
        if articles:
            await self.indexer.index_articles(articles)
            logger.info("Successfully scraped and indexed articles in ZeroEntropy.")
        else:
            logger.warning("No articles to index.")

    async def search_articles(
        self,
        query: str,
        search_type: str = "documents",
        k: int = 10,
        filter_creator: Optional[str] = None,
        filter_category: Optional[str] = None,
        reranker: str = "zerank-1-small",
        show_status: bool = False,
    ):
        """Search for articles and display the results.

        Args:
            query: Search query text.
            search_type: One of ``"documents"``, ``"snippets"``, ``"pages"`` or
                ``"advanced"``.
            k: Number of results requested. In ``"advanced"`` mode ``k * 2``
                candidates are fetched and the top ``k`` kept after reranking.
            filter_creator: If set, adds a ``creator == value`` filter.
            filter_category: If set, adds a ``categories == value`` filter.
            reranker: Reranker name forwarded to the document and snippet
                searches.
            show_status: When true, query the collection status before searching.

        Returns:
            Whatever the selected search call returned.

        Raises:
            ValueError: If ``search_type`` is not one of ``SEARCH_TYPES``.
        """
        # Validate before doing any remote work. Previously an unknown type fell
        # through every branch and crashed on ``return results`` with an
        # UnboundLocalError.
        if search_type not in self.SEARCH_TYPES:
            raise ValueError(
                f"Unknown search_type {search_type!r}; "
                f"expected one of: {', '.join(self.SEARCH_TYPES)}"
            )

        # Show status if requested
        if show_status:
            await self.searcher.get_collection_status()

        # Prepare filter if specified; the search helpers receive None when no
        # filter was requested.
        filter_dict = {}
        if filter_creator:
            filter_dict["creator"] = {"$eq": filter_creator}
        if filter_category:
            filter_dict["categories"] = {"$eq": filter_category}
        filter_dict = filter_dict or None

        # Perform search based on type
        if search_type == "documents":
            results = await self.searcher.search_documents(
                query=query,
                k=k,
                filter_dict=filter_dict,
                reranker=reranker,
            )
            self.searcher.display_document_results(results, query)

        elif search_type == "snippets":
            results = await self.searcher.search_snippets(
                query=query,
                k=k,
                filter_dict=filter_dict,
                reranker=reranker,
            )
            self.searcher.display_snippet_results(results, query)

        elif search_type == "pages":
            results = await self.searcher.search_pages(
                query=query, k=k, filter_dict=filter_dict
            )
            self.searcher.display_page_results(results, query)

        else:  # "advanced" - the only remaining validated value
            # NOTE(review): this path does not forward filter_dict or reranker,
            # so filter_creator / filter_category are ignored here - confirm
            # whether search_and_rerank should accept them.
            results = await self.utils.search_and_rerank(
                query=query, k=k * 2, rerank_top_n=k
            )
            self.utils.display_advanced_results(results, query)

        return results

    async def manage_collections(
        self, action: str, collection_name: Optional[str] = None
    ):
        """Manage collections (list, delete, status).

        Args:
            action: ``"list"``, ``"delete"`` or ``"status"``.
            collection_name: Collection to delete; required for ``"delete"``.

        Returns:
            The collection list for ``"list"``, the success flag for
            ``"delete"``, the status object for ``"status"``, and ``None`` for
            an unknown action or a ``"delete"`` without a collection name.
        """
        if action == "list":
            collections = await self.utils.list_all_collections()
            print(f"Available collections: {collections}")
            return collections

        elif action == "delete" and collection_name:
            success = await self.utils.delete_collection(collection_name)
            if success:
                print(f"Successfully deleted collection: {collection_name}")
            else:
                print(f"Failed to delete collection: {collection_name}")
            return success

        elif action == "status":
            status = await self.searcher.get_collection_status()
            return status

        else:
            print("Invalid action or missing collection name")
            return None


In [9]:
# Name of the ZeroEntropy collection shared by the cells below.
collection = "my_articles"
# One manager instance builds the indexer, searcher and utils helpers for that collection.
manager = ZeroEntropyArticleManager(collection)

In [None]:
# Run the full pipeline: initialize the collection, pull every configured RSS
# feed, back the articles up to articles.json, then index them in ZeroEntropy.
await manager.scrape_and_index()

[32m2025-08-06 10:30:52[0m | [34mindexer_ze[0m | [1;35mERROR   [0m | [31mCollection 'my_articles' already exists[0m [indexer_ze.py:40]
[32m2025-08-06 10:31:04[0m | [34m1820264054[0m | [1;35mINFO    [0m | [37mSuccessfully extracted content from https://www.public.fr/feed[0m [1820264054.py:40]
[32m2025-08-06 10:31:06[0m | [34m1820264054[0m | [1;35mINFO    [0m | [37mSuccessfully extracted content from https://www.public.fr/people/feed[0m [1820264054.py:40]


In [None]:
query = "famille royale"
search_type = "documents"
k = 10
reranker = "zerank-1-small"

manager = ZeroEntropyArticleManager(collection)
await manager.search_articles(
    query=query,
    search_type=search_type,
    k=k,
    reranker=reranker,
)


DOCUMENT SEARCH RESULTS FOR: 'famille royale'
Found 10 results

Result 1
Document Path: article_204_2420503972517589745
Relevance Score: 1.1434
Title: Prince Harry et Prince William : leur cousine de 20 ans retrouvée morte à proximité d’une arme à feu
Author: Elisabeth Sall
Publication Date: Tue, 22 Jul 2025 07:45:00 +0000
Categories: People, Royauté, Famille royale britannique, Mort, Prince Harry
Source URL: https://www.public.fr/people/familles-royales/feed
File URL: [Available - hidden for security]

--------------------------------------------------

Result 2
Document Path: article_244_4511532550014461900
Relevance Score: 1.0142
Title: Prince Harry: "Les médecins ont recommandé au roi de ne plus parler à son fils", Stéphane Bern s’en mêle
Author: Clément Garin
Publication Date: Sat, 03 May 2025 07:50:00 +0000
Categories: People, Royauté, Cancer, Clash, Famille royale britannique
Source URL: https://www.public.fr/people/familles-royales/feed
File URL: [Available - hidden for securi

[{'path': 'article_204_2420503972517589745',
  'score': 1.1434071251552826,
  'file_url': '[Available - hidden for security]',
  'metadata': {'type': 'rss_article',
   'title': 'Prince Harry et Prince William : leur cousine de 20 ans retrouvée morte à proximité d’une arme à feu',
   'creator': 'Elisabeth Sall',
   'pub_date': 'Tue, 22 Jul 2025 07:45:00 +0000',
   'categories': 'People, Royauté, Famille royale britannique, Mort, Prince Harry',
   'source_url': 'https://www.public.fr/people/familles-royales/feed'}},
 {'path': 'article_244_4511532550014461900',
  'score': 1.0141609646223106,
  'file_url': '[Available - hidden for security]',
  'metadata': {'type': 'rss_article',
   'title': 'Prince Harry: "Les médecins ont recommandé au roi de ne plus parler à son fils", Stéphane Bern s’en mêle',
   'creator': 'Clément Garin',
   'pub_date': 'Sat, 03 May 2025 07:50:00 +0000',
   'categories': 'People, Royauté, Cancer, Clash, Famille royale britannique',
   'source_url': 'https://www.public.