openzim · benoit74 · Jul 30, 2024 · Jul 11, 2024
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,5 +1,6 @@
 # See https://pre-commit.com for more information
 # See https://pre-commit.com/hooks.html for more hooks
+exclude: ^tests/files # these are raw test files, no need to mess with them
 repos:
 - repo: https://github.com/pre-commit/pre-commit-hooks
   rev: v4.5.0

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,6 +11,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Add utility function to compute ZIM Tags #164, including deduplication #156
 - Metadata does not automatically drops control characters #159
+- New `indexing.IndexData` class to hold title, content and keywords to pass to libzim to index an item
+- Automatically index PDF documents content #167
+- Automatically set proper title on PDF documents #168
 
 ### Fixed
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -18,10 +18,11 @@ dependencies = [
   "beautifulsoup4>=4.9.3,<5.0",
   "lxml>=4.6.3,<6.0",
   "optimize-images>=1.3.6,<2.0",
-  # regex has nNo upper-bound due to "date-based" release numbers, no semver, so their
+  # regex has no upper-bound due to "date-based" release numbers, no semver, so their
   # promise is that they will never (or always) break the API, and the API is very
   # limited and we use only a very small subset of it.
   "regex>=2020.7.14",
+  "pymupdf>=1.24.0,<2.0",
   # youtube-dl should be updated as frequently as possible
   "yt-dlp"
 ]

diff --git a/src/zimscraperlib/zim/creator.py b/src/zimscraperlib/zim/creator.py
@@ -45,6 +45,7 @@
 )
 from zimscraperlib.i18n import is_valid_iso_639_3
 from zimscraperlib.types import get_mime_for_name
+from zimscraperlib.zim.indexing import IndexData
 from zimscraperlib.zim.items import StaticItem
 from zimscraperlib.zim.metadata import (
     validate_counter,
@@ -340,6 +341,9 @@ def add_item_for(
         delete_fpath: bool | None = False,  # noqa: FBT002
         duplicate_ok: bool | None = None,
         callback: Callable | tuple[Callable, Any] | None = None,
+        index_data: IndexData | None = None,
+        *,
+        auto_index: bool = True,
     ):
         """Add a File or content at a specified path and get its path
 
@@ -388,6 +392,8 @@ def add_item_for(
                 filepath=fpath,
                 hints=hints,
                 content=content,
+                index_data=index_data,
+                auto_index=auto_index,
             ),
             callback=callback,
             duplicate_ok=duplicate_ok,

diff --git a/src/zimscraperlib/zim/indexing.py b/src/zimscraperlib/zim/indexing.py
@@ -0,0 +1,119 @@
+""" Special item with customized index data and helper classes """
+
+from __future__ import annotations
+
+import io
+import pathlib
+
+import libzim.writer  # pyright: ignore
+
+try:
+    import pymupdf
+except ImportError:  # pragma: no cover
+    import fitz as pymupdf  # pymupdf main module was named fitz before 1.24.3
+
+from zimscraperlib import logger
+
+
+class IndexData(libzim.writer.IndexData):
+    """IndexData to properly pass indexing title and content to the libzim
+
+    Both title and content have to be customized (title can be identical to item title
+    or not).
+    keywords is optional since it can be empty
+    wordcount is optional; if not passed (or passed as 0), it is automatically
+    computed from content as its number of whitespace-separated words
+    """
+
+    def __init__(
+        self, title: str, content: str, keywords: str = "", wordcount: int | None = None
+    ):
+        # set wordcount first so that we know if we should override it based on content
+        # (assigning `content` below goes through the property setter which reads it)
+        self.wordcount = wordcount
+        self.title = title
+        self.content = content
+        self.keywords = keywords
+
+    def has_indexdata(self) -> bool:
+        """Tell whether there is something to index: a non-empty content or title"""
+        return len(self.content) > 0 or len(self.title) > 0
+
+    def get_title(self) -> str:
+        """Return the title to index"""
+        return self.title
+
+    def get_content(self) -> str:
+        """Return the content to index"""
+        return self.content
+
+    def get_keywords(self) -> str:
+        """Return the keywords to index"""
+        return self.keywords
+
+    def get_wordcount(self) -> int:
+        """Return the word count, falling back to 0 when it is not set"""
+        return self.wordcount or 0
+
+    @property
+    def content(self):
+        """Content to index"""
+        return self._content
+
+    @content.setter
+    def content(self, value: str):
+        self._content = value
+        # wordcount is computed only while it is unset or 0: an already set non-zero
+        # wordcount is kept as-is, even when content is assigned again
+        if not self.wordcount:
+            self.wordcount = len(self.content.split()) if self.content else 0
+
+
+# MuPDF warning lines which get_pdf_index_data drops (exact, whole-line match) before
+# logging the remaining ones
+IGNORED_MUPDF_MESSAGES = [
+    "lcms: not an ICC profile, invalid signature.",
+    "format error: cmsOpenProfileFromMem failed",
+    "ignoring broken ICC profile",
+]
+
+
+def _extract_pdf_title_and_text(doc) -> tuple[str, str]:
+    """Returns the indexing title and the text of an opened PyMuPDF document
+
+    Title is made of the `title`, `author` and `subject` metadata which are set,
+    joined with " - " (empty string when none is set). Text is the text of all pages,
+    joined with newlines.
+    """
+    metadata = doc.metadata or {}
+    title = " - ".join(
+        metadata[key]
+        for key in ("title", "author", "subject")
+        if metadata.get(key)
+    )
+    text = "\n".join(
+        page.get_text() for page in doc  # pyright: ignore[reportAttributeAccessIssue]
+    )
+    return title, text
+
+
+def get_pdf_index_data(
+    *,
+    content: str | bytes | None = None,
+    fileobj: io.BytesIO | None = None,
+    filepath: pathlib.Path | None = None,
+) -> IndexData:
+    """Returns the IndexData information for a given PDF
+
+    PDF can be passed either as content or fileobject or filepath. They are considered
+    in that order and the first one which is set (and not empty) is used.
+
+    PyMuPDF warnings which are not listed in IGNORED_MUPDF_MESSAGES are logged as a
+    warning.
+    """
+
+    # do not display all pymupdf errors, we will filter them afterwards
+    pymupdf.TOOLS.mupdf_display_errors(False)
+
+    if content:
+        doc = pymupdf.open(stream=content)
+    elif fileobj:
+        doc = pymupdf.open(stream=fileobj)
+    else:
+        doc = pymupdf.open(filename=filepath)
+
+    # always close the document once read (even if extraction fails) so that we do not
+    # keep PDF resources opened for every single indexed item
+    try:
+        title, text = _extract_pdf_title_and_text(doc)
+    finally:
+        doc.close()
+
+    # build list of messages and filter messages which are known to not be relevant
+    # in our use-case
+    mupdf_messages = "\n".join(
+        warning
+        for warning in pymupdf.TOOLS.mupdf_warnings().splitlines()
+        if warning not in IGNORED_MUPDF_MESSAGES
+    )
+
+    if mupdf_messages:
+        logger.warning(
+            f"PyMuPDF issues:\n{mupdf_messages}"
+        )  # pragma: no cover (no known error in test PDFs)
+
+    return IndexData(
+        title=title,
+        content=text,
+    )
diff --git a/src/zimscraperlib/zim/items.py b/src/zimscraperlib/zim/items.py
@@ -15,6 +15,8 @@
 import libzim.writer  # pyright: ignore
 
 from zimscraperlib.download import stream_file
+from zimscraperlib.filesystem import get_content_mimetype, get_file_mimetype
+from zimscraperlib.zim.indexing import IndexData, get_pdf_index_data
 from zimscraperlib.zim.providers import (
     FileLikeProvider,
     FileProvider,
@@ -69,7 +71,17 @@ class StaticItem(Item):
     Sets a `ref` to itself on the File/String content providers so it outlives them
     We need Item to survive its ContentProvider so that we can track lifecycle
     more efficiently: now when the libzim destroys the CP, python will destroy
-    the Item and we can be notified that we're effectively through with our content"""
+    the Item and we can be notified that we're effectively through with our content
+
+    By default, content is automatically indexed (either by the libzim itself for
+    supported documents - text or html for now - or by the python-scraperlib - only PDF
+    supported for now). If you do not want this, set `auto_index` to False to disable
+    both kinds of indexing (libzim and python-scraperlib).
+
+    It is also possible to pass index_data to configure custom indexing of the item.
+
+    If item title is not set by caller, it is automatically populated from index_data.
+    """
 
     def __init__(
         self,
@@ -80,6 +92,9 @@ def __init__(
         title: str | None = None,
         mimetype: str | None = None,
         hints: dict | None = None,
+        index_data: IndexData | None = None,
+        *,
+        auto_index: bool = True,
         **kwargs: Any,
     ):
         if content is not None:
@@ -91,6 +106,20 @@ def __init__(
         super().__init__(
             path=path, title=title, mimetype=mimetype, hints=hints, **kwargs
         )
+        if index_data:
+            self.get_indexdata = lambda: index_data
+        elif not auto_index:
+            self.get_indexdata = lambda: IndexData("", "")  # index nothing
+        else:
+            self._get_auto_index()  # consider to add auto index
+
+        # Populate item title from index data if title is not set by caller
+        if (
+            (not hasattr(self, "title") or not self.title)
+            and hasattr(self, "get_indexdata")
+            and self.get_indexdata().get_title()
+        ):
+            self.title = self.get_indexdata().get_title()
 
     def get_contentprovider(self) -> libzim.writer.ContentProvider:
         # content was set manually
@@ -116,6 +145,53 @@ def get_contentprovider(self) -> libzim.writer.ContentProvider:
 
         raise NotImplementedError("No data to provide`")
 
+    def _get_auto_index(self):
+        """Populate item index data automatically from content
+
+        Only PDF documents are handled: when the item data is detected as
+        `application/pdf`, `get_indexdata` is set on the item and returns the IndexData
+        computed by `get_pdf_index_data`. Nothing is set for any other mimetype.
+
+        Item data is looked up in `content`, then `fileobj`, then `filepath`; only the
+        first one which is set is considered, so that index data is never computed
+        from (or overwritten by) another source than this one.
+
+        Raises RuntimeError when the considered source has an unexpected type.
+        """
+
+        # content was set manually
+        content = getattr(self, "content", None)
+        if content is not None:
+            if not isinstance(content, (str, bytes)):
+                raise RuntimeError(
+                    f"Unexpected type for content: {type(content)}"
+                )  # pragma: no cover
+            mimetype = get_content_mimetype(
+                content.encode("utf-8") if isinstance(content, str) else content
+            )
+            if mimetype == "application/pdf":
+                index_data = get_pdf_index_data(content=content)
+                self.get_indexdata = lambda: index_data
+            return
+
+        # using a file-like object
+        fileobj = getattr(self, "fileobj", None)
+        if fileobj:
+            if not isinstance(fileobj, io.BytesIO):
+                raise RuntimeError(
+                    f"Unexpected type for fileobj: {type(fileobj)}"
+                )  # pragma: no cover
+            mimetype = get_content_mimetype(fileobj.getvalue())
+            if mimetype == "application/pdf":
+                index_data = get_pdf_index_data(fileobj=fileobj)
+                self.get_indexdata = lambda: index_data
+            return
+
+        # using a file path
+        filepath = getattr(self, "filepath", None)
+        if filepath:
+            if not isinstance(filepath, pathlib.Path):
+                raise RuntimeError(
+                    f"Unexpected type for filepath: {type(filepath)}"
+                )  # pragma: no cover
+            mimetype = get_file_mimetype(filepath)
+            if mimetype == "application/pdf":
+                index_data = get_pdf_index_data(filepath=filepath)
+                self.get_indexdata = lambda: index_data
+
 
 class URLItem(StaticItem):
     """StaticItem to automatically fetch and feed an URL resource

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -132,6 +132,32 @@ def webp_image():
     return file_src("ninja.webp")
 
 
+@pytest.fixture(scope="module")
+def encrypted_pdf_file():
+    """Return an encrypted PDF
+
+    encrypted.pdf is a PDF encrypted with only an owner password (restricting
+    edit/print). We want to be sure we are also able to index this kind of PDF
+    documents, since they are readable by most popular readers without any issue
+    (view is unrestricted).
+    """
+    return file_src("encrypted.pdf")
+
+
+@pytest.fixture(scope="module")
+def encrypted_pdf_content():
+    """Return the `encrypted.txt` test file, as given by `file_src`
+
+    Presumably the text expected to be extracted from `encrypted.pdf` (TODO confirm)
+    """
+    return file_src("encrypted.txt")
+
+
+@pytest.fixture(scope="module")
+def big_pdf_file():
+    """Return the `milderm.pdf` test file, as given by `file_src`"""
+    return file_src("milderm.pdf")
+
+
+@pytest.fixture(scope="module")
+def big_pdf_content():
+    """Return the `milderm.txt` test file, as given by `file_src`
+
+    Presumably the text expected to be extracted from `milderm.pdf` (TODO confirm)
+    """
+    return file_src("milderm.txt")
+
+
 @pytest.fixture(scope="module")
 def valid_user_agent():
     return "name/version (contact)"

diff --git a/tests/files/encrypted.pdf b/tests/files/encrypted.pdf
diff --git a/tests/files/encrypted.txt b/tests/files/encrypted.txt
@@ -0,0 +1,3 @@
+Placeholder Documentation 
+This document is a placeholder for the appropriate documentation. 
+
diff --git a/tests/files/milderm.pdf b/tests/files/milderm.pdf
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		Placeholder Documentation
		This document is a placeholder for the appropriate documentation.