Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
exclude: ^tests/files # these are raw test files, no need to mess with them
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.5.0
Expand Down
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- Add utility function to compute ZIM Tags #164, including deduplication #156
- Metadata does not automatically drops control characters #159
- New `indexing.IndexData` class to hold title, content and keywords to pass to libzim to index an item
- Automatically index PDF documents content #167
- Automatically set proper title on PDF documents #168

### Fixed

Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,11 @@ dependencies = [
"beautifulsoup4>=4.9.3,<5.0",
"lxml>=4.6.3,<6.0",
"optimize-images>=1.3.6,<2.0",
# regex has nNo upper-bound due to "date-based" release numbers, no semver, so their
# regex has no upper-bound due to "date-based" release numbers, no semver, so their
# promise is that they will never (or always) break the API, and the API is very
# limited and we use only a very small subset of it.
"regex>=2020.7.14",
"pymupdf>=1.24.0,<2.0",
# youtube-dl should be updated as frequently as possible
"yt-dlp"
]
Expand Down
6 changes: 6 additions & 0 deletions src/zimscraperlib/zim/creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
)
from zimscraperlib.i18n import is_valid_iso_639_3
from zimscraperlib.types import get_mime_for_name
from zimscraperlib.zim.indexing import IndexData
from zimscraperlib.zim.items import StaticItem
from zimscraperlib.zim.metadata import (
validate_counter,
Expand Down Expand Up @@ -340,6 +341,9 @@ def add_item_for(
delete_fpath: bool | None = False, # noqa: FBT002
duplicate_ok: bool | None = None,
callback: Callable | tuple[Callable, Any] | None = None,
index_data: IndexData | None = None,
*,
auto_index: bool = True,
):
"""Add a File or content at a specified path and get its path

Expand Down Expand Up @@ -388,6 +392,8 @@ def add_item_for(
filepath=fpath,
hints=hints,
content=content,
index_data=index_data,
auto_index=auto_index,
),
callback=callback,
duplicate_ok=duplicate_ok,
Expand Down
119 changes: 119 additions & 0 deletions src/zimscraperlib/zim/indexing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
""" Special item with customized index data and helper classes """

from __future__ import annotations

import io
import pathlib

import libzim.writer # pyright: ignore

try:
import pymupdf
except ImportError: # pragma: no cover
import fitz as pymupdf # pymupdf main module was named fitz before 1.24.3

from zimscraperlib import logger


class IndexData(libzim.writer.IndexData):
    """IndexData to properly pass indexing title and content to the libzim

    Both title and content have to be customized (title can be identical to item
    title or not).
    keywords is optional since it can be empty.
    wordcount is optional; if not passed (None or 0), it is automatically computed
    from content and recomputed every time content is reassigned. A non-zero
    wordcount set by the caller is never overridden.
    """

    def __init__(
        self, title: str, content: str, keywords: str = "", wordcount: int | None = None
    ):
        # set wordcount first so that the content setter knows whether it has to
        # derive the word count from content or keep the caller-provided value
        self.wordcount = wordcount
        self.title = title
        self.content = content
        self.keywords = keywords

    def has_indexdata(self) -> bool:
        """Whether there is anything to index (a non-empty title or content)"""
        return len(self.content) > 0 or len(self.title) > 0

    def get_title(self) -> str:
        """Title to index"""
        return self.title

    def get_content(self) -> str:
        """Content to index"""
        return self.content

    def get_keywords(self) -> str:
        """Keywords to index (empty string when there are none)"""
        return self.keywords

    def get_wordcount(self) -> int:
        """Word count to report, 0 when none is known"""
        return self.wordcount or 0

    @property
    def wordcount(self) -> int | None:
        return self._wordcount

    @wordcount.setter
    def wordcount(self, value: int | None):
        self._wordcount = value
        # a falsy value (None or 0) means "not provided by caller": in that case
        # the word count keeps following content
        self._wordcount_from_content = not value

    @property
    def content(self) -> str:
        return self._content

    @content.setter
    def content(self, value: str):
        self._content = value
        # previously the computed value was only refreshed while it was falsy, so
        # reassigning content left a stale word count behind
        if getattr(self, "_wordcount_from_content", True):
            self._wordcount = len(value.split()) if value else 0


# MuPDF warning lines which are known to not be relevant in our use-case;
# get_pdf_index_data filters them out (exact line match) before logging the
# remaining PyMuPDF issues
IGNORED_MUPDF_MESSAGES = [
    "lcms: not an ICC profile, invalid signature.",
    "format error: cmsOpenProfileFromMem failed",
    "ignoring broken ICC profile",
]


def get_pdf_index_data(
    *,
    content: str | bytes | None = None,
    fileobj: io.BytesIO | None = None,
    filepath: pathlib.Path | None = None,
) -> IndexData:
    """Returns the IndexData information for a given PDF

    PDF can be passed either as content or fileobject or filepath; the first
    truthy one, in that order, is used.

    The returned IndexData has:
    - a title made of the PDF "title", "author" and "subject" metadata which are
      set, joined with " - " (empty string when none is set)
    - a content made of the text of all pages, joined with newlines

    MuPDF warnings which are not listed in IGNORED_MUPDF_MESSAGES are logged as a
    single warning.
    """

    # do not display all pymupdf errors, we will filter them afterwards
    pymupdf.TOOLS.mupdf_display_errors(False)

    if content:
        # pymupdf only accepts binary streams; str content is encoded as UTF-8
        stream = content.encode("utf-8") if isinstance(content, str) else content
        doc = pymupdf.open(stream=stream)
    elif fileobj:
        doc = pymupdf.open(stream=fileobj)
    else:
        doc = pymupdf.open(filename=filepath)

    # the context manager closes the document (and releases the underlying file)
    # even when metadata or text extraction fails
    with doc:
        metadata = doc.metadata
        title = ""
        if metadata:  # pragma: no branch (always metadata in test PDFs)
            title = " - ".join(
                metadata[key]
                for key in ("title", "author", "subject")
                if metadata.get(key)
            )

        text = "\n".join(
            page.get_text() for page in doc  # pyright: ignore[reportAttributeAccessIssue]
        )

    # build list of messages and filter messages which are known to not be relevant
    # in our use-case
    mupdf_messages = "\n".join(
        warning
        for warning in pymupdf.TOOLS.mupdf_warnings().splitlines()
        if warning not in IGNORED_MUPDF_MESSAGES
    )

    if mupdf_messages:
        logger.warning(
            f"PyMuPDF issues:\n{mupdf_messages}"
        )  # pragma: no cover (no known error in test PDFs)

    return IndexData(
        title=title,
        content=text,
    )
78 changes: 77 additions & 1 deletion src/zimscraperlib/zim/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
import libzim.writer # pyright: ignore

from zimscraperlib.download import stream_file
from zimscraperlib.filesystem import get_content_mimetype, get_file_mimetype
from zimscraperlib.zim.indexing import IndexData, get_pdf_index_data
from zimscraperlib.zim.providers import (
FileLikeProvider,
FileProvider,
Expand Down Expand Up @@ -69,7 +71,17 @@ class StaticItem(Item):
Sets a `ref` to itself on the File/String content providers so it outlives them
We need Item to survive its ContentProvider so that we can track lifecycle
more efficiently: now when the libzim destroys the CP, python will destroy
the Item and we can be notified that we're effectively through with our content"""
the Item and we can be notified that we're effectively through with our content

By default, content is automatically indexed (either by the libzim itself for
supported documents - text or html for now or by the python-scraperlib - only PDF
supported for now). If you do not want this, set `auto_index` to False to disable
both indexing (libzim and python-scraperlib).

It is also possible to pass index_data to configure custom indexing of the item.

If item title is not set by caller, it is automatically populated from index_data.
"""

def __init__(
self,
Expand All @@ -80,6 +92,9 @@ def __init__(
title: str | None = None,
mimetype: str | None = None,
hints: dict | None = None,
index_data: IndexData | None = None,
*,
auto_index: bool = True,
**kwargs: Any,
):
if content is not None:
Expand All @@ -91,6 +106,20 @@ def __init__(
super().__init__(
path=path, title=title, mimetype=mimetype, hints=hints, **kwargs
)
if index_data:
self.get_indexdata = lambda: index_data
elif not auto_index:
self.get_indexdata = lambda: IndexData("", "") # index nothing
else:
self._get_auto_index() # consider to add auto index

# Populate item title from index data if title is not set by caller
if (
(not hasattr(self, "title") or not self.title)
and hasattr(self, "get_indexdata")
and self.get_indexdata().get_title()
):
self.title = self.get_indexdata().get_title()

def get_contentprovider(self) -> libzim.writer.ContentProvider:
# content was set manually
Expand All @@ -116,6 +145,53 @@ def get_contentprovider(self) -> libzim.writer.ContentProvider:

raise NotImplementedError("No data to provide`")

def _get_auto_index(self):
    """Populate item index data automatically when the item holds a PDF

    Sources are inspected in this order: `content`, then `fileobj`, then
    `filepath`. Only the first one which is set is considered, so that the index
    data always describes the same source. When that source is detected as
    `application/pdf`, `get_indexdata` is set to return the index data extracted
    from the PDF; for any other mimetype nothing is set.

    Raises RuntimeError when the inspected source has an unexpected type.
    """

    # content was set manually
    content = getattr(self, "content", None)
    if content is not None:
        if not isinstance(content, (str, bytes)):
            raise RuntimeError(
                f"Unexpected type for content: {type(content)}"
            )  # pragma: no cover
        # work on bytes for both mimetype detection and PDF parsing (a PDF parser
        # cannot be fed with a str)
        raw_content = content.encode("utf-8") if isinstance(content, str) else content
        if get_content_mimetype(raw_content) == "application/pdf":
            index_data = get_pdf_index_data(content=raw_content)
            self.get_indexdata = lambda: index_data
        # do not fall through: a fileobj/filepath must not override the index
        # data of explicitly set content
        return

    # using a file-like object
    fileobj = getattr(self, "fileobj", None)
    if fileobj:
        if not isinstance(fileobj, io.BytesIO):
            raise RuntimeError(
                f"Unexpected type for content: {type(fileobj)}"
            )  # pragma: no cover
        if get_content_mimetype(fileobj.getvalue()) == "application/pdf":
            index_data = get_pdf_index_data(fileobj=fileobj)
            self.get_indexdata = lambda: index_data
        return

    # using a file path
    filepath = getattr(self, "filepath", None)
    if filepath:
        if not isinstance(filepath, pathlib.Path):
            raise RuntimeError(
                f"Unexpected type for content: {type(filepath)}"
            )  # pragma: no cover
        if get_file_mimetype(filepath) == "application/pdf":
            index_data = get_pdf_index_data(filepath=filepath)
            self.get_indexdata = lambda: index_data


class URLItem(StaticItem):
"""StaticItem to automatically fetch and feed an URL resource
Expand Down
26 changes: 26 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,32 @@ def webp_image():
return file_src("ninja.webp")


@pytest.fixture(scope="module")
def encrypted_pdf_file():
    """Return an encrypted PDF

    encrypted.pdf is a PDF encrypted with only an owner password (restricting
    edit/print). We want to be sure we are also able to index this kind of PDF
    documents, since they are readable by most popular readers without any issue
    (view is unrestricted).
    """
    return file_src("encrypted.pdf")


@pytest.fixture(scope="module")
def encrypted_pdf_content():
    """Return the `file_src` result for the `encrypted.txt` test file

    Presumably the text expected to be extracted from `encrypted.pdf`; to be
    confirmed against the tests using this fixture.
    """
    return file_src("encrypted.txt")


@pytest.fixture(scope="module")
def big_pdf_file():
    """Return the `file_src` result for the `milderm.pdf` test file

    Presumably a large PDF document, as the fixture name suggests; to be
    confirmed.
    """
    return file_src("milderm.pdf")


@pytest.fixture(scope="module")
def big_pdf_content():
    """Return the `file_src` result for the `milderm.txt` test file

    Presumably the text expected to be extracted from `milderm.pdf`; to be
    confirmed against the tests using this fixture.
    """
    return file_src("milderm.txt")


@pytest.fixture(scope="module")
def valid_user_agent():
    """Return the `name/version (contact)` string, used as a user agent in tests"""
    return "name/version (contact)"
Expand Down
Binary file added tests/files/encrypted.pdf
Binary file not shown.
3 changes: 3 additions & 0 deletions tests/files/encrypted.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Placeholder Documentation
This document is a placeholder for the appropriate documentation.

Binary file added tests/files/milderm.pdf
Binary file not shown.
Loading