-
-
Notifications
You must be signed in to change notification settings - Fork 19
Add indexdata + automatic indexing of PDF items #182
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
""" Special item with customized index data and helper classes """ | ||
|
||
from __future__ import annotations | ||
|
||
import io | ||
import pathlib | ||
|
||
import libzim.writer # pyright: ignore | ||
|
||
try: | ||
import pymupdf | ||
except ImportError: # pragma: no cover | ||
import fitz as pymupdf # pymupdf main module was named fitz before 1.24.3 | ||
|
||
from zimscraperlib import logger | ||
|
||
|
||
class IndexData(libzim.writer.IndexData):
    """Custom index data (title, content, keywords, wordcount) handed to libzim

    title and content must both be supplied; the indexing title may or may not be
    the same as the title of the item being indexed.
    keywords is optional and defaults to an empty string.
    wordcount is optional; when it is not passed (or is falsy), it is automatically
    computed from content as the number of whitespace-separated words.
    """

    def __init__(
        self, title: str, content: str, keywords: str = "", wordcount: int | None = None
    ):
        # wordcount has to exist before content is assigned: the content setter
        # reads it to decide whether a count must be derived from the text
        self.wordcount = wordcount
        self.title = title
        self.content = content
        self.keywords = keywords

    @property
    def content(self):
        return self._content

    @content.setter
    def content(self, value: str):
        self._content = value
        if self.wordcount:
            # a (non-zero) wordcount is already known, keep it untouched
            return
        self.wordcount = len(value.split()) if value else 0

    def has_indexdata(self) -> bool:
        # there is something to index as soon as content or title is non-empty
        return not (len(self.content) == 0 and len(self.title) == 0)

    def get_title(self) -> str:
        return self.title

    def get_content(self) -> str:
        return self.content

    def get_keywords(self) -> str:
        return self.keywords

    def get_wordcount(self) -> int:
        # wordcount may still be None/0 here; always report an int
        return self.wordcount if self.wordcount else 0
|
||
|
||
# MuPDF warning lines which are known to not be relevant in our use-case; any
# warning exactly equal to one of these is dropped before remaining ones are logged
IGNORED_MUPDF_MESSAGES = [
    "lcms: not an ICC profile, invalid signature.",
    "format error: cmsOpenProfileFromMem failed",
    "ignoring broken ICC profile",
]
|
||
|
||
def get_pdf_index_data(
    *,
    content: str | bytes | None = None,
    fileobj: io.BytesIO | None = None,
    filepath: pathlib.Path | None = None,
) -> IndexData:
    """Returns the IndexData information for a given PDF

    PDF can be passed either as content or fileobject or filepath; the first
    truthy one, in that order, is used and the others are ignored.

    The indexing title is built from the PDF "title", "author" and "subject"
    metadata (non-empty ones only, joined with " - "); the indexing content is
    the text of all pages, joined with newlines.

    MuPDF warnings collected while processing are logged as a warning, except
    those listed in IGNORED_MUPDF_MESSAGES.

    NOTE(review): when none of the three inputs is usable, the call falls back
    to `pymupdf.open(filename=filepath)`; with `filepath=None` PyMuPDF presumably
    creates a new empty document -- confirm against the PyMuPDF documentation.
    """

    # do not display all pymupdf errors, we will filter them afterwards
    pymupdf.TOOLS.mupdf_display_errors(False)

    if content:
        doc = pymupdf.open(stream=content)
    elif fileobj:
        doc = pymupdf.open(stream=fileobj)
    else:
        doc = pymupdf.open(filename=filepath)

    # make sure the document is always closed, even when extraction fails, so
    # that we do not leak the underlying MuPDF document / file handle
    try:
        metadata = doc.metadata or {}
        title = " - ".join(
            metadata[key]
            for key in ("title", "author", "subject")
            if metadata.get(key)
        )

        text = "\n".join(
            page.get_text()
            for page in doc  # pyright: ignore[reportAttributeAccessIssue]
        )
    finally:
        doc.close()

    # build list of messages and filter messages which are known to not be relevant
    # in our use-case
    mupdf_messages = "\n".join(
        warning
        for warning in pymupdf.TOOLS.mupdf_warnings().splitlines()
        if warning not in IGNORED_MUPDF_MESSAGES
    )

    if mupdf_messages:
        logger.warning(
            f"PyMuPDF issues:\n{mupdf_messages}"
        )  # pragma: no cover (no known error in test PDFs)

    return IndexData(
        title=title,
        content=text,
    )
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
Placeholder Documentation | ||
This document is a placeholder for the appropriate documentation. | ||
|
Binary file not shown.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.