Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added

- Add utility function to compute ZIM Tags #164, including deduplication #156
- Metadata now automatically drops control characters #159

### Fixed

Expand Down
16 changes: 16 additions & 0 deletions src/zimscraperlib/zim/creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@

import libzim.writer # pyright: ignore
import PIL.Image
import regex

from zimscraperlib import logger
from zimscraperlib.constants import (
Expand Down Expand Up @@ -65,6 +66,9 @@
re.MULTILINE | re.DOTALL,
)

# All control characters are disallowed in str metadata except \n, \r and \t
# (the negative lookahead excludes these three from the match).
# NOTE(review): \p{C} is the whole Unicode "Other" general category, not only
# control characters (Cc); it also covers format (Cf), surrogate (Cs),
# private-use (Co) and unassigned (Cn) code points, so e.g. zero-width joiners
# are matched as well -- confirm this is intended.
UNWANTED_CONTROL_CHARACTERS_REGEX = regex.compile(r"(?![\n\t\r])\p{C}")


def mimetype_for(
path: str,
Expand Down Expand Up @@ -250,6 +254,11 @@ def add_metadata(
content: str | bytes | datetime.date | datetime.datetime | Iterable[str],
mimetype: str = "text/plain;charset=UTF-8",
):
# drop control characters before passing them to libzim
if isinstance(content, str):
content = UNWANTED_CONTROL_CHARACTERS_REGEX.sub("", content).strip(
" \r\n\t"
)
if not self.disable_metadata_checks:
self.validate_metadata(name, content)
if name == "Date" and isinstance(content, (datetime.date, datetime.datetime)):
Expand Down Expand Up @@ -304,6 +313,13 @@ def config_metadata(
}
)
self._metadata.update(extras)
for metadata_key, metadata_value in self._metadata.items():
# drop control characters so that proper value is stored in memory and
# logged in DEBUG mode ; also strip blank characters
if isinstance(metadata_value, str):
self._metadata[metadata_key] = UNWANTED_CONTROL_CHARACTERS_REGEX.sub(
"", metadata_value
).strip(" \r\n\t")
return self

def config_dev_metadata(self, **extras: str):
Expand Down
51 changes: 51 additions & 0 deletions tests/zim/test_zim_creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -724,6 +724,57 @@ def test_config_metadata(tmp_path, png_image, tags):
assert reader.get_text_metadata("TestMetadata") == "Test Metadata"


def test_config_metadata_control_characters(tmp_path):
    """Check that unwanted control characters are dropped from str metadata.

    Two entry points are exercised with the same raw values: metadata passed
    through ``config_dev_metadata`` and metadata passed to ``add_metadata``
    (under ``*_1`` names). In both cases bell, backspace and vertical-tab
    characters must be removed, inner newline / carriage return / tab must be
    kept, and leading / trailing blank characters must be stripped.
    """
    fpath = tmp_path / "test_config.zim"
    creator = Creator(fpath, "").config_dev_metadata(
        Description="\t\n\r\n \tA description \awith \bcontrol characters\v",
        LongDescription="A description \rwith \a\ncontrol characters\tsss\t\n\r\n \t",
        Creator=" A creator ",
    )

    # values held in memory are cleaned as soon as they are configured
    assert creator._metadata["Description"] == "A description with control characters"
    assert (
        creator._metadata["LongDescription"]
        == "A description \rwith \ncontrol characters\tsss"
    )
    assert creator._metadata["Creator"] == "A creator"

    # same raw values, this time added directly while the creator is open
    with creator:
        creator.add_metadata(
            "Description_1",
            "\t\n\r\n \tA description \awith \bcontrol characters\v",
        )
        creator.add_metadata(
            "LongDescription_1",
            "A description \rwith \a\ncontrol characters\tsss\t\n\r\n \t",
        )
        creator.add_metadata(
            "Creator_1",
            " A creator ",
        )

    assert fpath.exists()

    # values read back from the ZIM file must be the cleaned ones
    reader = Archive(fpath)
    assert (
        reader.get_text_metadata("Description")
        == "A description with control characters"
    )
    assert (
        reader.get_text_metadata("LongDescription")
        == "A description \rwith \ncontrol characters\tsss"
    )
    assert reader.get_text_metadata("Creator") == "A creator"
    assert (
        reader.get_text_metadata("Description_1")
        == "A description with control characters"
    )
    assert (
        reader.get_text_metadata("LongDescription_1")
        == "A description \rwith \ncontrol characters\tsss"
    )
    assert reader.get_text_metadata("Creator_1") == "A creator"


@pytest.mark.parametrize(
"name,value,valid",
[
Expand Down