From 03ac98bd4d2301b3264847c19894289aa76940c9 Mon Sep 17 00:00:00 2001 From: renaud gaudin Date: Tue, 16 Jun 2020 11:12:21 +0000 Subject: [PATCH 1/7] Fixed #29: using default values on Creator init --- libzim/writer.py | 2 +- tests/test_libzim.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/libzim/writer.py b/libzim/writer.py index 2a3893dc..07c17042 100644 --- a/libzim/writer.py +++ b/libzim/writer.py @@ -135,7 +135,7 @@ class Creator: Zim file metadata """ - def __init__(self, filename, main_page, index_language, min_chunk_size): + def __init__(self, filename, main_page, index_language="eng", min_chunk_size=2048): self._creatorWrapper = _Creator(filename, main_page, index_language, min_chunk_size) self.filename = filename self.main_page = main_page diff --git a/tests/test_libzim.py b/tests/test_libzim.py index 37979166..f9bd34e5 100644 --- a/tests/test_libzim.py +++ b/tests/test_libzim.py @@ -184,3 +184,11 @@ def test_double_close(tmpdir): creator.close() with pytest.raises(RuntimeError): creator.close() + + +def test_default_creator_params(tmpdir): + """ ensure we can init a Creator without specifying all params """ + creator = Creator(str(tmpdir / "test.zim"), "welcome") + assert True # we could init the Creator without specifying other params + assert creator.language == "eng" + assert creator.main_page == "welcome" From 0080f5873918ef3494e99ccfa815c9372157b533 Mon Sep 17 00:00:00 2001 From: renaud gaudin Date: Tue, 16 Jun 2020 11:47:27 +0000 Subject: [PATCH 2/7] Fixed #37: fixed __repr__ for ReadArticle and File, added on Article --- libzim/wrapper.pyx | 4 ++-- libzim/writer.py | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/libzim/wrapper.pyx b/libzim/wrapper.pyx index ab8e5e0e..0ec7c464 100644 --- a/libzim/wrapper.pyx +++ b/libzim/wrapper.pyx @@ -335,7 +335,7 @@ cdef class ReadArticle: return article def __repr__(self): - return f"{self.__class__.__name__}(url={self.longurl}, title=)" + return f"{self.__class__.__name__}(url={self.longurl}, title={self.title})" @@ -575,4 +575,4 @@ cdef class FilePy: return dereference(search).get_matches_estimated() def __repr__(self): - return f"{self.__class__.__name__}(filename={self.filename}" + return f"{self.__class__.__name__}(filename={self.filename})" diff --git a/libzim/writer.py b/libzim/writer.py index 07c17042..a473f86f 100644 --- a/libzim/writer.py +++ b/libzim/writer.py @@ -63,6 +63,9 @@ def _get_data(self): def get_data(self): raise NotImplementedError + def __repr__(self): + return f"{self.__class__.__name__}(url={self.get_url()}, title={self.get_title()})" + class MetadataArticle(Article): def __init__(self, url, metadata_content): From 70a74b19063e18417c05557f80015dae4c8e3579 Mon Sep 17 00:00:00 2001 From: renaud gaudin Date: Tue, 16 Jun 2020 12:27:41 +0000 Subject: [PATCH 3/7] Fixed #38: raising NotFound on unavailable article raise RuntimeError on out-of-bound get_article_by_id() changed get_article_by_id signature to not use the reserved python keyword `id` --- libzim/reader.py | 2 +- libzim/wrapper.pyx | 17 +++++++++++------ tests/test_libzim_file_reader.py | 6 +++--- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/libzim/reader.py b/libzim/reader.py index bef2f925..fe095479 100644 --- a/libzim/reader.py +++ b/libzim/reader.py @@ -1,4 +1,4 @@ # flake8: noqa -from .wrapper import FilePy as File +from .wrapper import FilePy as File, NotFound from .wrapper import ReadArticle as Article diff --git a/libzim/wrapper.pyx b/libzim/wrapper.pyx index 0ec7c464..452eb3ae 100644 --- a/libzim/wrapper.pyx +++ b/libzim/wrapper.pyx @@ -31,6 +31,9 @@ from libcpp.memory cimport shared_ptr, make_shared, unique_ptr import datetime +class NotFound(RuntimeError): + pass + ######################### # Blob # ######################### @@ -392,13 +395,13 @@ cdef class FilePy: The Article object Raises ------ - RuntimeError + NotFound If an article with the provided long url is not found in the file """ # Read to a zim::Article cdef wrapper.Article art = self.c_file.getArticleByUrl(url.encode('UTF-8')) if not art.good(): - raise RuntimeError("Article not found for url") + raise NotFound("Article not found for url") article = ReadArticle.from_read_article(art) return article @@ -413,12 +416,12 @@ cdef class FilePy: article = self.get_article(f"M/{name}") return article.content - def get_article_by_id(self, id): + def get_article_by_id(self, article_id): """Get a ZimFileArticle with a copy of the file article by article id. Parameters ---------- - id : int + article_id : int The id of the article Returns ------- @@ -427,13 +430,15 @@ cdef class FilePy: Raises ------ RuntimeError + If there is a problem in retrieving article (ex: id is out of bound) + NotFound If an article with the provided id is not found in the file """ # Read to a zim::Article - cdef wrapper.Article art = self.c_file.getArticle( id) + cdef wrapper.Article art = self.c_file.getArticle( article_id) if not art.good(): - raise RuntimeError("Article not found for id") + raise NotFound("Article not found for id") article = ReadArticle.from_read_article(art) return article diff --git a/tests/test_libzim_file_reader.py b/tests/test_libzim_file_reader.py index 53cf6940..2f5c5d68 100644 --- a/tests/test_libzim_file_reader.py +++ b/tests/test_libzim_file_reader.py @@ -3,7 +3,7 @@ import pytest -from libzim.reader import File +from libzim.reader import File, NotFound DATA_DIR = Path(__file__).parent @@ -110,9 +110,9 @@ def test_search(reader): def test_get_wrong_article(reader): - with pytest.raises(RuntimeError): + with pytest.raises(RuntimeError): # out of range reader.get_article_by_id(reader.article_count + 100) - with pytest.raises(RuntimeError): + with pytest.raises(NotFound): reader.get_article("A/I_do_not_exists") From 2fb30e0e060465faae76ad04b44325ae1bd4be3b Mon Sep 17 00:00:00 2001 From: renaud gaudin Date: Tue, 16 Jun 2020 12:32:47 +0000 Subject: [PATCH 4/7] Fixed #30: get_metadata() now returns bytes mandatory metadata references gone, as per the spec --- libzim/wrapper.pyx | 10 +++++----- libzim/writer.py | 20 ++------------------ tests/test_libzim.py | 15 --------------- 3 files changed, 7 insertions(+), 38 deletions(-) diff --git a/libzim/wrapper.pyx b/libzim/wrapper.pyx index 452eb3ae..e1b67cc0 100644 --- a/libzim/wrapper.pyx +++ b/libzim/wrapper.pyx @@ -31,6 +31,7 @@ from libcpp.memory cimport shared_ptr, make_shared, unique_ptr import datetime + class NotFound(RuntimeError): pass @@ -407,14 +408,13 @@ cdef class FilePy: return article def get_metadata(self, name): - """Get the file metadata. + """Get a metadata Returns ------- - dict - A dictionary with the file metadata + bytes + Metadata article's content. Can be of any type. """ - article = self.get_article(f"M/{name}") - return article.content + return bytes(self.get_article(f"M/{name}").content) def get_article_by_id(self, article_id): """Get a ZimFileArticle with a copy of the file article by article id. diff --git a/libzim/writer.py b/libzim/writer.py index a473f86f..7bd9f658 100644 --- a/libzim/writer.py +++ b/libzim/writer.py @@ -98,17 +98,6 @@ def get_data(self): return Blob(self.metadata_content) -MANDATORY_METADATA_KEYS = [ - "Name", - "Title", - "Creator", - "Publisher", - "Date", - "Description", - "Language", -] - - def pascalize(keyword): """ Converts python case to pascal case. example: long_description-> LongDescription """ return "".join(keyword.title().split("_")) @@ -165,13 +154,8 @@ def _update_article_counter(self, article): # default dict update self._article_counter[article.get_mime_type().strip()] += 1 - def mandatory_metadata_ok(self): - """Flag if mandatory metadata is complete and not empty""" - metadata_item_ok = [k in self._metadata for k in MANDATORY_METADATA_KEYS] - return all(metadata_item_ok) - - def update_metadata(self, **kwargs): - "Updates article metadata" "" + def update_metadata(self, **kwargs: str): + """ Updates Creator metadata for ZIM, supplied as keyword arguments """ new_metadata = {pascalize(k): v for k, v in kwargs.items()} self._metadata.update(new_metadata) diff --git a/tests/test_libzim.py b/tests/test_libzim.py index f9bd34e5..55b04054 100644 --- a/tests/test_libzim.py +++ b/tests/test_libzim.py @@ -128,21 +128,6 @@ def test_article_metadata(tmpdir, metadata): assert zim_creator._metadata == metadata -def test_check_mandatory_metadata(tmpdir): - with Creator( - str(tmpdir / "test.zim"), main_page="welcome", index_language="eng", min_chunk_size=2048, - ) as zim_creator: - assert not zim_creator.mandatory_metadata_ok() - zim_creator.update_metadata( - creator="python-libzim", - description="Created in python", - name="Hola", - publisher="Monadical", - title="Test Zim", - ) - assert zim_creator.mandatory_metadata_ok() - - def test_creator_params(tmpdir): path = str(tmpdir / "test.zim") main_page = "welcome" From 58d777fab1439069c5ecd17e74ac241a9aaef94e Mon Sep 17 00:00:00 2001 From: renaud gaudin Date: Tue, 16 Jun 2020 13:47:56 +0000 Subject: [PATCH 5/7] Fixed #45: Creator now accepts pathlib for filename Creator converts passed filename to str before passing it to wrapper. This allows transparent use of pathlib.Path Also, Creator.filename now returns a pathlib.Path, created on __init__ from the wrapper returned filename. Tests updated to use pathlib by default and check .filename type. Test ensures passing regular str still works. --- libzim/wrapper.pyx | 9 +++++---- libzim/writer.py | 8 ++++---- tests/test_libzim.py | 25 ++++++++++++++++++------- tests/test_libzim_file_reader.py | 3 ++- 4 files changed, 29 insertions(+), 16 deletions(-) diff --git a/libzim/wrapper.pyx b/libzim/wrapper.pyx index e1b67cc0..8219cfc8 100644 --- a/libzim/wrapper.pyx +++ b/libzim/wrapper.pyx @@ -29,6 +29,7 @@ from libcpp.string cimport string from libcpp cimport bool from libcpp.memory cimport shared_ptr, make_shared, unique_ptr +import pathlib import datetime @@ -356,14 +357,14 @@ cdef class FilePy: ---------- *c_file : File a pointer to a C++ File object - _filename : str + _filename : pathlib.Path the file name of the File Reader object """ cdef wrapper.File *c_file cdef object _filename - def __cinit__(self, str filename): + def __cinit__(self, object filename): """Constructs a File from full zim file path. Parameters ---------- @@ -371,8 +372,8 @@ cdef class FilePy: Full path to a zim file """ - self.c_file = new wrapper.File(filename.encode('UTF-8')) - self._filename = self.c_file.getFilename().decode("UTF-8", "strict") + self.c_file = new wrapper.File(str(filename).encode('UTF-8')) + self._filename = pathlib.Path(self.c_file.getFilename().decode("UTF-8", "strict")) def __dealloc__(self): if self.c_file != NULL: diff --git a/libzim/writer.py b/libzim/writer.py index 7bd9f658..3b43f0d5 100644 --- a/libzim/writer.py +++ b/libzim/writer.py @@ -17,7 +17,7 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . - +import pathlib import datetime from collections import defaultdict @@ -113,7 +113,7 @@ class Creator: a pointer to the C++ Creator object _finalized : bool flag if the creator was finalized - _filename : str + _filename : pathlib.Path Zim file path _main_page : str Zim file main page @@ -128,8 +128,8 @@ class Creator: """ def __init__(self, filename, main_page, index_language="eng", min_chunk_size=2048): - self._creatorWrapper = _Creator(filename, main_page, index_language, min_chunk_size) - self.filename = filename + self._creatorWrapper = _Creator(str(filename), main_page, index_language, min_chunk_size) + self.filename = pathlib.Path(filename) self.main_page = main_page self.language = index_language self._metadata = {} diff --git a/tests/test_libzim.py b/tests/test_libzim.py index 55b04054..9552baa8 100644 --- a/tests/test_libzim.py +++ b/tests/test_libzim.py @@ -17,6 +17,7 @@ # along with this program. If not, see . import pytest +import pathlib from libzim.writer import Article, Blob, Creator from libzim.reader import File @@ -108,7 +109,7 @@ def article(article_content): def test_write_article(tmpdir, article): with Creator( - str(tmpdir / "test.zim"), main_page="welcome", index_language="eng", min_chunk_size=2048, + tmpdir / "test.zim", main_page="welcome", index_language="eng", min_chunk_size=2048, ) as zim_creator: zim_creator.add_article(article) zim_creator.update_metadata( @@ -122,14 +123,14 @@ def test_write_article(tmpdir, article): def test_article_metadata(tmpdir, metadata): with Creator( - str(tmpdir / "test.zim"), main_page="welcome", index_language="eng", min_chunk_size=2048, + tmpdir / "test.zim", main_page="welcome", index_language="eng", min_chunk_size=2048, ) as zim_creator: zim_creator.update_metadata(**metadata) assert zim_creator._metadata == metadata def test_creator_params(tmpdir): - path = str(tmpdir / "test.zim") + path = tmpdir / "test.zim" main_page = "welcome" main_page_url = f"A/{main_page}" index_language = "eng" @@ -148,7 +149,7 @@ def test_creator_params(tmpdir): def test_segfault_on_realloc(tmpdir): """ assert that we are able to delete an unclosed Creator #31 """ - creator = Creator(str(tmpdir / "test.zim"), "welcome", "eng", 2048) + creator = Creator(tmpdir / "test.zim", "welcome", "eng", 2048) del creator # used to segfault assert True @@ -157,7 +158,7 @@ def test_noleftbehind_empty(tmpdir): """ assert that ZIM with no articles don't leave files behind #41 """ fname = "test_empty.zim" with Creator( - str(tmpdir / fname), main_page="welcome", index_language="eng", min_chunk_size=2048, + tmpdir / fname, main_page="welcome", index_language="eng", min_chunk_size=2048, ) as zim_creator: print(zim_creator) @@ -165,7 +166,7 @@ def test_noleftbehind_empty(tmpdir): def test_double_close(tmpdir): - creator = Creator(str(tmpdir / "test.zim"), "welcome", "eng", 2048) + creator = Creator(tmpdir / "test.zim", "welcome", "eng", 2048) creator.close() with pytest.raises(RuntimeError): creator.close() @@ -173,7 +174,17 @@ def test_double_close(tmpdir): def test_default_creator_params(tmpdir): """ ensure we can init a Creator without specifying all params """ - creator = Creator(str(tmpdir / "test.zim"), "welcome") + creator = Creator(tmpdir / "test.zim", "welcome") assert True # we could init the Creator without specifying other params assert creator.language == "eng" assert creator.main_page == "welcome" + + +def test_filename_param_types(tmpdir): + path = tmpdir / "test.zim" + with Creator(path, "welcome") as creator: + assert creator.filename == path + assert isinstance(creator.filename, pathlib.Path) + with Creator(str(path), "welcome") as creator: + assert creator.filename == path + assert isinstance(creator.filename, pathlib.Path) diff --git a/tests/test_libzim_file_reader.py b/tests/test_libzim_file_reader.py index 2f5c5d68..13edece1 100644 --- a/tests/test_libzim_file_reader.py +++ b/tests/test_libzim_file_reader.py @@ -10,7 +10,7 @@ ZIMFILES = [ { - "filename": str(DATA_DIR / "wikipedia_es_physics_mini.zim"), + "filename": DATA_DIR / "wikipedia_es_physics_mini.zim", "checksum": "99ea7a5598c6040c4f50b8ac0653b703", "namespaces": "-AIMX", "article_count": 22027, @@ -43,6 +43,7 @@ def article_data(): def test_zim_filename(reader, zimdata): for k, v in zimdata.items(): assert getattr(reader, k) == v + assert isinstance(reader.filename, Path) def test_zim_read(reader, article_data): From 94882c2e302022183dd3716fca46d420376f6924 Mon Sep 17 00:00:00 2001 From: renaud gaudin Date: Wed, 24 Jun 2020 20:29:09 +0000 Subject: [PATCH 6/7] Cleaner writer API - fixed, added or updated docstrings - added type annotations to ease self-discovery - made `mandatory_metadata_ok` a property - added a `_closed` boolean prop in creator to record finalization status - fixed the deletion issue after a contextmanager auto-close (using `self._closed`) - simplify public API by moving single-use method: - `_get_counter_string()` and `write_metadata()` to `close()` - `_update_article_counter()` to `add_article()` - `pascalize()` to `update_metadata()` - `write_metadata()` to autmatically convert Date from datetime.datetime as well --- libzim/writer.py | 193 ++++++++++++++++++++++++++++++----------------- 1 file changed, 123 insertions(+), 70 deletions(-) diff --git a/libzim/writer.py b/libzim/writer.py index 3b43f0d5..8c363423 100644 --- a/libzim/writer.py +++ b/libzim/writer.py @@ -1,3 +1,16 @@ +""" libzim writer module + - Creator to create ZIM files + - Article to store ZIM articles metadata + - Blob to store ZIM article content + Usage: + with Creator(pathlib.Path("myfile.zim"), main_page="welcome.html") as xf: + article = MyArticleSubclass( + url="A/welcome.html", + title="My Title", + content=Blob("My content")) + zf.add_article(article) + zf.update_metadata(tags="new;demo") """ + # This file is part of python-libzim # (see https://github.com/libzim/python-libzim) # @@ -19,7 +32,7 @@ import pathlib import datetime -from collections import defaultdict +import collections from .wrapper import Creator as _Creator from .wrapper import WritingBlob as Blob @@ -28,113 +41,132 @@ class Article: + """ Article stub to override + + Pass a subclass of it to Creator.add_article() """ + def __init__(self): self._blob = None - def get_url(self): + def get_url(self) -> str: + """ Full URL of article including namespace """ raise NotImplementedError - def get_title(self): + def get_title(self) -> str: + """ Article title. Might be indexed and used in suggestions """ raise NotImplementedError - def is_redirect(self): + def is_redirect(self) -> bool: + """ Whether this redirects to another article (cf. redirec_url) """ raise NotImplementedError - def get_mime_type(self): + def get_mime_type(self) -> str: + """ MIME-type of the article's content. A/ namespace reserved to text/html """ raise NotImplementedError - def get_filename(self): + def get_filename(self) -> str: + """ Filename to get content from. Blank string "" if not used """ raise NotImplementedError - def should_compress(self): + def should_compress(self) -> bool: + """ Whether the article's content should be compressed or not """ raise NotImplementedError - def should_index(self): + def should_index(self) -> bool: + """ Whether the article's content should be indexed or not """ raise NotImplementedError - def redirect_url(self): + def redirect_url(self) -> str: + """ Full URL including namespace of another article """ raise NotImplementedError - def _get_data(self): + def _get_data(self) -> Blob: + """ Internal data-retrieval with a cache to the content's pointer + + You don't need to override this """ if self._blob is None: self._blob = self.get_data() return self._blob - def get_data(self): + def get_data(self) -> Blob: + """ Blob containing the complete content of the article """ raise NotImplementedError - def __repr__(self): + def __repr__(self) -> str: return f"{self.__class__.__name__}(url={self.get_url()}, title={self.get_title()})" class MetadataArticle(Article): - def __init__(self, url, metadata_content): + """ Simple Article sub-class for key-value articles on M/ metadata namespace """ + + def __init__(self, url: str, metadata_content: str): Article.__init__(self) self.url = url self.metadata_content = metadata_content - def is_redirect(self): + def is_redirect(self) -> bool: return False - def get_url(self): + def get_url(self) -> str: return f"M/{self.url}" - def get_title(self): + def get_title(self) -> str: return "" - def get_mime_type(self): + def get_mime_type(self) -> str: return "text/plain" - def get_filename(self): + def get_filename(self) -> str: return "" - def should_compress(self): + def should_compress(self) -> bool: return True - def should_index(self): + def should_index(self) -> bool: return False - def get_data(self): + def get_data(self) -> Blob: return Blob(self.metadata_content) -def pascalize(keyword): - """ Converts python case to pascal case. example: long_description-> LongDescription """ - return "".join(keyword.title().split("_")) - - class Creator: - """ - A class to represent a Zim Creator. - - Attributes - ---------- - *c_creator : zim.Creator - a pointer to the C++ Creator object - _finalized : bool - flag if the creator was finalized - _filename : pathlib.Path - Zim file path - _main_page : str - Zim file main page - _index_language : str - Zim file Index language - _min_chunk_size : str - Zim file minimum chunk size - _article_counter - Zim file article counter - _metadata - Zim file metadata - """ - - def __init__(self, filename, main_page, index_language="eng", min_chunk_size=2048): + """ Zim Creator. + + Attributes + ---------- + *_creatorWrapper : wrapper.ZimCreatorWrapper + a pointer to the C++ Creator object wrapper + filename : pathlib.Path + Zim file path + main_page : str + Zim file main page (without namespace) + language : str + Zim file Index language + _article_counter + Zim file article counter + _metadata + Zim file metadata """ + + def __init__( + self, filename: pathlib.Path, main_page: str, index_language: str = "eng", min_chunk_size: int = 2048, + ): + """ Creates a ZIM Creator + + Parameters + ---------- + filename : Path to create the ZIM file at + main_page: ZIM file main article URL (without namespace, must be in A/) + index_language: content language to inform indexer with (ISO-639-3) + min_chunk_size: minimum size of chunks for compression """ + self._creatorWrapper = _Creator(str(filename), main_page, index_language, min_chunk_size) self.filename = pathlib.Path(filename) self.main_page = main_page self.language = index_language self._metadata = {} - self._article_counter = defaultdict(int) + self._article_counter = collections.defaultdict(int) self.update_metadata(date=datetime.date.today(), language=index_language) + self._closed = False def __enter__(self): return self @@ -143,38 +175,59 @@ def __exit__(self, *args): self.close() def __del__(self): - self.close() - - def add_article(self, article): + if not self._closed: + self.close() + + def add_article(self, article: Article): + """ Adds an article to the Creator. + + Parameters + ---------- + article : Zim writer Article + The article to add to the file + Raises + ------ + RuntimeError + If the ZimCreator was already finalized """ self._creatorWrapper.add_article(article) if not article.is_redirect(): - self._update_article_counter(article) - - def _update_article_counter(self, article): - # default dict update - self._article_counter[article.get_mime_type().strip()] += 1 + # update article counter + self._article_counter[article.get_mime_type().strip()] += 1 def update_metadata(self, **kwargs: str): """ Updates Creator metadata for ZIM, supplied as keyword arguments """ - new_metadata = {pascalize(k): v for k, v in kwargs.items()} - self._metadata.update(new_metadata) - def write_metadata(self): + def pascalize(keyword: str): + """ Converts python case to pascal case. + + example: long_description -> LongDescription """ + return "".join(keyword.title().split("_")) + + self._metadata.update({pascalize(k): v for k, v in kwargs.items()}) + + def close(self): + """ Finalizes and writes added articles to the file + + Raises + ------ + RuntimeError + If the ZimCreator was already finalized """ + if self._closed: + raise RuntimeError("Creator already closed") + + # Store _medtadata dict as MetadataArticle for key, value in self._metadata.items(): - if key == "Date" and isinstance(value, datetime.date): + if key == "Date" and isinstance(value, (datetime.date, datetime.datetime)): value = value.strftime("%Y-%m-%d") article = MetadataArticle(key, value) self._creatorWrapper.add_article(article) - article = MetadataArticle("Counter", self._get_counter_string()) + counter_str = ";".join([f"{k}={v}" for (k, v) in self._article_counter.items()]) + article = MetadataArticle("Counter", counter_str) self._creatorWrapper.add_article(article) - def _get_counter_string(self): - return ";".join(["%s=%s" % (k, v) for (k, v) in self._article_counter.items()]) - - def close(self): - self.write_metadata() self._creatorWrapper.finalize() + self._closed = True - def __repr__(self): + def __repr__(self) -> str: return f"Creator(filename={self.filename})" From 52e9e38b3f7bb0e212823d524be41210fffb0f79 Mon Sep 17 00:00:00 2001 From: renaud gaudin Date: Fri, 19 Jun 2020 22:10:28 +0000 Subject: [PATCH 7/7] Cleaner reader API - added types annotations in docstring as cython doesn't forward it to gen code - added a dummy __enter__ and __exit__ so user can use it as a contextmanager for clarity --- libzim/reader.py | 14 +- libzim/wrapper.pyx | 445 +++++++++++++++++++++------------------------ 2 files changed, 222 insertions(+), 237 deletions(-) diff --git a/libzim/reader.py b/libzim/reader.py index fe095479..2a1f4d03 100644 --- a/libzim/reader.py +++ b/libzim/reader.py @@ -1,4 +1,16 @@ -# flake8: noqa +""" libzim reader module + + - File to open and read ZIM files + - Article are returned by File on get_article() and get_article_by_id() + - NotFound is raised on incorrect article URL query + + Usage: + with File(pathlib.Path("myfile.zim")) as zf: + article = zf.get_article(zf.main_page_url) + print(f"Article {article.title} at {article.url} is {article.content.nbytes}b") + """ + +# flake8: noqa from .wrapper import FilePy as File, NotFound from .wrapper import ReadArticle as Article diff --git a/libzim/wrapper.pyx b/libzim/wrapper.pyx index 8219cfc8..5f3a9b16 100644 --- a/libzim/wrapper.pyx +++ b/libzim/wrapper.pyx @@ -20,6 +20,7 @@ cimport libzim.wrapper as wrapper +from typing import Generator from cython.operator import dereference, preincrement from cpython.ref cimport PyObject from cpython.buffer cimport PyBUF_WRITABLE @@ -140,33 +141,31 @@ cdef public api: return func() cdef class Creator: - """ - A class to represent a Zim Creator. + """ Zim Creator - Attributes - ---------- - *c_creator : zim.ZimCreator - a pointer to the C++ Creator object - _finalized : bool - flag if the creator was finalized - """ + Attributes + ---------- + *c_creator : zim.ZimCreatorWrapper + a pointer to the C++ Creator object + _finalized : bool + flag if the creator was finalized """ cdef wrapper.ZimCreatorWrapper *c_creator cdef bool _finalized - def __cinit__(self, str filename, str main_page = "", str index_language = "eng", min_chunk_size = 2048): - """Constructs a ZimCreator from parameters. - Parameters - ---------- - filename : str - Zim file path - main_page : str - Zim file main page - index_language : str - Zim file index language (default eng) - min_chunk_size : int - Minimum chunk size (default 2048) - """ + def __cinit__(self, str filename: str, str main_page: str = "", str index_language: str = "eng", min_chunk_size: int = 2048): + """ Constructs a Zim Creator from parameters. + + Parameters + ---------- + filename : str + Zim file path + main_page : str + Zim file main page (without namespace, must be in A/) + index_language : str + Zim file index language (default eng) + min_chunk_size : int + Minimum chunk size (default 2048) """ self.c_creator = wrapper.ZimCreatorWrapper.create(filename.encode("UTF-8"), main_page.encode("UTF-8"), index_language.encode("UTF-8"), min_chunk_size) self._finalized = False @@ -175,17 +174,16 @@ cdef class Creator: del self.c_creator def add_article(self, article not None): - """Add a article to the Creator object. - - Parameters - ---------- - article : ZimArticle - The article to add to the file - Raises - ------ - RuntimeError - If the ZimCreator was already finalized - """ + """ Add an article to the Creator object. + + Parameters + ---------- + article : ZimArticle + The article to add to the file + Raises + ------ + RuntimeError + If the ZimCreator was already finalized """ if self._finalized: raise RuntimeError("ZimCreator already finalized") @@ -196,13 +194,12 @@ cdef class Creator: self.c_creator.addArticle(art) def finalize(self): - """finalize and write added articles to the file. + """ Finalize creation and write added articles to the file. - Raises - ------ - RuntimeError - If the ZimCreator was already finalized - """ + Raises + ------ + RuntimeError + If the ZimCreator was already finalized """ if self._finalized: raise RuntimeError("ZimCreator already finalized") with nogil: @@ -214,36 +211,12 @@ cdef class Creator: ######################## cdef class ReadArticle: - """ - A class to represent a Zim File Article. - - Attributes - ---------- - *c_article : Article (zim::) - a pointer to the C++ article object - - Properties - ----------- - namespace : str - the article namespace - title : str - the article title - content : str - the article content - longurl : str - the article long url i.e {NAMESPACE}/{redirect_url} - url : str - the article url - mimetype : str - the article mimetype - is_redirect : bool - flag if the article is a redirect - - Methods - ------- - from_read_article(zim.Article art) - Creates a python ZimArticle from a C++ zim.Article article. - """ + """ Article in a Zim file + + Attributes + ---------- + *c_article : Article (zim::) + a pointer to the C++ article object """ cdef wrapper.Article c_article cdef ReadingBlob _blob cdef bool _haveBlob @@ -257,50 +230,47 @@ cdef class ReadArticle: self._haveBlob = False cdef __setup(self, wrapper.Article art): - """Assigns an internal pointer to the wrapped C++ article object. + """ Assigns an internal pointer to the wrapped C++ article object. - Parameters - ---------- - *art : Article - Pointer to a C++ (zim::) article object - """ + Parameters + ---------- + *art : Article + Pointer to a C++ (zim::) article object """ # Set new internal C zim.ZimArticle article self.c_article = art self._blob = None - - # Factory functions - Currently Cython can't use classmethods @staticmethod cdef from_read_article(wrapper.Article art): - """Creates a python ZimFileArticle from a C++ Article (zim::). - - Parameters - ---------- - art : Article - A C++ Article read with File - Return - ------ - - """ + """ Creates a python ReadArticle from a C++ Article (zim::) -> ReadArticle + + Parameters + ---------- + art : Article + A C++ Article read with File + Returns + ------ + ReadArticle + Casted article """ cdef ReadArticle article = ReadArticle() article.__setup(art) return article @property - def namespace(self): - """Get the article's namespace""" + def namespace(self) -> str: + """ Article's namespace -> str """ ns = self.c_article.getNamespace() return chr(ns) @property - def title(self): - """Get the article's title""" + def title(self) -> str: + """ Article's title -> str """ return self.c_article.getTitle().decode('UTF-8') @property - def content(self): - """Get the article's content""" + def content(self) -> memoryview: + """ Article's content -> memoryview """ if not self._haveBlob: self._blob = ReadingBlob() self._blob.__setup(self.c_article.getData( 0)) @@ -308,27 +278,27 @@ cdef class ReadArticle: return memoryview(self._blob) @property - def longurl(self): - """Get the article's long url i.e {NAMESPACE}/{url}""" + def longurl(self) -> str: + """ Article's long url, including namespace -> str""" return self.c_article.getLongUrl().decode("UTF-8", "strict") @property - def url(self): - """Get the article's url""" + def url(self) -> str: + """ Article's url without namespace -> str """ return self.c_article.getUrl().decode("UTF-8", "strict") @property - def mimetype(self): - """Get the article's mimetype""" + def mimetype(self) -> str: + """ Article's mimetype -> str """ return self.c_article.getMimeType().decode('UTF-8') @property - def is_redirect(self): - """Get if the article is a redirect""" + def is_redirect(self) -> bool: + """ Whether article is a redirect -> bool """ return self.c_article.isRedirect() def get_redirect_article(self) -> ReadArticle: - """ Target ReadArticle of this one """ + """ Target ReadArticle of this one -> ReadArticle """ if not self.is_redirect: raise RuntimeError("Article is not a redirect") @@ -350,27 +320,25 @@ cdef class ReadArticle: ######################### cdef class FilePy: - """ - A class to represent a Zim File Reader. + """ Zim File Reader - Attributes - ---------- - *c_file : File - a pointer to a C++ File object - _filename : pathlib.Path - the file name of the File Reader object - """ + Attributes + ---------- + *c_file : File + a pointer to a C++ File object + _filename : pathlib.Path + the file name of the File Reader object """ cdef wrapper.File *c_file cdef object _filename - def __cinit__(self, object filename): - """Constructs a File from full zim file path. - Parameters - ---------- - filename : str - Full path to a zim file - """ + def __cinit__(self, object filename: pathlib.Path): + """ Constructs a File from full zim file path + + Parameters + ---------- + filename : pathlib.Path + Full path to a zim file """ self.c_file = new wrapper.File(str(filename).encode('UTF-8')) self._filename = pathlib.Path(self.c_file.getFilename().decode("UTF-8", "strict")) @@ -379,27 +347,32 @@ cdef class FilePy: if self.c_file != NULL: del self.c_file + def __enter__(self): + return self + + def __exit__(self, *args): + pass + @property - def filename(self): - """Get the filename of the File object""" + def filename(self) -> pathlib.Path: + """ Filename of the File object -> pathlib.Path """ return self._filename - def get_article(self, url): - """Get a Article with a copy of the file article by full url i.e including namespace - - Parameters - ---------- - url : str - The full url, including namespace, of the article - Returns - ------- - Article - The Article object - Raises - ------ - NotFound - If an article with the provided long url is not found in the file - """ + def get_article(self, url: str) -> ReadArticle: + """ ReadArticle with a copy of the file article by full url (including namespace) -> ReadArticle + + Parameters + ---------- + url : str + The full url, including namespace, of the article + Returns + ------- + ReadArticle + The ReadArticle object + Raises + ------ + NotFound + If an article with the provided long url is not found in the file """ # Read to a zim::Article cdef wrapper.Article art = self.c_file.getArticleByUrl(url.encode('UTF-8')) if not art.good(): @@ -408,17 +381,21 @@ cdef class FilePy: article = ReadArticle.from_read_article(art) return article - def get_metadata(self, name): - """Get a metadata - Returns - ------- - bytes - Metadata article's content. Can be of any type. - """ + def get_metadata(self, name: str) -> bytes: + """ A Metadata's content -> bytes + + Parameters + ---------- + name: str + url of the metadata article (without namespace) + Returns + ------- + bytes + Metadata article's content. Can be of any type. """ return bytes(self.get_article(f"M/{name}").content) - def get_article_by_id(self, article_id): - """Get a ZimFileArticle with a copy of the file article by article id. + def get_article_by_id(self, article_id: int) -> ReadArticle: + """ ReadArticle with a copy of the file article by article id -> ReadArticle Parameters ---------- @@ -426,15 +403,14 @@ cdef class FilePy: The id of the article Returns ------- - ZimFileArticle - The ZimFileArticle object + ReadArticle + The ReadArticle object Raises ------ RuntimeError - If there is a problem in retrieving article (ex: id is out of bound) + If there is a problem in retrieving article (id out of bound) NotFound - If an article with the provided id is not found in the file - """ + If an article with the provided id is not found in the file """ # Read to a zim::Article cdef wrapper.Article art = self.c_file.getArticle( article_id) @@ -445,14 +421,13 @@ cdef class FilePy: return article @property - def main_page_url(self): - """Get the file main page url. - Returns - ------- - str - The url of the main page - TODO Check old formats - """ + def main_page_url(self) -> str: + """ File's main page full url -> str + + Returns + ------- + str + The url of the main page, including namespace """ cdef wrapper.Fileheader header = self.c_file.getFileheader() cdef wrapper.Article article if header.hasMainPage(): @@ -467,61 +442,59 @@ cdef class FilePy: return article.getLongUrl().decode("UTF-8", "strict") @property - def checksum(self): - """Get the file checksum. - Returns - ------- - str - The file checksum - """ + def checksum(self) -> str: + """ File's checksum -> str + + Returns + ------- + str + The file's checksum """ return self.c_file.getChecksum().decode("UTF-8", "strict") @property - def article_count(self): - """Get the file article count. - Returns - ------- - int - The total number of articles from the file - """ + def article_count(self) -> int: + """ File's articles count -> int + + Returns + ------- + int + The total number of articles in the file """ return self.c_file.getCountArticles() @property def namespaces(self) -> str: - """Get the namespaces. + """ Namespaces present in the file -> str Returns ------- str - A string containing all namespaces in the file - - """ + A string containing initials of all namespaces in the file (ex: "-AIMX") """ return self.c_file.getNamespaces().decode("UTF-8", "strict") - def get_namespaces_count(self, str ns): - """Get article count from a namespaces. - Returns - ------- - int - The total number of articles from the namespace - """ + def get_namespaces_count(self, str ns) -> int: + """ Articles count within a namespace -> int + + Returns + ------- + int + The total number of articles in the namespace """ return self.c_file.getNamespaceCount(ord(ns[0])) - def suggest(self, query, start=0, end=10): - """Get an iterator of the full urls of suggested articles in the file from a title query. - Parameters - ---------- - query : str - Title query string - start : int - Iterator start (default 0) - end : end - Iterator end (default 10) - Returns - ------- - iterator - An interator with the urls of suggested articles starting from start position - """ + def suggest(self, query: str, start: int = 0, end: int = 10) -> Generator[str, None, None]: + """ Full urls of suggested articles in the file from a title query -> Generator[str, None, None] + + Parameters + ---------- + query : str + Title query string + start : int + Iterator start (default 0) + end : end + Iterator end (default 10) + Returns + ------- + Generator + Url of suggested article """ cdef unique_ptr[wrapper.Search] search = self.c_file.suggestions(query.encode('UTF-8'),start, end) cdef wrapper.search_iterator it = dereference(search).begin() @@ -529,21 +502,21 @@ cdef class FilePy: yield it.get_url().decode('UTF-8') preincrement(it) - def search(self, query, start=0, end=10): - """Get an iterator of the full urls of articles in the file from a search query. - Parameters - ---------- - query : str - Query string - start : int - Iterator start (default 0) - end : end - Iterator end (default 10) - Returns - ------- - iterator - An iterator with the urls of articles matching the search query starting from start position - """ + def search(self, query: str, start: int = 0, end: int = 10) -> Generator[str, None, None]: + """ Full urls of articles in the file from a search query -> Generator[str, None, None] + + Parameters + ---------- + query : str + Query string + start : int + Iterator start (default 0) + end : end + Iterator end (default 10) + Returns + ------- + Generator + Url of article matching the search query """ cdef unique_ptr[wrapper.Search] search = self.c_file.search(query.encode('UTF-8'),start, end) cdef wrapper.search_iterator it = dereference(search).begin() @@ -552,33 +525,33 @@ cdef class FilePy: yield it.get_url().decode('UTF-8') preincrement(it) - def get_search_results_count(self, query): - """Get search results counts for a query. - Parameters - ---------- - query : str - Query string - Returns - ------- - int - Number of search results - """ + def get_search_results_count(self, query: str) -> int: + """ Number of search results for a query -> int + + Parameters + ---------- + query : str + Query string + Returns + ------- + int + Number of search results """ cdef unique_ptr[wrapper.Search] search = self.c_file.search(query.encode('UTF-8'),0, 1) return dereference(search).get_matches_estimated() - def get_suggestions_results_count(self, query): - """Get suggestions results counts for a query. - Parameters - ---------- - query : str - Query string - Returns - ------- - int - Number of article suggestions - """ + def get_suggestions_results_count(self, query: str) -> int: + """ Number of suggestions for a query -> int + + Parameters + ---------- + query : str + Query string + Returns + ------- + int + Number of article suggestions """ cdef unique_ptr[wrapper.Search] search = self.c_file.suggestions(query.encode('UTF-8'),0 , 1) return dereference(search).get_matches_estimated() - def __repr__(self): + def __repr__(self) -> str: return f"{self.__class__.__name__}(filename={self.filename})"