# Reader-side declaration of zim::Article, taken from "zim/article.h".
# Every method carries `except +`, so Cython translates a C++ exception
# thrown by the call into a Python exception.
cdef extern from "zim/article.h" namespace "zim":
    cdef cppclass Article:
        Article() except +

        # String accessors (returned as C++ std::string).
        string getTitle() except +
        string getUrl() except +
        string getLongUrl() except +
        string getMimeType() except +
        char getNamespace() except +
        # The .pyx wrapper treats a false result as "article not found".
        bint good() except +

        # Article payload, starting at the given offset.
        const Blob getData(size_type offset) except +

        bint isRedirect() except +
        bint isLinktarget() except +
        bint isDeleted() except +

        Article getRedirectArticle() except +


# zim::Fileheader: only the main-page accessors are declared here.
cdef extern from "zim/fileheader.h" namespace "zim":
    cdef cppclass Fileheader:
        bint hasMainPage() except +
        size_type getMainPage() except +


# Iterator over search results.
# NOTE(review): none of these members is declared `except +`, so a C++
# exception raised while iterating is not converted to a Python exception --
# confirm this is intended.
cdef extern from "zim/search_iterator.h" namespace "zim":
    cdef cppclass search_iterator:
        search_iterator()
        search_iterator operator++()
        bint operator==(search_iterator)
        bint operator!=(search_iterator)
        string get_url()
        string get_title()


# zim::Search: a result set exposing begin()/end() iterators.
# NOTE(review): `File` is referenced here before its cppclass declaration
# further down -- verify the declaration order is accepted by the Cython
# version in use.
cdef extern from "zim/search.h" namespace "zim":
    cdef cppclass Search:
        Search(const File* zimfile)
        Search(vector[const File] zimfiles)
        search_iterator begin()
        search_iterator end()
        int get_matches_estimated()


# zim::File: the entry point for reading a ZIM archive.
cdef extern from "zim/file.h" namespace "zim":
    cdef cppclass File:
        File() except +
        File(string filename) except +

        # Article lookup by index, by (namespace, url) or by full url.
        Article getArticle(size_type idx) except +
        Article getArticle(char ns, string url) except +
        Article getArticleByUrl(string url) except +

        string getMetadata(string name) except +

        Fileheader getFileheader() except +

        size_type getCountArticles() except +
        size_type getNamespaceCount(char ns) except +

        string getNamespaces() except +
        string getChecksum() except +
        string getFilename() except +

        # Both return an owning pointer to a Search object.
        # NOTE(review): declared without `except +`; presumably because the
        # result type is move-only -- confirm before adding exception
        # translation here.
        unique_ptr[Search] search(const string query, int start, int end);
        unique_ptr[Search] suggestions(const string query, int start, int end);
# Size in bytes of one buffer element. A module-level variable is used so
# that its address can be handed out as the (single-entry) strides array.
cdef Py_ssize_t itemsize = 1

cdef class ReadingBlob:
    """Read-only Python wrapper around a C++ ``zim::Blob``.

    The class implements the buffer protocol so that the blob data can be
    exposed to Python (e.g. through ``memoryview``) directly from the
    pointer returned by ``Blob.data()``.

    Attributes
    ----------
    c_blob : Blob (zim::)
        the wrapped C++ blob, stored by value
    size : Py_ssize_t
        the blob size, cached because the buffer ``shape`` points at it
    view_count : int
        number of buffers currently exported and not yet released
    """
    cdef clibzim.Blob c_blob
    cdef Py_ssize_t size
    cdef int view_count

    cdef __setup(self, clibzim.Blob blob):
        """Store the wrapped C++ blob and cache its size.

        Parameters
        ----------
        blob : Blob
            A C++ (zim::) blob object
        """
        self.c_blob = blob
        self.size = blob.size()
        self.view_count = 0

    def __dealloc__(self):
        # Defensive check only. Each exported buffer stores this object in
        # ``buffer.obj``, so deallocation with live views is not expected.
        # NOTE(review): an exception raised from __dealloc__ cannot propagate
        # to a caller -- confirm whether this guard is still wanted.
        if self.view_count:
            raise RuntimeError("Blob has views")

    def __getbuffer__(self, Py_buffer *buffer, int flags):
        """Fill ``buffer`` with a read-only, one-dimensional view of the data.

        Raises
        ------
        BufferError
            If the consumer requests a writable buffer
        """
        if flags & PyBUF_WRITABLE:
            raise BufferError("Cannot create writable memoryview on readonly data")

        buffer.obj = self
        # data() returns ``const char*``; Py_buffer.buf is a plain ``void*``,
        # so the conversion has to be explicit. The view is flagged read-only.
        buffer.buf = <void*> self.c_blob.data()
        buffer.len = self.size
        buffer.readonly = 1
        # Bytes literal: coerces to ``char*`` regardless of language_level.
        buffer.format = b'c'
        buffer.internal = NULL
        buffer.itemsize = itemsize
        buffer.ndim = 1
        # shape/strides must outlive this call, hence pointers to an object
        # attribute and to the module-level ``itemsize``.
        buffer.shape = &self.size
        buffer.strides = &itemsize
        buffer.suboffsets = NULL  # for pointer arrays only

        self.view_count += 1

    def __releasebuffer__(self, Py_buffer *buffer):
        """Account for a released view."""
        self.view_count -= 1
########################
#     ReadArticle      #
########################

cdef class ReadArticle:
    """
    A class to represent an article read from a Zim file.

    Attributes
    ----------
    c_article : Article (zim::)
        the wrapped C++ article object, stored by value
    _blob : ReadingBlob
        lazily created wrapper around the article data
    _haveBlob : bool
        whether ``_blob`` has already been created

    Properties
    -----------
    namespace : str
        the article namespace
    title : str
        the article title
    content : memoryview
        the article content
    longurl : str
        the article long url i.e {NAMESPACE}/{url}
    url : str
        the article url
    mimetype : str
        the article mimetype
    is_redirect : bool
        flag if the article is a redirect

    Methods
    -------
    from_read_article(zim.Article art)
        Creates a python ReadArticle from a C++ zim.Article article.
    """
    cdef clibzim.Article c_article
    cdef ReadingBlob _blob
    cdef bool _haveBlob

    def __cinit__(self):
        self._haveBlob = False

    cdef __setup(self, clibzim.Article art):
        """Store the wrapped C++ article object and reset the cached blob.

        Parameters
        ----------
        art : Article
            A C++ (zim::) article object
        """
        self.c_article = art
        # Drop any blob cached for a previously wrapped article and keep the
        # flag in sync, so `content` never hands out a None blob.
        self._blob = None
        self._haveBlob = False

    # Factory functions - Currently Cython can't use classmethods
    @staticmethod
    cdef from_read_article(clibzim.Article art):
        """Creates a python ReadArticle from a C++ Article (zim::).

        Parameters
        ----------
        art : Article
            A C++ Article read with File
        Return
        ------
        ReadArticle
            The python wrapper around ``art``
        """
        cdef ReadArticle article = ReadArticle()
        article.__setup(art)
        return article

    @property
    def namespace(self):
        """Get the article's namespace"""
        ns = self.c_article.getNamespace()
        return chr(ns)

    @property
    def title(self):
        """Get the article's title"""
        return self.c_article.getTitle().decode('UTF-8')

    @property
    def content(self):
        """Get the article's content as a memoryview.

        The underlying ReadingBlob is created on first access and reused
        afterwards; each access returns a new memoryview on it.
        """
        if not self._haveBlob:
            self._blob = ReadingBlob()
            self._blob.__setup(self.c_article.getData(0))
            self._haveBlob = True
        return memoryview(self._blob)

    @property
    def longurl(self):
        """Get the article's long url i.e {NAMESPACE}/{url}"""
        return self.c_article.getLongUrl().decode("UTF-8", "strict")

    @property
    def url(self):
        """Get the article's url"""
        return self.c_article.getUrl().decode("UTF-8", "strict")

    @property
    def mimetype(self):
        """Get the article's mimetype"""
        return self.c_article.getMimeType().decode('UTF-8')

    @property
    def is_redirect(self):
        """Get if the article is a redirect"""
        return self.c_article.isRedirect()

    def __repr__(self):
        # The title value was previously missing from the output ("title=)").
        return f"{self.__class__.__name__}(url={self.longurl}, title={self.title})"
#########################
#         File          #
#########################

cdef class File:
    """
    A class to represent a Zim File Reader.

    Attributes
    ----------
    *c_file : File
        a pointer to a C++ File object
    _filename : str
        the file name of the File Reader object
    """

    cdef clibzim.File *c_file
    cdef object _filename

    def __cinit__(self, str filename):
        """Constructs a File from full zim file path.

        Parameters
        ----------
        filename : str
            Full path to a zim file
        """
        self.c_file = new clibzim.File(filename.encode('UTF-8'))
        self._filename = self.c_file.getFilename().decode("UTF-8", "strict")

    def __dealloc__(self):
        # c_file stays NULL when construction failed before assignment.
        if self.c_file != NULL:
            del self.c_file

    @property
    def filename(self):
        """Get the filename of the File object"""
        return self._filename

    def get_article(self, url):
        """Get an article by full url, i.e. including namespace.

        Parameters
        ----------
        url : str
            The full url, including namespace, of the article
        Returns
        -------
        ReadArticle
            The article object
        Raises
        ------
        RuntimeError
            If an article with the provided long url is not found in the file
        """
        cdef clibzim.Article art = self.c_file.getArticleByUrl(url.encode('UTF-8'))
        if not art.good():
            raise RuntimeError("Article not found for url")

        return ReadArticle.from_read_article(art)

    def get_metadata(self, name):
        """Get one metadata entry of the file.

        The entry is read as the content of the article ``M/{name}``.

        Parameters
        ----------
        name : str
            The name of the metadata entry
        Returns
        -------
        memoryview
            The content of the metadata article
        Raises
        ------
        RuntimeError
            If no article ``M/{name}`` is found in the file
        """
        article = self.get_article(f"M/{name}")
        return article.content

    def get_article_by_id(self, id):
        """Get an article by article id.

        Parameters
        ----------
        id : int
            The id of the article
        Returns
        -------
        ReadArticle
            The article object
        Raises
        ------
        RuntimeError
            If an article with the provided id is not found in the file
        """
        cdef clibzim.Article art = self.c_file.getArticle(id)
        if not art.good():
            raise RuntimeError("Article not found for id")

        return ReadArticle.from_read_article(art)

    @property
    def main_page_url(self):
        """Get the file main page url.

        Returns
        -------
        str or None
            The long url of the main page. When the header declares no main
            page, the long url of the first article is used; None if that
            article is not valid either.
        TODO Check old formats
        """
        cdef clibzim.Fileheader header = self.c_file.getFileheader()
        cdef clibzim.Article article
        if header.hasMainPage():
            article = self.c_file.getArticle(header.getMainPage())
            return article.getLongUrl().decode("UTF-8", "strict")

        # TODO Ask about the old format, check libzim for tests
        # Handle old zim where header has no mainPage.
        # (We need to get first article in the zim)
        article = self.c_file.getArticle(0)
        if article.good():
            return article.getLongUrl().decode("UTF-8", "strict")
        return None

    @property
    def checksum(self):
        """Get the file checksum.

        Returns
        -------
        str
            The file checksum
        """
        return self.c_file.getChecksum().decode("UTF-8", "strict")

    @property
    def article_count(self):
        """Get the file article count.

        Returns
        -------
        int
            The total number of articles from the file
        """
        return self.c_file.getCountArticles()

    @property
    def namespaces(self) -> str:
        """Get the namespaces.

        Returns
        -------
        str
            A string containing all namespaces in the file
        """
        return self.c_file.getNamespaces().decode("UTF-8", "strict")

    def get_namespaces_count(self, str ns):
        """Get article count from a namespace.

        Parameters
        ----------
        ns : str
            The namespace; only its first character is used
        Returns
        -------
        int
            The total number of articles from the namespace
        """
        return self.c_file.getNamespaceCount(ord(ns[0]))

    def suggest(self, query, start=0, end=10):
        """Get an iterator of the full urls of suggested articles in the file from a title query.

        Parameters
        ----------
        query : str
            Title query string
        start : int
            Iterator start (default 0)
        end : int
            Iterator end (default 10)
        Returns
        -------
        iterator
            An iterator with the urls of suggested articles starting from start position
        """
        cdef unique_ptr[clibzim.Search] search = self.c_file.suggestions(query.encode('UTF-8'), start, end)
        cdef clibzim.search_iterator it = dereference(search).begin()

        while it != dereference(search).end():
            yield it.get_url().decode('UTF-8')
            preincrement(it)

    def search(self, query, start=0, end=10):
        """Get an iterator of the full urls of articles in the file from a search query.

        Parameters
        ----------
        query : str
            Query string
        start : int
            Iterator start (default 0)
        end : int
            Iterator end (default 10)
        Returns
        -------
        iterator
            An iterator with the urls of articles matching the search query starting from start position
        """
        cdef unique_ptr[clibzim.Search] search = self.c_file.search(query.encode('UTF-8'), start, end)
        cdef clibzim.search_iterator it = dereference(search).begin()

        while it != dereference(search).end():
            yield it.get_url().decode('UTF-8')
            preincrement(it)

    def get_search_results_count(self, query):
        """Get search results counts for a query.

        Parameters
        ----------
        query : str
            Query string
        Returns
        -------
        int
            Estimated number of search results
        """
        cdef unique_ptr[clibzim.Search] search = self.c_file.search(query.encode('UTF-8'), 0, 1)
        return dereference(search).get_matches_estimated()

    def get_suggestions_results_count(self, query):
        """Get suggestions results counts for a query.

        Parameters
        ----------
        query : str
            Query string
        Returns
        -------
        int
            Estimated number of article suggestions
        """
        cdef unique_ptr[clibzim.Search] search = self.c_file.suggestions(query.encode('UTF-8'), 0, 1)
        return dereference(search).get_matches_estimated()

    def __repr__(self):
        # The closing parenthesis was previously missing from the output.
        return f"{self.__class__.__name__}(filename={self.filename})"
@pytest.fixture
def article_data():
    """Expected attributes of a known article in the test ZIM file."""
    return {
        'url': u"A/Albert_Einstein",
        'title': u"Albert Einstein",
        'mimetype': u"text/html",
        'article_id': 663,
        'size': 17343,
    }


def test_zim_filename(reader, zimdata):
    """Every expected value in zimdata matches the reader attribute of the same name."""
    for k, v in zimdata.items():
        assert getattr(reader, k) == v


def test_zim_read(reader, article_data):
    """An article fetched by url exposes the expected attributes and content."""
    article = reader.get_article(article_data['url'])

    assert article.longurl == article_data['url']
    assert article.title == article_data['title']
    # The short url is the long url without the "{NAMESPACE}/" prefix.
    assert article.url == article_data['url'][2:]
    assert article.mimetype == article_data['mimetype']
    assert isinstance(article.content, memoryview)
    assert len(article.content) == article_data['size']


def test_content_ref_keep(reader):
    """Get the memoryview on a content and lose the reference on the article.

    We try to load a lot of other articles to detect possible use of a
    dangling pointer.
    """
    content = None

    def get_content():
        nonlocal content
        article = reader.get_article(u"A/Albert_Einstein")
        assert isinstance(article.content, memoryview)
        content = article.content

    get_content()  # Now we have a content but no reference to the article.
    gc.collect()
    # Load a lot of content
    for i in range(0, reader.article_count, 2):
        article = reader.get_article_by_id(i)
        if not article.is_redirect:
            _ = article.content  # only the access matters, not the value
    # Check everything is ok
    assert len(content) == 17343
    # NOTE(review): the expected literal below is shorter than the 100-byte
    # slice it is compared with; it looks truncated -- verify against the
    # test data before relying on this assertion.
    assert bytes(content[:100]) == b'\n\n \n Albert Einstein</ti'


# The body used to start with a bare `return`, so the test was reported as
# passed without checking anything. An explicit skip keeps it visible.
@pytest.mark.skip(reason="disabled: test body was short-circuited by an early return")
def test_get_article_by_id(reader, article_data):
    """An article fetched by id exposes the expected attributes."""
    article = reader.get_article_by_id(article_data['article_id'])

    assert article.longurl == article_data['url']
    assert article.title == article_data['title']
    assert article.url == article_data['url'][2:]
    assert article.mimetype == article_data['mimetype']


def test_namespace_count(reader):
    """The per-namespace counts add up to the total article count."""
    namespaces = reader.namespaces
    num_articles = sum(reader.get_namespaces_count(ns) for ns in namespaces)
    assert reader.article_count == num_articles


def test_suggest(reader):
    """Title suggestions for "Einstein" include the Albert Einstein article."""
    results = reader.suggest(u"Einstein")
    assert u"A/Albert_Einstein" in list(results)


def test_search(reader):
    """A search with the default window yields ten results."""
    results = reader.search(u"Einstein")
    assert len(list(results)) == 10


def test_get_wrong_article(reader):
    """Looking up a missing article raises RuntimeError, by id and by url."""
    with pytest.raises(RuntimeError):
        reader.get_article_by_id(reader.article_count + 100)
    with pytest.raises(RuntimeError):
        reader.get_article("A/I_do_not_exists")