diff --git a/libzim/__init__.py b/libzim/__init__.py
index b5b0eec2..6dc3f8aa 100644
--- a/libzim/__init__.py
+++ b/libzim/__init__.py
@@ -16,8 +16,3 @@
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
-
-
-from libzim_wrapper import Blob
-
-__all__ = ["Blob"]
diff --git a/libzim/libzim_wrapper.pxd b/libzim/libzim_wrapper.pxd
index adf3a40c..c3aeba06 100644
--- a/libzim/libzim_wrapper.pxd
+++ b/libzim/libzim_wrapper.pxd
@@ -21,11 +21,16 @@ from cpython.ref cimport PyObject
from libc.stdint cimport uint32_t, uint64_t
from libcpp cimport bool
-from libcpp.memory cimport shared_ptr
+from libcpp.memory cimport shared_ptr, unique_ptr
from libcpp.string cimport string
from libcpp.vector cimport vector
+# Integer typedefs from zim/zim.h, declared to Cython as 64-bit unsigned values.
+cdef extern from "zim/zim.h" namespace "zim":
+    # size_type is the type used below in this file for article indexes,
+    # article counts and the getData() offset.
+    ctypedef uint64_t size_type
+    ctypedef uint64_t offset_type
+
+
cdef extern from "zim/blob.h" namespace "zim":
cdef cppclass Blob:
Blob() except +
@@ -34,19 +39,20 @@ cdef extern from "zim/blob.h" namespace "zim":
const char* end() except +
uint64_t size() except +
+
cdef extern from "zim/writer/url.h" namespace "zim::writer":
cdef cppclass Url:
string getLongUrl() except +
cdef extern from "zim/writer/article.h" namespace "zim::writer":
-    cdef cppclass Article:
-        const string getTitle() except +
-
+    # The writer-side base class is exposed to Cython as WriterArticle so that
+    # it does not clash with the reader-side zim::Article declared in this
+    # file. The C++ class itself is still zim::writer::Article (see the
+    # declaration being replaced), so the C name is spelled out explicitly;
+    # otherwise Cython would derive zim::writer::WriterArticle from the
+    # namespace and the Cython-side name.
+    cdef cppclass WriterArticle "zim::writer::Article":
+        pass
cdef extern from "lib.h":
-    cdef cppclass ZimArticleWrapper(Article):
+    # C++ shim from lib.h, constructed from a Python object pointer.
+    # getTitle() is already declared on this class, so it is not re-declared
+    # after being dropped from the base class.
+    cdef cppclass ZimArticleWrapper(WriterArticle):
         ZimArticleWrapper(PyObject *obj) except +
         const Url getUrl() except +
         const string getTitle() except +
         const bool isRedirect() except +
@@ -64,3 +70,72 @@ cdef extern from "lib.h":
void finalize() nogil except +
Url getMainUrl() except +
void setMainUrl(string) except +
+
+
+# Reader-side article type (zim::Article), as opposed to the writer-side
+# zim::writer article declared earlier in this file.
+cdef extern from "zim/article.h" namespace "zim":
+    cdef cppclass Article:
+        Article() except +
+
+        # String accessors; all of them are returned as C++ std::string.
+        string getTitle() except +
+        string getUrl() except +
+        string getLongUrl() except +
+        string getMimeType() except +
+        char getNamespace() except +
+        # The reader code calls good() to reject an article that was not found.
+        bint good() except +
+
+        # Article payload as a Blob, read from the given offset.
+        const Blob getData(size_type offset) except +
+
+        bint isRedirect() except +
+        bint isLinktarget() except +
+        bint isDeleted() except +
+
+        Article getRedirectArticle() except +
+
+
+# Header of a ZIM file; only the main-page accessors are declared here.
+cdef extern from "zim/fileheader.h" namespace "zim":
+    cdef cppclass Fileheader:
+        bint hasMainPage() except +
+        # The returned value is passed to File.getArticle(size_type) by the
+        # reader code, i.e. it is used as an article index.
+        size_type getMainPage() except +
+
+
+# Iterator over the results of a Search (returned by Search.begin()/end()).
+# NOTE(review): unlike most declarations in this file, none of these members
+# is marked `except +` -- confirm that they cannot throw C++ exceptions.
+cdef extern from "zim/search_iterator.h" namespace "zim":
+    cdef cppclass search_iterator:
+        search_iterator()
+        search_iterator operator++()
+        bint operator==(search_iterator)
+        bint operator!=(search_iterator)
+        # Url and title of the result the iterator currently points at.
+        string get_url()
+        string get_title()
+
+
+# Result set type returned (wrapped in a unique_ptr) by File.search() and
+# File.suggestions(), which are declared further down in this file.
+cdef extern from "zim/search.h" namespace "zim":
+    cdef cppclass Search:
+        # NOTE(review): File is only declared later in this file -- confirm
+        # that Cython resolves this forward reference.
+        Search(const File* zimfile)
+        # NOTE(review): a vector of const File *values* looks unusual --
+        # verify the element type against zim/search.h.
+        Search(vector[const File] zimfiles)
+        search_iterator begin()
+        search_iterator end()
+        int get_matches_estimated()
+
+
+# Reader entry point (zim::File): article lookup, metadata, counts and search.
+cdef extern from "zim/file.h" namespace "zim":
+    cdef cppclass File:
+        File() except +
+        File(string filename) except +
+
+        # Article lookup by index, by namespace + url, or by long url.
+        Article getArticle(size_type idx) except +
+        Article getArticle(char ns, string url) except +
+        Article getArticleByUrl(string url) except +
+
+        string getMetadata(string name) except +
+
+        Fileheader getFileheader() except +
+
+        size_type getCountArticles() except +
+        size_type getNamespaceCount(char ns) except +
+
+        string getNamespaces() except +
+        string getChecksum() except +
+        string getFilename() except +
+
+        # NOTE(review): these two are the only File members without
+        # `except +`. Presumably this is deliberate, because the result is a
+        # move-only unique_ptr -- confirm before adding exception translation.
+        unique_ptr[Search] search(const string query, int start, int end)
+        unique_ptr[Search] suggestions(const string query, int start, int end)
diff --git a/libzim/libzim_wrapper.pyx b/libzim/libzim_wrapper.pyx
index 21105bdc..baf98bc1 100644
--- a/libzim/libzim_wrapper.pyx
+++ b/libzim/libzim_wrapper.pyx
@@ -20,24 +20,22 @@
cimport libzim.libzim_wrapper as clibzim
-from cython.operator import dereference
+from cython.operator import dereference, preincrement
from cpython.ref cimport PyObject
+from cpython.buffer cimport PyBUF_WRITABLE
from libc.stdint cimport uint64_t
from libcpp.string cimport string
from libcpp cimport bool
-from libcpp.memory cimport shared_ptr, make_shared
+from libcpp.memory cimport shared_ptr, make_shared, unique_ptr
import datetime
-
-
-
#########################
# Blob #
#########################
-cdef class Blob:
+cdef class WritingBlob:
cdef clibzim.Blob* c_blob
cdef bytes ref_content
@@ -52,6 +50,50 @@ cdef class Blob:
if self.c_blob != NULL:
del self.c_blob
+# Size in bytes of one item of a ReadingBlob buffer. It lives at module level
+# because ReadingBlob.__getbuffer__ hands out its address as the strides array.
+cdef Py_ssize_t itemsize = 1
+
+cdef class ReadingBlob:
+    """Read-only buffer exporter around a C++ (zim::) Blob.
+
+    The class implements the buffer protocol so that the blob's bytes can be
+    wrapped in a ``memoryview`` that points directly at the blob's data.
+
+    Attributes
+    ----------
+    c_blob : Blob (zim::)
+        the wrapped C++ blob, stored by value
+    size : Py_ssize_t
+        size of the blob in bytes; its address is also used as buffer shape
+    view_count : int
+        number of buffer views currently exported
+    """
+    cdef clibzim.Blob c_blob
+    cdef Py_ssize_t size
+    cdef int view_count
+
+    cdef __setup(self, clibzim.Blob blob):
+        """Store the C++ blob to expose and reset the view bookkeeping.
+
+        Parameters
+        ----------
+        blob : Blob
+            A C++ (zim::) blob, assigned into this object
+        """
+        self.c_blob = blob
+        self.size = blob.size()
+        self.view_count = 0
+
+    # No __dealloc__ guard on view_count is needed: every exported view stores
+    # a reference to this object in ``buffer.obj``, so the object cannot be
+    # deallocated while a view is alive. An exception raised from __dealloc__
+    # could not propagate to the caller anyway.
+
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        """Export the blob as a read-only, one-dimensional byte buffer.
+
+        Raises
+        ------
+        BufferError
+            If the consumer requests a writable buffer
+        """
+        if flags & PyBUF_WRITABLE:
+            raise BufferError("Cannot create writable memoryview on readonly data")
+        # The view owns a reference to the exporter, which keeps c_blob alive.
+        buffer.obj = self
+        # Explicit cast: Py_buffer.buf is a plain void*. The memory is still
+        # advertised as read-only below.
+        buffer.buf = <void*>self.c_blob.data()
+        buffer.len = self.size
+        buffer.readonly = 1
+        buffer.format = 'c'
+        buffer.internal = NULL                  # see References
+        buffer.itemsize = itemsize
+        buffer.ndim = 1
+        # shape/strides must outlive this call, hence the attribute and the
+        # module-level variable rather than locals.
+        buffer.shape = &self.size
+        buffer.strides = &itemsize
+        buffer.suboffsets = NULL                # for pointer arrays only
+
+        self.view_count += 1
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        """Account for a view being released."""
+        self.view_count -= 1
+
#------ Helper for pure virtual methods --------
@@ -76,7 +118,7 @@ cdef public api:
clibzim.Blob blob_cy_call_fct(object obj, string method, int *error) with gil:
"""Lookup and execute a pure virtual method on ZimArticle returning a Blob"""
- cdef Blob blob
+ cdef WritingBlob blob
func = get_article_method_from_object(obj, method, error)
blob = func()
@@ -161,3 +203,364 @@ cdef class Creator:
with nogil:
self.c_creator.finalize()
self._finalized = True
+
+########################
+# ReadArticle #
+########################
+
+cdef class ReadArticle:
+    """
+    A class to represent a Zim File Article.
+
+    Attributes
+    ----------
+    c_article : Article (zim::)
+        the wrapped C++ article object, stored by value
+    _blob : ReadingBlob
+        lazily created buffer exporter for the article content
+    _haveBlob : bool
+        whether _blob has already been created
+
+    Properties
+    -----------
+    namespace : str
+        the article namespace
+    title : str
+        the article title
+    content : memoryview
+        a read-only view on the article content
+    longurl : str
+        the article long url i.e {NAMESPACE}/{url}
+    url : str
+        the article url
+    mimetype : str
+        the article mimetype
+    is_redirect : bool
+        flag if the article is a redirect
+
+    Methods
+    -------
+    from_read_article(zim.Article art)
+        Creates a python ReadArticle from a C++ zim.Article article.
+    """
+    cdef clibzim.Article c_article
+    cdef ReadingBlob _blob
+    cdef bool _haveBlob
+
+    def __cinit__(self):
+        self._haveBlob = False
+
+    cdef __setup(self, clibzim.Article art):
+        """Store the wrapped C++ article and drop any cached content blob.
+
+        Parameters
+        ----------
+        art : Article
+            A C++ (zim::) article object
+        """
+        self.c_article = art
+        # Reset both halves of the cache together, so that `content` never
+        # returns a view on None after a second __setup() call.
+        self._blob = None
+        self._haveBlob = False
+
+    # Factory functions - Currently Cython can't use classmethods
+    @staticmethod
+    cdef from_read_article(clibzim.Article art):
+        """Creates a python ReadArticle from a C++ Article (zim::).
+
+        Parameters
+        ----------
+        art : Article
+            A C++ Article read with File
+        Return
+        ------
+        ReadArticle
+            A new python object wrapping `art`
+        """
+        cdef ReadArticle article = ReadArticle()
+        article.__setup(art)
+        return article
+
+    @property
+    def namespace(self):
+        """Get the article's namespace"""
+        ns = self.c_article.getNamespace()
+        return chr(ns)
+
+    @property
+    def title(self):
+        """Get the article's title"""
+        return self.c_article.getTitle().decode('UTF-8')
+
+    @property
+    def content(self):
+        """Get a read-only memoryview on the article's content"""
+        if not self._haveBlob:
+            # The blob is created on first access and then reused, so every
+            # memoryview returned here is backed by the same ReadingBlob.
+            # NOTE(review): this calls another class's double-underscore
+            # method; confirm the Cython version in use does not apply
+            # private-name mangling to it.
+            self._blob = ReadingBlob()
+            self._blob.__setup(self.c_article.getData(0))
+            self._haveBlob = True
+        return memoryview(self._blob)
+
+    @property
+    def longurl(self):
+        """Get the article's long url i.e {NAMESPACE}/{url}"""
+        return self.c_article.getLongUrl().decode("UTF-8", "strict")
+
+    @property
+    def url(self):
+        """Get the article's url"""
+        return self.c_article.getUrl().decode("UTF-8", "strict")
+
+    @property
+    def mimetype(self):
+        """Get the article's mimetype"""
+        return self.c_article.getMimeType().decode('UTF-8')
+
+    @property
+    def is_redirect(self):
+        """Get if the article is a redirect"""
+        return self.c_article.isRedirect()
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}(url={self.longurl}, title={self.title})"
+
+
+
+
+#########################
+# File #
+#########################
+
+cdef class File:
+    """
+    A class to represent a Zim File Reader.
+
+    Attributes
+    ----------
+    *c_file : File
+        a pointer to a C++ File object
+    _filename : str
+        the file name of the File Reader object
+    """
+
+    cdef clibzim.File *c_file
+    cdef object _filename
+
+    def __cinit__(self, str filename):
+        """Constructs a File from full zim file path.
+
+        Parameters
+        ----------
+        filename : str
+            Full path to a zim file
+        """
+
+        self.c_file = new clibzim.File(filename.encode('UTF-8'))
+        # Keep the name as reported by the C++ object, not the raw argument.
+        self._filename = self.c_file.getFilename().decode("UTF-8", "strict")
+
+    def __dealloc__(self):
+        # Guard against a File whose C++ object was never created.
+        if self.c_file != NULL:
+            del self.c_file
+
+    @property
+    def filename(self):
+        """Get the filename of the File object"""
+        return self._filename
+
+    def get_article(self, url):
+        """Get a ReadArticle with a copy of the file article by full url i.e including namespace
+
+        Parameters
+        ----------
+        url : str
+            The full url, including namespace, of the article
+        Returns
+        -------
+        ReadArticle
+            The ReadArticle object
+        Raises
+        ------
+        RuntimeError
+            If an article with the provided long url is not found in the file
+        """
+        # Read to a zim::Article
+        cdef clibzim.Article art = self.c_file.getArticleByUrl(url.encode('UTF-8'))
+        if not art.good():
+            raise RuntimeError("Article not found for url")
+
+        article = ReadArticle.from_read_article(art)
+        return article
+
+    def get_metadata(self, name):
+        """Get one metadata entry of the file.
+
+        The entry is read as the content of the article "M/{name}".
+
+        Parameters
+        ----------
+        name : str
+            The name of the metadata entry
+        Returns
+        -------
+        memoryview
+            The content of the metadata article
+        Raises
+        ------
+        RuntimeError
+            If there is no article "M/{name}" in the file
+        """
+        article = self.get_article(f"M/{name}")
+        return article.content
+
+    def get_article_by_id(self, id):
+        """Get a ReadArticle with a copy of the file article by article id.
+
+        Parameters
+        ----------
+        id : int
+            The id of the article
+        Returns
+        -------
+        ReadArticle
+            The ReadArticle object
+        Raises
+        ------
+        RuntimeError
+            If an article with the provided id is not found in the file
+        """
+
+        # Read to a zim::Article
+        cdef clibzim.Article art = self.c_file.getArticle(id)
+        if not art.good():
+            raise RuntimeError("Article not found for id")
+
+        article = ReadArticle.from_read_article(art)
+        return article
+
+    @property
+    def main_page_url(self):
+        """Get the file main page url.
+
+        Returns
+        -------
+        str or None
+            The long url of the main page. When the header declares no main
+            page, the long url of the article at index 0 is returned instead,
+            or None if that article is not valid.
+        TODO Check old formats
+        """
+        cdef clibzim.Fileheader header = self.c_file.getFileheader()
+        cdef clibzim.Article article
+        if header.hasMainPage():
+            article = self.c_file.getArticle(header.getMainPage())
+            return article.getLongUrl().decode("UTF-8", "strict")
+
+        # TODO Ask about the old format, check libzim for tests
+        # Handle old zim where header has no mainPage.
+        # (We need to get first article in the zim)
+        article = self.c_file.getArticle(0)
+        if article.good():
+            return article.getLongUrl().decode("UTF-8", "strict")
+        return None
+
+    @property
+    def checksum(self):
+        """Get the file checksum.
+
+        Returns
+        -------
+        str
+            The file checksum
+        """
+        return self.c_file.getChecksum().decode("UTF-8", "strict")
+
+    @property
+    def article_count(self):
+        """Get the file article count.
+
+        Returns
+        -------
+        int
+            The total number of articles from the file
+        """
+        return self.c_file.getCountArticles()
+
+    @property
+    def namespaces(self) -> str:
+        """Get the namespaces.
+
+        Returns
+        -------
+        str
+            A string containing all namespaces in the file
+
+        """
+        return self.c_file.getNamespaces().decode("UTF-8", "strict")
+
+    def get_namespaces_count(self, str ns):
+        """Get article count from a namespace.
+
+        Parameters
+        ----------
+        ns : str
+            The namespace; must not be empty, only its first character is used
+        Returns
+        -------
+        int
+            The total number of articles from the namespace
+        """
+        return self.c_file.getNamespaceCount(ord(ns[0]))
+
+    def suggest(self, query, start=0, end=10):
+        """Get an iterator of the full urls of suggested articles in the file from a title query.
+
+        Parameters
+        ----------
+        query : str
+            Title query string
+        start : int
+            Iterator start (default 0)
+        end : int
+            Iterator end (default 10)
+        Returns
+        -------
+        iterator
+            An iterator with the urls of suggested articles starting from start position
+        """
+        cdef unique_ptr[clibzim.Search] search = self.c_file.suggestions(query.encode('UTF-8'), start, end)
+        cdef clibzim.search_iterator it = dereference(search).begin()
+
+        while it != dereference(search).end():
+            yield it.get_url().decode('UTF-8')
+            preincrement(it)
+
+    def search(self, query, start=0, end=10):
+        """Get an iterator of the full urls of articles in the file from a search query.
+
+        Parameters
+        ----------
+        query : str
+            Query string
+        start : int
+            Iterator start (default 0)
+        end : int
+            Iterator end (default 10)
+        Returns
+        -------
+        iterator
+            An iterator with the urls of articles matching the search query starting from start position
+        """
+
+        cdef unique_ptr[clibzim.Search] search = self.c_file.search(query.encode('UTF-8'), start, end)
+        cdef clibzim.search_iterator it = dereference(search).begin()
+
+        while it != dereference(search).end():
+            yield it.get_url().decode('UTF-8')
+            preincrement(it)
+
+    def get_search_results_count(self, query):
+        """Get search results counts for a query.
+
+        Parameters
+        ----------
+        query : str
+            Query string
+        Returns
+        -------
+        int
+            Estimated number of search results
+        """
+        # Only the estimate is needed, so the requested result range is minimal.
+        cdef unique_ptr[clibzim.Search] search = self.c_file.search(query.encode('UTF-8'), 0, 1)
+        return dereference(search).get_matches_estimated()
+
+    def get_suggestions_results_count(self, query):
+        """Get suggestions results counts for a query.
+
+        Parameters
+        ----------
+        query : str
+            Query string
+        Returns
+        -------
+        int
+            Estimated number of article suggestions
+        """
+        # Only the estimate is needed, so the requested result range is minimal.
+        cdef unique_ptr[clibzim.Search] search = self.c_file.suggestions(query.encode('UTF-8'), 0, 1)
+        return dereference(search).get_matches_estimated()
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}(filename={self.filename})"
diff --git a/libzim/reader.py b/libzim/reader.py
new file mode 100644
index 00000000..345f3b7d
--- /dev/null
+++ b/libzim/reader.py
@@ -0,0 +1,2 @@
+from libzim_wrapper import File
+from libzim_wrapper import ReadArticle as Article
diff --git a/libzim/writer.py b/libzim/writer.py
index a365e9f1..65a3633a 100644
--- a/libzim/writer.py
+++ b/libzim/writer.py
@@ -22,7 +22,7 @@
from collections import defaultdict
import libzim_wrapper
-from libzim_wrapper import Blob
+from libzim_wrapper import WritingBlob as Blob
__all__ = ["Article", "Blob", "Creator"]
diff --git a/run_tests.sh b/run_tests.sh
index 8b76238f..a496e6a1 100755
--- a/run_tests.sh
+++ b/run_tests.sh
@@ -1,5 +1,5 @@
#!/bin/bash
rm -rf tests/kiwix-test-*
-python3 tests/test_libzim.py
+pytest -v
rm -rf tests/kiwix-test-*
\ No newline at end of file
diff --git a/tests/test_libzim_file_reader.py b/tests/test_libzim_file_reader.py
new file mode 100644
index 00000000..fefc1b1b
--- /dev/null
+++ b/tests/test_libzim_file_reader.py
@@ -0,0 +1,106 @@
+import gc
+from pathlib import Path
+
+import pytest
+
+from libzim.reader import File
+
+DATA_DIR = Path(__file__).parent
+
+
+ZIMFILES = [
+ {
+ 'filename': str(DATA_DIR/"wikipedia_es_physics_mini.zim"),
+ 'checksum': u"99ea7a5598c6040c4f50b8ac0653b703",
+ 'namespaces': u"-AIMX",
+ 'article_count': 22027,
+ 'main_page_url': u"A/index",
+ }
+]
+
+
+
+
+@pytest.fixture(params=ZIMFILES)
+def zimdata(request):
+ return request.param
+
+@pytest.fixture
+def reader(zimdata):
+ return File(zimdata['filename'])
+
+
+@pytest.fixture
+def article_data():
+ return {
+ 'url': u"A/Albert_Einstein",
+ 'title': u"Albert Einstein",
+ 'mimetype':u"text/html",
+ 'article_id': 663,
+ 'size': 17343
+ }
+
+
+def test_zim_filename(reader, zimdata):
+ for k, v in zimdata.items():
+ assert getattr(reader, k) == v
+
+def test_zim_read(reader, article_data):
+ article = reader.get_article(article_data['url'])
+
+ assert article.longurl == article_data['url']
+ assert article.title == article_data['title']
+ assert article.url == article_data['url'][2:]
+ assert article.mimetype == article_data['mimetype']
+ assert isinstance(article.content, memoryview)
+ assert len(article.content) == article_data['size']
+
+def test_content_ref_keep(reader):
+ """Get the memoryview on a content and loose the reference on the article.
+ We try to load a lot of other articles to detect possible use of dandling pointer
+ """
+ content =None
+ def get_content():
+ nonlocal content
+ article = reader.get_article(u"A/Albert_Einstein")
+ assert isinstance(article.content, memoryview)
+ content = article.content
+ get_content() # Now we have a content but no reference to the article.
+ gc.collect()
+ # Load a lot of content
+ for i in range(0, reader.article_count, 2):
+ article = reader.get_article_by_id(i)
+ if not article.is_redirect:
+ c = article.content
+ # Check everything is ok
+ assert len(content) == 17343
+ assert bytes(content[:100]) == b'\n\n \n Albert Einstein