From 4a0d6b1f2f4fbeb49a1392ff96fca20879dea416 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Wed, 22 Apr 2020 22:37:38 +0200 Subject: [PATCH 1/7] Move all python only code out of the pyx file. The `libzim_wrapper` is created. It is "internal" and is in charge of wrapping the c++ part. The public (python) code is moved to `__init__.py`. It uses `libzim_wrapper` to provide functionalities to the user. --- libzim/__init__.py | 194 ++++++++++++ libzim/lib.cxx | 4 +- libzim/libzim.pyx | 367 ---------------------- libzim/{libzim.pxd => libzim_wrapper.pxd} | 7 +- libzim/libzim_wrapper.pyx | 165 ++++++++++ setup.py | 8 +- tests/test_libzim.py | 4 +- 7 files changed, 371 insertions(+), 378 deletions(-) create mode 100644 libzim/__init__.py delete mode 100644 libzim/libzim.pyx rename libzim/{libzim.pxd => libzim_wrapper.pxd} (98%) create mode 100644 libzim/libzim_wrapper.pyx diff --git a/libzim/__init__.py b/libzim/__init__.py new file mode 100644 index 00000000..6287e045 --- /dev/null +++ b/libzim/__init__.py @@ -0,0 +1,194 @@ +# This file is part of python-libzim +# (see https://github.com/libzim/python-libzim) +# +# Copyright (c) 2020 Juan Diego Caballero +# Copyright (c) 2020 Matthieu Gautier +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+ + +import libzim_wrapper + +ZimBlob = libzim_wrapper.ZimBlob + +from collections import defaultdict +import datetime + +__all__ = ["ZimArticle", "ZimCreator", "ZimBlob"] + +class ZimArticle: + def __init__(self): + self._blob = None + + def get_url(self): + raise NotImplementedError + + def get_title(self): + raise NotImplementedError + + def is_redirect(self): + raise NotImplementedError + + def get_mime_type(self): + raise NotImplementedError + + def get_filename(self): + raise NotImplementedError + + def should_compress(self): + raise NotImplementedError + + def should_index(self): + raise NotImplementedError + + def redirect_url(self): + raise NotImplementedError + + def _get_data(self): + if self._blob is None: + self._blob = self.get_data() + return self._blob + + def get_data(self): + raise NotImplementedError + + +class ZimMetadataArticle(ZimArticle): + def __init__(self, url, metadata_content): + ZimArticle.__init__(self) + self.url = url + self.metadata_content = metadata_content + + def is_redirect(self): + return False + + def get_url(self): + return f"M/{self.url}" + + def get_title(self): + return "" + + def get_mime_type(self): + return "text/plain" + + def get_filename(self): + return "" + + def should_compress(self): + return True + + def should_index(self): + return False + + def get_data(self): + return ZimBlob(self.metadata_content) + + +MANDATORY_METADATA_KEYS =[ + "Name", + "Title", + "Creator", + "Publisher", + "Date", + "Description", + "Language"] + +class ZimCreator: + """ + A class to represent a Zim Creator. 
+ + Attributes + ---------- + *c_creator : zim.ZimCreatorWrapper + a pointer to the C++ Creator object + _finalized : bool + flag if the creator was finalized + _filename : str + Zim file path + _main_page : str + Zim file main page + _index_language : str + Zim file Index language + _min_chunk_size : str + Zim file minimum chunk size + _article_counter + Zim file article counter + _metadata + Zim file metadata + """ + + def __init__(self, filename, main_page, index_language, min_chunk_size): + print(filename) + self._creatorWrapper = libzim_wrapper.ZimCreator( + filename, + main_page, + index_language, + min_chunk_size) + self.filename = filename + self.main_page = main_page + self.language = index_language + self._metadata = {} + self._article_counter = defaultdict(int) + self.update_metadata(date=datetime.date.today(), language=index_language) + + def __enter__(self): + return self + + def __exit__(self, *args): + self.close() + + def add_article(self, article): + self._creatorWrapper.add_article(article) + if not article.is_redirect(): + self._update_article_counter(article) + + def _update_article_counter(self, article): + # default dict update + self._article_counter[article.get_mime_type().strip()] += 1 + + def mandatory_metadata_ok(self): + """Flag if mandatory metadata is complete and not empty""" + metadata_item_ok = [k in self._metadata for k in MANDATORY_METADATA_KEYS] + return all(metadata_item_ok) + + + def update_metadata(self, **kwargs): + "Updates article metadata""" + # Converts python case to pascal case. 
example: long_description-> LongDescription + pascalize = lambda keyword: "".join(keyword.title().split("_")) + new_metadata = {pascalize(k): v for k, v in kwargs.items()} + self._metadata.update(new_metadata) + + + def write_metadata(self): + for key, value in self._metadata.items(): + if key == "date" and isinstance(value, datetime.date): + value = value.strftime("%Y-%m-%d") + article = ZimMetadataArticle(key, value) + self._creatorWrapper.add_article(article) + + article = ZimMetadataArticle("Counter", self._get_counter_string()) + self._creatorWrapper.add_article(article) + + def _get_counter_string(self): + return ";".join( + ["%s=%s" % (k,v) for (k,v) in self._article_counter.items()] + ) + + def close(self): + self.write_metadata() + self._creatorWrapper.finalize() + + def __repr__(self): + return f"ZimCreator(filename={self.filename})" diff --git a/libzim/lib.cxx b/libzim/lib.cxx index 387f3bf1..011691de 100644 --- a/libzim/lib.cxx +++ b/libzim/lib.cxx @@ -23,7 +23,7 @@ #include #include "lib.h" -#include "libzim_api.h" +#include "libzim_wrapper_api.h" #include #include @@ -38,7 +38,7 @@ ZimArticleWrapper::ZimArticleWrapper(PyObject *obj) : m_obj(obj) { - if (import_libzim()) + if (import_libzim_wrapper()) { std::cerr << "Error executing import_libzim!\n"; throw std::runtime_error("Error executing import_libzim"); diff --git a/libzim/libzim.pyx b/libzim/libzim.pyx deleted file mode 100644 index 37f6bdb7..00000000 --- a/libzim/libzim.pyx +++ /dev/null @@ -1,367 +0,0 @@ -# This file is part of python-libzim -# (see https://github.com/libzim/python-libzim) -# -# Copyright (c) 2020 Juan Diego Caballero -# Copyright (c) 2020 Matthieu Gautier -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. 
-# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . - -cimport libzim -cimport cpython.ref as cpy_ref -from cython.operator import dereference - -from libc.stdint cimport uint32_t, uint64_t -from libcpp.string cimport string -from libcpp cimport bool -from libcpp.memory cimport shared_ptr, make_shared - -import datetime -from contextlib import contextmanager -from collections import defaultdict - -######################### -# ZimBlob # -######################### - -cdef class ZimBlob: - cdef Blob* c_blob - cdef bytes ref_content - - def __cinit__(self, content): - if isinstance(content, str): - self.ref_content = content.encode('UTF-8') - else: - self.ref_content = content - self.c_blob = new Blob( self.ref_content, len(self.ref_content)) - - def __dealloc__(self): - if self.c_blob != NULL: - del self.c_blob - - -######################### -# ZimArticle # -######################### - -cdef class ZimArticle: - cdef ZimBlob blob - - def get_url(self): - raise NotImplementedError - - def get_title(self): - raise NotImplementedError - - def is_redirect(self): - raise NotImplementedError - - def get_mime_type(self): - raise NotImplementedError - - def get_filename(self): - raise NotImplementedError - - def should_compress(self): - raise NotImplementedError - - def should_index(self): - raise NotImplementedError - - def redirect_url(self): - raise NotImplementedError - - def _get_data(self): - if self.blob is None: - self.blob = self.get_data() - return self.blob - - def get_data(self): - raise NotImplementedError - - -#------ Helper for pure virtual methods -------- - -cdef get_article_method_from_object_ptr(void *ptr, string method, int *error) with gil: - cdef 
ZimArticle art = (ptr) - try: - func = getattr(art, method.decode('UTF-8')) - except AttributeError: - error[0] = 1 - raise - else: - error[0] = 0 - return func - -#------- ZimArticle pure virtual methods -------- - -cdef public api: - string string_cy_call_fct(void *ptr, string method, int *error) with gil: - """Lookup and execute a pure virtual method on ZimArticle returning a string""" - func = get_article_method_from_object_ptr(ptr, method, error) - ret_str = func() - return ret_str.encode('UTF-8') - - Blob blob_cy_call_fct(void *ptr, string method, int *error) with gil: - """Lookup and execute a pure virtual method on ZimArticle returning a Blob""" - cdef ZimBlob blob - - func = get_article_method_from_object_ptr(ptr, method, error) - blob = func() - return dereference(blob.c_blob) - - bool bool_cy_call_fct(void *ptr, string method, int *error) with gil: - """Lookup and execute a pure virtual method on ZimArticle returning a bool""" - func = get_article_method_from_object_ptr(ptr, method, error) - return func() - - uint64_t int_cy_call_fct(void *ptr, string method, int *error) with gil: - """Lookup and execute a pure virtual method on ZimArticle returning an int""" - func = get_article_method_from_object_ptr(ptr, method, error) - return func() - -######################### -# ZimCreator # -######################### - -#TODO Write metadata - -class ZimMetadataArticle(ZimArticle): - - def __init__(self,url, metadata_content): - ZimArticle.__init__(self) - self.url = url - self.metadata_content = metadata_content - - def is_redirect(self): - return False - - def get_url(self): - return f"M/{self.url}" - - def get_title(self): - return "" - - def get_mime_type(self): - return "text/plain" - - def get_filename(self): - return "" - - def should_compress(self): - return True - - def should_index(self): - return False - - def get_data(self): - return ZimBlob(self.metadata_content) - - -MANDATORY_METADATA_KEYS =[ - "Name", - "Title", - "Creator", - "Publisher", - 
"Date", - "Description", - "Language"] - # Optional - #"LongDescription", - #"Licence", - #"Tags", - #"Flavour", - #"Source", - #"Counter", - #"Scraper"] - -cdef class ZimCreator: - """ - A class to represent a Zim Creator. - - Attributes - ---------- - *c_creator : zim.ZimCreatorWrapper - a pointer to the C++ Creator object - _finalized : bool - flag if the creator was finalized - _filename : str - Zim file path - _main_page : str - Zim file main page - _index_language : str - Zim file Index language - _min_chunk_size : str - Zim file minimum chunk size - _article_counter - Zim file article counter - _metadata - Zim file metadata - """ - - cdef ZimCreatorWrapper *c_creator - cdef bool _finalized - cdef object _filename - cdef object _main_page - cdef object _index_language - cdef object _min_chunk_size - cdef object _article_counter - cdef dict __dict__ - - def __cinit__(self, str filename, str main_page = "", str index_language = "eng", min_chunk_size = 2048): - """Constructs a ZimCreator from parameters. 
- Parameters - ---------- - filename : str - Zim file path - main_page : str - Zim file main page - index_language : str - Zim file index language (default eng) - min_chunk_size : int - Minimum chunk size (default 2048) - """ - - self.c_creator = ZimCreatorWrapper.create(filename.encode("UTF-8"), main_page.encode("UTF-8"), index_language.encode("UTF-8"), min_chunk_size) - self._finalized = False - self._filename = filename - self._main_page = self.c_creator.getMainUrl().getLongUrl().decode("UTF-8", "strict") - self._index_language = index_language - self._min_chunk_size = min_chunk_size - self._metadata = {k:b"" for k in MANDATORY_METADATA_KEYS} - - self._article_counter = defaultdict(int) - self.update_metadata(date=datetime.date.today(), language= index_language) - - def __enter__(self): - return self - - def __exit__(self, *args): - self.finalize() - - def __dealloc__(self): - del self.c_creator - - @property - def filename(self): - """Get the filename of the ZimCreator object""" - return self._filename - - @property - def main_page(self): - """Get the main page of the ZimCreator object""" - return self.c_creator.getMainUrl().getLongUrl().decode("UTF-8", "strict")[2:] - - @main_page.setter - def main_page(self,new_url): - """Set the main page of the ZimCreator object""" - # Check if url longformat is used - if new_url[1] == '/': - raise ValueError("Url should not include a namespace") - - self.c_creator.setMainUrl(new_url.encode('UTF-8')) - - @property - def index_language(self): - """Get the index language of the ZimCreator object""" - return self._index_language - - @property - def min_chunk_size(self): - """Get the minimum chunk size of the ZimCreator object""" - return self._min_chunk_size - - def get_article_counter_string(self): - return ";".join(["%s=%s" % (k,v) for (k,v) in self._article_counter.items()]) - - def _get_metadata(self): - metadata = self._metadata - - counter_string = self.get_article_counter_string() - if counter_string: - 
metadata['Counter'] = counter_string - - return metadata - - def mandatory_metadata_ok(self): - """Flag if mandatory metadata is complete and not empty""" - metadata_item_ok = [self._metadata[k] for k in MANDATORY_METADATA_KEYS] - return all(metadata_item_ok) - - def update_metadata(self, **kwargs): - "Updates article metadata""" - # Converts python case to pascal case. example: long_description-> LongDescription - pascalize = lambda keyword: "".join(keyword.title().split("_")) - - if "date" in kwargs and isinstance(kwargs['date'],datetime.date): - kwargs['date'] = kwargs['date'].strftime('%Y-%m-%d') - - new_metadata = {pascalize(key): value for key, value in kwargs.items()} - self._metadata.update(new_metadata) - - def _update_article_counter(self, ZimArticle article not None): - # default dict update - self._article_counter[article.get_mime_type().strip()] += 1 - - def add_article(self, article not None): - """Add a ZimArticle to the Creator object. - - Parameters - ---------- - article : ZimArticle - The article to add to the file - Raises - ------ - RuntimeError - If the ZimArticle provided is not ready for writing - RuntimeError - If the ZimCreator was already finalized - """ - if self._finalized: - raise RuntimeError("ZimCreator already finalized") - - # Make a shared pointer to ZimArticleWrapper from the ZimArticle object (dereference internal c_article) - cdef shared_ptr[ZimArticleWrapper] art = shared_ptr[ZimArticleWrapper]( - new ZimArticleWrapper(article)); - try: - with nogil: - self.c_creator.addArticle(art) - except: - raise - else: - if not article.is_redirect(): - self._update_article_counter(article) - - def write_metadata(self, dict metadata): - for key in metadata: - metadata_article = ZimMetadataArticle(url=key, metadata_content=metadata[key]) - self.add_article(metadata_article) - - def finalize(self): - """finalize and write added articles to the file. 
- - Raises - ------ - RuntimeError - If the ZimCreator was already finalized - """ - if self._finalized: - raise RuntimeError("ZimCreator already finalized") - - self.write_metadata(self._get_metadata()) - with nogil: - self.c_creator.finalize() - self._finalized = True - - def __repr__(self): - return f"{self.__class__.__name__}(filename={self.filename})" diff --git a/libzim/libzim.pxd b/libzim/libzim_wrapper.pxd similarity index 98% rename from libzim/libzim.pxd rename to libzim/libzim_wrapper.pxd index 0e1bf2bc..2b173c4a 100644 --- a/libzim/libzim.pxd +++ b/libzim/libzim_wrapper.pxd @@ -23,6 +23,7 @@ from libcpp cimport bool from libcpp.memory cimport shared_ptr from libcpp.vector cimport vector + from cpython.ref cimport PyObject cdef extern from "zim/blob.h" namespace "zim": @@ -38,12 +39,12 @@ cdef extern from "zim/writer/url.h" namespace "zim::writer": string getLongUrl() except + -cdef extern from "zim/writer/article.h" namespace "zim::writer": +cdef extern from "zim/writer/article.h" namespace "zim::writer": cdef cppclass Article: const string getTitle() except + -cdef extern from "lib.h": +cdef extern from "lib.h": cdef cppclass ZimArticleWrapper(Article): ZimArticleWrapper(PyObject *obj) except + const Url getUrl() except + @@ -55,7 +56,7 @@ cdef extern from "lib.h": const bool shouldIndex() except + const Url getRedirectUrl() except + const Blob getData() except + - + cdef cppclass ZimCreatorWrapper: @staticmethod ZimCreatorWrapper *create(string fileName, string mainPage, string fullTextIndexLanguage, int minChunkSize) nogil except + diff --git a/libzim/libzim_wrapper.pyx b/libzim/libzim_wrapper.pyx new file mode 100644 index 00000000..bdee4557 --- /dev/null +++ b/libzim/libzim_wrapper.pyx @@ -0,0 +1,165 @@ +# This file is part of python-libzim +# (see https://github.com/libzim/python-libzim) +# +# Copyright (c) 2020 Juan Diego Caballero +# Copyright (c) 2020 Matthieu Gautier +# +# This program is free software: you can redistribute it and/or modify +# 
it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + + +cimport libzim.libzim_wrapper as clibzim +from cython.operator import dereference + +from libc.stdint cimport uint64_t +from libcpp.string cimport string +from libcpp cimport bool +from libcpp.memory cimport shared_ptr, make_shared + +import datetime + +from cpython.ref cimport PyObject + +######################### +# ZimBlob # +######################### + +cdef class ZimBlob: + cdef clibzim.Blob* c_blob + cdef bytes ref_content + + def __cinit__(self, content): + if isinstance(content, str): + self.ref_content = content.encode('UTF-8') + else: + self.ref_content = content + self.c_blob = new clibzim.Blob( self.ref_content, len(self.ref_content)) + + def __dealloc__(self): + if self.c_blob != NULL: + del self.c_blob + + +######################### +# ZimArticle # +######################### + +#------ Helper for pure virtual methods -------- + +cdef get_article_method_from_object(object obj, string method, int *error) with gil: + try: + func = getattr(obj, method.decode('UTF-8')) + except AttributeError: + error[0] = 1 + raise + else: + error[0] = 0 + return func + +#------- ZimArticle pure virtual methods -------- + +cdef public api: + string string_cy_call_fct(object obj, string method, int *error) with gil: + """Lookup and execute a pure virtual method on ZimArticle returning a string""" + func = get_article_method_from_object(obj, method, error) + ret_str = func() + return ret_str.encode('UTF-8') + + clibzim.Blob 
blob_cy_call_fct(object obj, string method, int *error) with gil: + """Lookup and execute a pure virtual method on ZimArticle returning a Blob""" + cdef ZimBlob blob + + func = get_article_method_from_object(obj, method, error) + blob = func() + return dereference(blob.c_blob) + + bool bool_cy_call_fct(object obj, string method, int *error) with gil: + """Lookup and execute a pure virtual method on ZimArticle returning a bool""" + func = get_article_method_from_object(obj, method, error) + return func() + + uint64_t int_cy_call_fct(object obj, string method, int *error) with gil: + """Lookup and execute a pure virtual method on ZimArticle returning an int""" + func = get_article_method_from_object(obj, method, error) + return func() + +cdef class ZimCreator: + """ + A class to represent a Zim Creator. + + Attributes + ---------- + *c_creator : zim.ZimCreator + a pointer to the C++ Creator object + _finalized : bool + flag if the creator was finalized + """ + + cdef clibzim.ZimCreatorWrapper *c_creator + cdef bool _finalized + + def __cinit__(self, str filename, str main_page = "", str index_language = "eng", min_chunk_size = 2048): + """Constructs a ZimCreator from parameters. + Parameters + ---------- + filename : str + Zim file path + main_page : str + Zim file main page + index_language : str + Zim file index language (default eng) + min_chunk_size : int + Minimum chunk size (default 2048) + """ + + self.c_creator = clibzim.ZimCreatorWrapper.create(filename.encode("UTF-8"), main_page.encode("UTF-8"), index_language.encode("UTF-8"), min_chunk_size) + self._finalized = False + + def __dealloc__(self): + del self.c_creator + + def add_article(self, article not None): + """Add a article to the Creator object. 
+ + Parameters + ---------- + article : ZimArticle + The article to add to the file + Raises + ------ + RuntimeError + If the ZimCreator was already finalized + """ + if self._finalized: + raise RuntimeError("ZimCreator already finalized") + + # Make a shared pointer to ZimArticleWrapper from the ZimArticle object + cdef shared_ptr[clibzim.ZimArticleWrapper] art = shared_ptr[clibzim.ZimArticleWrapper]( + new clibzim.ZimArticleWrapper(article)); + with nogil: + self.c_creator.addArticle(art) + + def finalize(self): + """finalize and write added articles to the file. + + Raises + ------ + RuntimeError + If the ZimCreator was already finalized + """ + if self._finalized: + raise RuntimeError("ZimCreator already finalized") + with nogil: + self.c_creator.finalize() + self._finalized = True + diff --git a/setup.py b/setup.py index eb4a7df5..8a05d11c 100755 --- a/setup.py +++ b/setup.py @@ -19,12 +19,12 @@ def read(fname): license = "GPLv3+", long_description=read('README.md'), ext_modules = cythonize([ - Extension("libzim", ["libzim/*.pyx","libzim/lib.cxx"], + Extension("libzim_wrapper", ["libzim/*.pyx", "libzim/lib.cxx"], include_dirs=["libzim"], libraries=["zim"], extra_compile_args=["-std=c++11"], language="c++"), - ], - compiler_directives={'language_level' : "3"} - ) + ], + compiler_directives={'language_level' : "3"} + ), ) diff --git a/tests/test_libzim.py b/tests/test_libzim.py index d2967dca..3eb9717c 100644 --- a/tests/test_libzim.py +++ b/tests/test_libzim.py @@ -1,7 +1,7 @@ # This file is part of python-libzim # (see https://github.com/libzim/python-libzim) # -i# Copyright (c) 2020 Juan Diego Caballero +# Copyright (c) 2020 Juan Diego Caballero # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -120,7 +120,7 @@ def test_article_metadata(self): rnd_str = str(uuid.uuid1()) zim_creator = ZimCreator(self.test_zim_file_path + '-' + rnd_str + '.zim',main_page = 
"welcome",index_language= "eng", min_chunk_size= 2048) zim_creator.update_metadata(**TEST_METADATA) - self.assertEqual(zim_creator._get_metadata(), TEST_METADATA) + self.assertEqual(zim_creator._metadata, TEST_METADATA) def test_check_mandatory_metadata(self): import uuid From e17d08b6f127485ee906f430d37d35ad1ea77a2e Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Tue, 5 May 2020 11:02:11 +0200 Subject: [PATCH 2/7] Add .gitignore --- .gitignore | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..020ff6f4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +__pycache__ +build +libzim_wrapper.*.so +libzim/libzim_wrapper.cpp +libzim/libzim_wrapper.h +libzim/libzim_wrapper_api.h +*.egg-info From 7969ada79e440003776f0dc572002e481338013f Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Tue, 5 May 2020 11:39:32 +0200 Subject: [PATCH 3/7] Do not write test in /opt --- tests/test_libzim.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_libzim.py b/tests/test_libzim.py index 3eb9717c..8b0eabb0 100644 --- a/tests/test_libzim.py +++ b/tests/test_libzim.py @@ -92,7 +92,7 @@ def get_data(self): class TestZimCreator(unittest.TestCase): def setUp(self): - self.test_zim_file_path = "/opt/python-libzim/tests/kiwix-test" + self.test_zim_file_path = "/tmp/python-libzim/tests/kiwix-test" # Test article self.test_article = ZimTestArticle() @@ -113,7 +113,7 @@ def test_write_article(self): zim_creator.add_article(self.test_article) # Set mandatory metadata zim_creator.update_metadata(creator='python-libzim',description='Created in python',name='Hola',publisher='Monadical',title='Test Zim') - zim_creator.finalize() + zim_creator.close() def test_article_metadata(self): import uuid @@ -121,6 +121,7 @@ def test_article_metadata(self): zim_creator = ZimCreator(self.test_zim_file_path + '-' + rnd_str + '.zim',main_page = "welcome",index_language= "eng", 
min_chunk_size= 2048) zim_creator.update_metadata(**TEST_METADATA) self.assertEqual(zim_creator._metadata, TEST_METADATA) + zim_creator.close() def test_check_mandatory_metadata(self): import uuid @@ -129,6 +130,7 @@ def test_check_mandatory_metadata(self): self.assertFalse(zim_creator.mandatory_metadata_ok()) zim_creator.update_metadata(creator='python-libzim',description='Created in python',name='Hola',publisher='Monadical',title='Test Zim') self.assertTrue(zim_creator.mandatory_metadata_ok()) + zim_creator.close() From ce7170e8f47c88a9bfe327c45d7ee124eda1179a Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Tue, 5 May 2020 12:46:18 +0200 Subject: [PATCH 4/7] Use pytest for the unit tests. --- tests/test_libzim.py | 185 +++++++++++++++++++++++-------------------- 1 file changed, 98 insertions(+), 87 deletions(-) diff --git a/tests/test_libzim.py b/tests/test_libzim.py index 8b0eabb0..daf7ccde 100644 --- a/tests/test_libzim.py +++ b/tests/test_libzim.py @@ -16,50 +16,56 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . -import unittest +import pytest import os,sys,inspect -# Import local libzim module from parent -current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) -parent_dir = os.path.dirname(current_dir) -sys.path.insert(0, parent_dir) - from libzim import ZimArticle, ZimBlob, ZimCreator # test files https://wiki.kiwix.org/wiki/Content_in_all_languages # https://wiki.openzim.org/wiki/Metadata - -TEST_METADATA = { - # Mandatory - "Name" : "wikipedia_fr_football", - "Title": "English Wikipedia", - "Creator": "English speaking Wikipedia contributors", - "Publisher": "Wikipedia user Foobar", - "Date": "2009-11-21", - "Description": "All articles (without images) from the english Wikipedia", - "Language": "eng", - # Optional - "Longdescription": "This ZIM file contains all articles (without images) from the english Wikipedia by 2009-11-10. 
The topics are ...", - "Licence": "CC-BY", - "Tags": "wikipedia;_category:wikipedia;_pictures:no;_videos:no;_details:yes;_ftindex:yes", - "Flavour": "nopic", - "Source": "https://en.wikipedia.org/", - "Counter": "image/jpeg=5;image/gif=3;image/png=2", - "Scraper": "sotoki 1.2.3" -} +@pytest.fixture(scope="session") +def metadata(): + return { + # Mandatory + "Name" : "wikipedia_fr_football", + "Title": "English Wikipedia", + "Creator": "English speaking Wikipedia contributors", + "Publisher": "Wikipedia user Foobar", + "Date": "2009-11-21", + "Description": "All articles (without images) from the english Wikipedia", + "Language": "eng", + # Optional + "Longdescription": "This ZIM file contains all articles (without images) from the english Wikipedia by 2009-11-10. The topics are ...", + "Licence": "CC-BY", + "Tags": "wikipedia;_category:wikipedia;_pictures:no;_videos:no;_details:yes;_ftindex:yes", + "Flavour": "nopic", + "Source": "https://en.wikipedia.org/", + "Counter": "image/jpeg=5;image/gif=3;image/png=2", + "Scraper": "sotoki 1.2.3" + } + +@pytest.fixture(scope="session") +def article_content(): + content = ''' + + + Monadical + +

ñññ Hello, it works ñññ

''' + url = "A/Monadical_SAS" + title = "Monadical SAS" + mime_type = "text/html" + return (content, url, title, mime_type) class ZimTestArticle(ZimArticle): - content = ''' - - - Monadical - -

ñññ Hello, it works ñññ

''' - - def __init__(self): + def __init__(self, content, url, title, mime_type): ZimArticle.__init__(self) + self.content = content + self.url = url + self.title = title + self.mime_type = mime_type def is_redirect(self): return False @@ -69,17 +75,17 @@ def can_write(self): return True def get_url(self): - return "A/Monadical_SAS" + return self.url def get_title(self): - return "Monadical SAS" - + return self.title + def get_mime_type(self): - return "text/html" - + return self.mime_type + def get_filename(self): return "" - + def should_compress(self): return True @@ -88,51 +94,56 @@ def should_index(self): def get_data(self): return ZimBlob(self.content.encode('UTF-8')) - - -class TestZimCreator(unittest.TestCase): - def setUp(self): - self.test_zim_file_path = "/tmp/python-libzim/tests/kiwix-test" - - # Test article - self.test_article = ZimTestArticle() - - def tearDown(self): - pass - - def _assert_article_properties(self, written_article, article): - pass - - def _add_article_to_test_zim_file_read_it_back(self, article, delete_zim_file=True): - pass - - def test_write_article(self): - import uuid - rnd_str = str(uuid.uuid1()) - zim_creator = ZimCreator(self.test_zim_file_path + '-' + rnd_str + '.zim',main_page = "welcome",index_language= "eng", min_chunk_size= 2048) - zim_creator.add_article(self.test_article) - # Set mandatory metadata - zim_creator.update_metadata(creator='python-libzim',description='Created in python',name='Hola',publisher='Monadical',title='Test Zim') - zim_creator.close() - - def test_article_metadata(self): - import uuid - rnd_str = str(uuid.uuid1()) - zim_creator = ZimCreator(self.test_zim_file_path + '-' + rnd_str + '.zim',main_page = "welcome",index_language= "eng", min_chunk_size= 2048) - zim_creator.update_metadata(**TEST_METADATA) - self.assertEqual(zim_creator._metadata, TEST_METADATA) - zim_creator.close() - - def test_check_mandatory_metadata(self): - import uuid - rnd_str = str(uuid.uuid1()) - zim_creator = 
ZimCreator(self.test_zim_file_path + '-' + rnd_str + '.zim',main_page = "welcome",index_language= "eng", min_chunk_size= 2048) - self.assertFalse(zim_creator.mandatory_metadata_ok()) - zim_creator.update_metadata(creator='python-libzim',description='Created in python',name='Hola',publisher='Monadical',title='Test Zim') - self.assertTrue(zim_creator.mandatory_metadata_ok()) - zim_creator.close() - - - -if __name__ == '__main__': - unittest.main() + +@pytest.fixture(scope="session") +def article(article_content): + return ZimTestArticle(*article_content) + + +def test_write_article(tmpdir, article): + zim_creator = ZimCreator( + str(tmpdir/"test.zim"), + main_page="welcome", + index_language="eng", + min_chunk_size=2048 + ) + zim_creator.add_article(article) + zim_creator.update_metadata( + creator='python-libzim', + description='Created in python', + name='Hola', + publisher='Monadical', + title='Test Zim' + ) + zim_creator.close() + + +def test_article_metadata(tmpdir, metadata): + zim_creator = ZimCreator( + str(tmpdir/"test.zim"), + main_page = "welcome", + index_language= "eng", + min_chunk_size= 2048 + ) + zim_creator.update_metadata(**metadata) + assert zim_creator._metadata == metadata + zim_creator.close() + + +def test_check_mandatory_metadata(tmpdir): + zim_creator = ZimCreator( + str(tmpdir/"test.zim"), + main_page = "welcome", + index_language= "eng", + min_chunk_size= 2048 + ) + assert not zim_creator.mandatory_metadata_ok() + zim_creator.update_metadata( + creator='python-libzim', + description='Created in python', + name='Hola', + publisher='Monadical', + title='Test Zim' + ) + assert zim_creator.mandatory_metadata_ok() + zim_creator.close() From 8ebad9d90e05a22e13579b69af108e5a7082b670 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Tue, 5 May 2020 14:01:01 +0200 Subject: [PATCH 5/7] Use ZimCreator as contextmanager in tests. 
--- tests/test_libzim.py | 73 +++++++++++++++++++++----------------------- 1 file changed, 35 insertions(+), 38 deletions(-) diff --git a/tests/test_libzim.py b/tests/test_libzim.py index daf7ccde..d9290a1e 100644 --- a/tests/test_libzim.py +++ b/tests/test_libzim.py @@ -101,49 +101,46 @@ def article(article_content): def test_write_article(tmpdir, article): - zim_creator = ZimCreator( - str(tmpdir/"test.zim"), + with ZimCreator( + str(tmpdir / "test.zim"), main_page="welcome", index_language="eng", - min_chunk_size=2048 - ) - zim_creator.add_article(article) - zim_creator.update_metadata( - creator='python-libzim', - description='Created in python', - name='Hola', - publisher='Monadical', - title='Test Zim' - ) - zim_creator.close() + min_chunk_size=2048, + ) as zim_creator: + zim_creator.add_article(article) + zim_creator.update_metadata( + creator="python-libzim", + description="Created in python", + name="Hola", + publisher="Monadical", + title="Test Zim", + ) def test_article_metadata(tmpdir, metadata): - zim_creator = ZimCreator( - str(tmpdir/"test.zim"), - main_page = "welcome", - index_language= "eng", - min_chunk_size= 2048 - ) - zim_creator.update_metadata(**metadata) - assert zim_creator._metadata == metadata - zim_creator.close() + with ZimCreator( + str(tmpdir / "test.zim"), + main_page="welcome", + index_language="eng", + min_chunk_size=2048, + ) as zim_creator: + zim_creator.update_metadata(**metadata) + assert zim_creator._metadata == metadata def test_check_mandatory_metadata(tmpdir): - zim_creator = ZimCreator( - str(tmpdir/"test.zim"), - main_page = "welcome", - index_language= "eng", - min_chunk_size= 2048 - ) - assert not zim_creator.mandatory_metadata_ok() - zim_creator.update_metadata( - creator='python-libzim', - description='Created in python', - name='Hola', - publisher='Monadical', - title='Test Zim' - ) - assert zim_creator.mandatory_metadata_ok() - zim_creator.close() + with ZimCreator( + str(tmpdir / "test.zim"), + main_page="welcome", 
+ index_language="eng", + min_chunk_size=2048, + ) as zim_creator: + assert not zim_creator.mandatory_metadata_ok() + zim_creator.update_metadata( + creator="python-libzim", + description="Created in python", + name="Hola", + publisher="Monadical", + title="Test Zim", + ) + assert zim_creator.mandatory_metadata_ok() From 7900d3e548ef17063af630c086cce3ecf1cbd6c3 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Tue, 5 May 2020 14:04:48 +0200 Subject: [PATCH 6/7] Reformat code using black and isort. --- libzim/__init__.py | 28 ++++++++++---------- libzim/examples.py | 54 +++++++++++++++++++++++++-------------- libzim/libzim_wrapper.pxd | 6 ++--- libzim/libzim_wrapper.pyx | 6 +++-- tests/test_libzim.py | 14 +++++----- 5 files changed, 63 insertions(+), 45 deletions(-) diff --git a/libzim/__init__.py b/libzim/__init__.py index 6287e045..c3313053 100644 --- a/libzim/__init__.py +++ b/libzim/__init__.py @@ -18,14 +18,16 @@ # along with this program. If not, see . +import datetime +from collections import defaultdict + import libzim_wrapper ZimBlob = libzim_wrapper.ZimBlob -from collections import defaultdict -import datetime -__all__ = ["ZimArticle", "ZimCreator", "ZimBlob"] +__all__ = ["ZimArticle", "ZimCreator", "ZimBlob"] + class ZimArticle: def __init__(self): @@ -95,14 +97,16 @@ def get_data(self): return ZimBlob(self.metadata_content) -MANDATORY_METADATA_KEYS =[ +MANDATORY_METADATA_KEYS = [ "Name", "Title", "Creator", "Publisher", "Date", "Description", - "Language"] + "Language", +] + class ZimCreator: """ @@ -131,10 +135,8 @@ class ZimCreator: def __init__(self, filename, main_page, index_language, min_chunk_size): print(filename) self._creatorWrapper = libzim_wrapper.ZimCreator( - filename, - main_page, - index_language, - min_chunk_size) + filename, main_page, index_language, min_chunk_size + ) self.filename = filename self.main_page = main_page self.language = index_language @@ -162,15 +164,13 @@ def mandatory_metadata_ok(self): metadata_item_ok = [k in 
self._metadata for k in MANDATORY_METADATA_KEYS] return all(metadata_item_ok) - def update_metadata(self, **kwargs): - "Updates article metadata""" + "Updates article metadata" "" # Converts python case to pascal case. example: long_description-> LongDescription pascalize = lambda keyword: "".join(keyword.title().split("_")) new_metadata = {pascalize(k): v for k, v in kwargs.items()} self._metadata.update(new_metadata) - def write_metadata(self): for key, value in self._metadata.items(): if key == "date" and isinstance(value, datetime.date): @@ -182,9 +182,7 @@ def write_metadata(self): self._creatorWrapper.add_article(article) def _get_counter_string(self): - return ";".join( - ["%s=%s" % (k,v) for (k,v) in self._article_counter.items()] - ) + return ";".join(["%s=%s" % (k, v) for (k, v) in self._article_counter.items()]) def close(self): self.write_metadata() diff --git a/libzim/examples.py b/libzim/examples.py index 132c7dd9..50052792 100644 --- a/libzim/examples.py +++ b/libzim/examples.py @@ -17,10 +17,13 @@ # along with this program. If not, see . +# Write the article +import uuid + from libzim import ZimArticle, ZimBlob, ZimCreator -class ZimTestArticle(ZimArticle): +class ZimTestArticle(ZimArticle): def __init__(self, url, title, content): ZimArticle.__init__(self) self.url = url @@ -35,13 +38,13 @@ def get_url(self): def get_title(self): return f"{self.title}" - + def get_mime_type(self): return "text/html" - + def get_filename(self): return "" - + def should_compress(self): return True @@ -51,34 +54,39 @@ def should_index(self): def get_data(self): return ZimBlob(self.content) + # Create a ZimTestArticle article -content = ''' +content = """ Monadical -

ñññ Hello, it works ñññ

''' +

ñññ Hello, it works ñññ

""" -content2 = ''' +content2 = """ Monadical 2 -

ñññ Hello, it works 2 ñññ

''' +

ñññ Hello, it works 2 ñññ

""" article = ZimTestArticle("Monadical_SAS", "Monadical", content) article2 = ZimTestArticle("Monadical_2", "Monadical 2", content2) print(article.content) -# Write the article -import uuid -rnd_str = str(uuid.uuid1()) + +rnd_str = str(uuid.uuid1()) test_zim_file_path = "/opt/python-libzim/tests/kiwix-test" -zim_creator = ZimCreator(test_zim_file_path + '-' + rnd_str + '.zim',main_page = "Monadical",index_language= "eng", min_chunk_size= 2048) +zim_creator = ZimCreator( + test_zim_file_path + "-" + rnd_str + ".zim", + main_page="Monadical", + index_language="eng", + min_chunk_size=2048, +) # Add articles to zim file zim_creator.add_article(article) @@ -86,7 +94,13 @@ def get_data(self): # Set mandatory metadata if not zim_creator.mandatory_metadata_ok(): - zim_creator.update_metadata(creator='python-libzim',description='Created in python',name='Hola',publisher='Monadical',title='Test Zim') + zim_creator.update_metadata( + creator="python-libzim", + description="Created in python", + name="Hola", + publisher="Monadical", + title="Test Zim", + ) print(zim_creator._get_metadata()) @@ -98,11 +112,13 @@ def get_data(self): rnd_str = str(uuid.uuid1()) -with ZimCreator(test_zim_file_path + '-' + rnd_str + '.zim') as zc: +with ZimCreator(test_zim_file_path + "-" + rnd_str + ".zim") as zc: zc.add_article(article) zc.add_article(article2) - zc.update_metadata(creator='python-libzim', - description='Created in python', - name='Hola',publisher='Monadical', - title='Test Zim') - + zc.update_metadata( + creator="python-libzim", + description="Created in python", + name="Hola", + publisher="Monadical", + title="Test Zim", + ) diff --git a/libzim/libzim_wrapper.pxd b/libzim/libzim_wrapper.pxd index 2b173c4a..adf3a40c 100644 --- a/libzim/libzim_wrapper.pxd +++ b/libzim/libzim_wrapper.pxd @@ -17,15 +17,15 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . 
-from libcpp.string cimport string +from cpython.ref cimport PyObject + from libc.stdint cimport uint32_t, uint64_t from libcpp cimport bool from libcpp.memory cimport shared_ptr +from libcpp.string cimport string from libcpp.vector cimport vector -from cpython.ref cimport PyObject - cdef extern from "zim/blob.h" namespace "zim": cdef cppclass Blob: Blob() except + diff --git a/libzim/libzim_wrapper.pyx b/libzim/libzim_wrapper.pyx index bdee4557..c7dac535 100644 --- a/libzim/libzim_wrapper.pyx +++ b/libzim/libzim_wrapper.pyx @@ -19,7 +19,9 @@ cimport libzim.libzim_wrapper as clibzim + from cython.operator import dereference +from cpython.ref cimport PyObject from libc.stdint cimport uint64_t from libcpp.string cimport string @@ -28,7 +30,8 @@ from libcpp.memory cimport shared_ptr, make_shared import datetime -from cpython.ref cimport PyObject + + ######################### # ZimBlob # @@ -162,4 +165,3 @@ cdef class ZimCreator: with nogil: self.c_creator.finalize() self._finalized = True - diff --git a/tests/test_libzim.py b/tests/test_libzim.py index d9290a1e..73a9c577 100644 --- a/tests/test_libzim.py +++ b/tests/test_libzim.py @@ -17,7 +17,6 @@ # along with this program. If not, see . import pytest -import os,sys,inspect from libzim import ZimArticle, ZimBlob, ZimCreator @@ -29,7 +28,7 @@ def metadata(): return { # Mandatory - "Name" : "wikipedia_fr_football", + "Name": "wikipedia_fr_football", "Title": "English Wikipedia", "Creator": "English speaking Wikipedia contributors", "Publisher": "Wikipedia user Foobar", @@ -43,22 +42,24 @@ def metadata(): "Flavour": "nopic", "Source": "https://en.wikipedia.org/", "Counter": "image/jpeg=5;image/gif=3;image/png=2", - "Scraper": "sotoki 1.2.3" + "Scraper": "sotoki 1.2.3", } + @pytest.fixture(scope="session") def article_content(): - content = ''' + content = """ Monadical -

ñññ Hello, it works ñññ

''' +

ñññ Hello, it works ñññ

""" url = "A/Monadical_SAS" title = "Monadical SAS" mime_type = "text/html" return (content, url, title, mime_type) + class ZimTestArticle(ZimArticle): def __init__(self, content, url, title, mime_type): ZimArticle.__init__(self) @@ -93,7 +94,8 @@ def should_index(self): return True def get_data(self): - return ZimBlob(self.content.encode('UTF-8')) + return ZimBlob(self.content.encode("UTF-8")) + @pytest.fixture(scope="session") def article(article_content): From 2fd3a98ac8fa353043c14cfdb03f8a85e3db7a15 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Tue, 5 May 2020 14:37:41 +0200 Subject: [PATCH 7/7] Move public writer classes in a writer sub-module. Fix #17 --- libzim/__init__.py | 173 +--------------------------------- libzim/examples.py | 18 ++-- libzim/libzim_wrapper.pyx | 12 +-- libzim/writer.py | 190 ++++++++++++++++++++++++++++++++++++++ tests/test_libzim.py | 16 ++-- 5 files changed, 213 insertions(+), 196 deletions(-) create mode 100644 libzim/writer.py diff --git a/libzim/__init__.py b/libzim/__init__.py index c3313053..b5b0eec2 100644 --- a/libzim/__init__.py +++ b/libzim/__init__.py @@ -18,175 +18,6 @@ # along with this program. If not, see . 
-import datetime -from collections import defaultdict +from libzim_wrapper import Blob -import libzim_wrapper - -ZimBlob = libzim_wrapper.ZimBlob - - -__all__ = ["ZimArticle", "ZimCreator", "ZimBlob"] - - -class ZimArticle: - def __init__(self): - self._blob = None - - def get_url(self): - raise NotImplementedError - - def get_title(self): - raise NotImplementedError - - def is_redirect(self): - raise NotImplementedError - - def get_mime_type(self): - raise NotImplementedError - - def get_filename(self): - raise NotImplementedError - - def should_compress(self): - raise NotImplementedError - - def should_index(self): - raise NotImplementedError - - def redirect_url(self): - raise NotImplementedError - - def _get_data(self): - if self._blob is None: - self._blob = self.get_data() - return self._blob - - def get_data(self): - raise NotImplementedError - - -class ZimMetadataArticle(ZimArticle): - def __init__(self, url, metadata_content): - ZimArticle.__init__(self) - self.url = url - self.metadata_content = metadata_content - - def is_redirect(self): - return False - - def get_url(self): - return f"M/{self.url}" - - def get_title(self): - return "" - - def get_mime_type(self): - return "text/plain" - - def get_filename(self): - return "" - - def should_compress(self): - return True - - def should_index(self): - return False - - def get_data(self): - return ZimBlob(self.metadata_content) - - -MANDATORY_METADATA_KEYS = [ - "Name", - "Title", - "Creator", - "Publisher", - "Date", - "Description", - "Language", -] - - -class ZimCreator: - """ - A class to represent a Zim Creator. 
- - Attributes - ---------- - *c_creator : zim.ZimCreatorWrapper - a pointer to the C++ Creator object - _finalized : bool - flag if the creator was finalized - _filename : str - Zim file path - _main_page : str - Zim file main page - _index_language : str - Zim file Index language - _min_chunk_size : str - Zim file minimum chunk size - _article_counter - Zim file article counter - _metadata - Zim file metadata - """ - - def __init__(self, filename, main_page, index_language, min_chunk_size): - print(filename) - self._creatorWrapper = libzim_wrapper.ZimCreator( - filename, main_page, index_language, min_chunk_size - ) - self.filename = filename - self.main_page = main_page - self.language = index_language - self._metadata = {} - self._article_counter = defaultdict(int) - self.update_metadata(date=datetime.date.today(), language=index_language) - - def __enter__(self): - return self - - def __exit__(self, *args): - self.close() - - def add_article(self, article): - self._creatorWrapper.add_article(article) - if not article.is_redirect(): - self._update_article_counter(article) - - def _update_article_counter(self, article): - # default dict update - self._article_counter[article.get_mime_type().strip()] += 1 - - def mandatory_metadata_ok(self): - """Flag if mandatory metadata is complete and not empty""" - metadata_item_ok = [k in self._metadata for k in MANDATORY_METADATA_KEYS] - return all(metadata_item_ok) - - def update_metadata(self, **kwargs): - "Updates article metadata" "" - # Converts python case to pascal case. 
example: long_description-> LongDescription - pascalize = lambda keyword: "".join(keyword.title().split("_")) - new_metadata = {pascalize(k): v for k, v in kwargs.items()} - self._metadata.update(new_metadata) - - def write_metadata(self): - for key, value in self._metadata.items(): - if key == "date" and isinstance(value, datetime.date): - value = value.strftime("%Y-%m-%d") - article = ZimMetadataArticle(key, value) - self._creatorWrapper.add_article(article) - - article = ZimMetadataArticle("Counter", self._get_counter_string()) - self._creatorWrapper.add_article(article) - - def _get_counter_string(self): - return ";".join(["%s=%s" % (k, v) for (k, v) in self._article_counter.items()]) - - def close(self): - self.write_metadata() - self._creatorWrapper.finalize() - - def __repr__(self): - return f"ZimCreator(filename={self.filename})" +__all__ = ["Blob"] diff --git a/libzim/examples.py b/libzim/examples.py index 50052792..9556d9b4 100644 --- a/libzim/examples.py +++ b/libzim/examples.py @@ -20,12 +20,12 @@ # Write the article import uuid -from libzim import ZimArticle, ZimBlob, ZimCreator +from libzim.writer import Article, Blob, Creator -class ZimTestArticle(ZimArticle): +class TestArticle(Article): def __init__(self, url, title, content): - ZimArticle.__init__(self) + Article.__init__(self) self.url = url self.title = title self.content = content @@ -52,10 +52,10 @@ def should_index(self): return True def get_data(self): - return ZimBlob(self.content) + return Blob(self.content) -# Create a ZimTestArticle article +# Create a TestArticle article content = """ @@ -71,8 +71,8 @@ def get_data(self):

ñññ Hello, it works 2 ñññ

""" -article = ZimTestArticle("Monadical_SAS", "Monadical", content) -article2 = ZimTestArticle("Monadical_2", "Monadical 2", content2) +article = TestArticle("Monadical_SAS", "Monadical", content) +article2 = TestArticle("Monadical_2", "Monadical 2", content2) print(article.content) @@ -81,7 +81,7 @@ def get_data(self): test_zim_file_path = "/opt/python-libzim/tests/kiwix-test" -zim_creator = ZimCreator( +zim_creator = Creator( test_zim_file_path + "-" + rnd_str + ".zim", main_page="Monadical", index_language="eng", @@ -112,7 +112,7 @@ def get_data(self): rnd_str = str(uuid.uuid1()) -with ZimCreator(test_zim_file_path + "-" + rnd_str + ".zim") as zc: +with Creator(test_zim_file_path + "-" + rnd_str + ".zim") as zc: zc.add_article(article) zc.add_article(article2) zc.update_metadata( diff --git a/libzim/libzim_wrapper.pyx b/libzim/libzim_wrapper.pyx index c7dac535..21105bdc 100644 --- a/libzim/libzim_wrapper.pyx +++ b/libzim/libzim_wrapper.pyx @@ -34,10 +34,10 @@ import datetime ######################### -# ZimBlob # +# Blob # ######################### -cdef class ZimBlob: +cdef class Blob: cdef clibzim.Blob* c_blob cdef bytes ref_content @@ -53,10 +53,6 @@ cdef class ZimBlob: del self.c_blob -######################### -# ZimArticle # -######################### - #------ Helper for pure virtual methods -------- cdef get_article_method_from_object(object obj, string method, int *error) with gil: @@ -80,7 +76,7 @@ cdef public api: clibzim.Blob blob_cy_call_fct(object obj, string method, int *error) with gil: """Lookup and execute a pure virtual method on ZimArticle returning a Blob""" - cdef ZimBlob blob + cdef Blob blob func = get_article_method_from_object(obj, method, error) blob = func() @@ -96,7 +92,7 @@ cdef public api: func = get_article_method_from_object(obj, method, error) return func() -cdef class ZimCreator: +cdef class Creator: """ A class to represent a Zim Creator. 
diff --git a/libzim/writer.py b/libzim/writer.py new file mode 100644 index 00000000..a365e9f1 --- /dev/null +++ b/libzim/writer.py @@ -0,0 +1,190 @@ +# This file is part of python-libzim +# (see https://github.com/libzim/python-libzim) +# +# Copyright (c) 2020 Juan Diego Caballero +# Copyright (c) 2020 Matthieu Gautier +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + + +import datetime +from collections import defaultdict + +import libzim_wrapper +from libzim_wrapper import Blob + +__all__ = ["Article", "Blob", "Creator"] + + +class Article: + def __init__(self): + self._blob = None + + def get_url(self): + raise NotImplementedError + + def get_title(self): + raise NotImplementedError + + def is_redirect(self): + raise NotImplementedError + + def get_mime_type(self): + raise NotImplementedError + + def get_filename(self): + raise NotImplementedError + + def should_compress(self): + raise NotImplementedError + + def should_index(self): + raise NotImplementedError + + def redirect_url(self): + raise NotImplementedError + + def _get_data(self): + if self._blob is None: + self._blob = self.get_data() + return self._blob + + def get_data(self): + raise NotImplementedError + + +class MetadataArticle(Article): + def __init__(self, url, metadata_content): + Article.__init__(self) + self.url = url + self.metadata_content = metadata_content + + def is_redirect(self): + return False + + def get_url(self): + return 
f"M/{self.url}" + + def get_title(self): + return "" + + def get_mime_type(self): + return "text/plain" + + def get_filename(self): + return "" + + def should_compress(self): + return True + + def should_index(self): + return False + + def get_data(self): + return Blob(self.metadata_content) + + +MANDATORY_METADATA_KEYS = [ + "Name", + "Title", + "Creator", + "Publisher", + "Date", + "Description", + "Language", +] + + +class Creator: + """ + A class to represent a Zim Creator. + + Attributes + ---------- + *c_creator : zim.Creator + a pointer to the C++ Creator object + _finalized : bool + flag if the creator was finalized + _filename : str + Zim file path + _main_page : str + Zim file main page + _index_language : str + Zim file Index language + _min_chunk_size : str + Zim file minimum chunk size + _article_counter + Zim file article counter + _metadata + Zim file metadata + """ + + def __init__(self, filename, main_page, index_language, min_chunk_size): + print(filename) + self._creatorWrapper = libzim_wrapper.Creator( + filename, main_page, index_language, min_chunk_size + ) + self.filename = filename + self.main_page = main_page + self.language = index_language + self._metadata = {} + self._article_counter = defaultdict(int) + self.update_metadata(date=datetime.date.today(), language=index_language) + + def __enter__(self): + return self + + def __exit__(self, *args): + self.close() + + def add_article(self, article): + self._creatorWrapper.add_article(article) + if not article.is_redirect(): + self._update_article_counter(article) + + def _update_article_counter(self, article): + # default dict update + self._article_counter[article.get_mime_type().strip()] += 1 + + def mandatory_metadata_ok(self): + """Flag if mandatory metadata is complete and not empty""" + metadata_item_ok = [k in self._metadata for k in MANDATORY_METADATA_KEYS] + return all(metadata_item_ok) + + def update_metadata(self, **kwargs): + "Updates article metadata" "" + # Converts python case 
to pascal case. example: long_description-> LongDescription + pascalize = lambda keyword: "".join(keyword.title().split("_")) + new_metadata = {pascalize(k): v for k, v in kwargs.items()} + self._metadata.update(new_metadata) + + def write_metadata(self): + for key, value in self._metadata.items(): + if key == "date" and isinstance(value, datetime.date): + value = value.strftime("%Y-%m-%d") + article = MetadataArticle(key, value) + self._creatorWrapper.add_article(article) + + article = MetadataArticle("Counter", self._get_counter_string()) + self._creatorWrapper.add_article(article) + + def _get_counter_string(self): + return ";".join(["%s=%s" % (k, v) for (k, v) in self._article_counter.items()]) + + def close(self): + self.write_metadata() + self._creatorWrapper.finalize() + + def __repr__(self): + return f"Creator(filename={self.filename})" diff --git a/tests/test_libzim.py b/tests/test_libzim.py index 73a9c577..011d8f55 100644 --- a/tests/test_libzim.py +++ b/tests/test_libzim.py @@ -18,7 +18,7 @@ import pytest -from libzim import ZimArticle, ZimBlob, ZimCreator +from libzim.writer import Article, Blob, Creator # test files https://wiki.kiwix.org/wiki/Content_in_all_languages @@ -60,9 +60,9 @@ def article_content(): return (content, url, title, mime_type) -class ZimTestArticle(ZimArticle): +class SimpleArticle(Article): def __init__(self, content, url, title, mime_type): - ZimArticle.__init__(self) + Article.__init__(self) self.content = content self.url = url self.title = title @@ -94,16 +94,16 @@ def should_index(self): return True def get_data(self): - return ZimBlob(self.content.encode("UTF-8")) + return Blob(self.content.encode("UTF-8")) @pytest.fixture(scope="session") def article(article_content): - return ZimTestArticle(*article_content) + return SimpleArticle(*article_content) def test_write_article(tmpdir, article): - with ZimCreator( + with Creator( str(tmpdir / "test.zim"), main_page="welcome", index_language="eng", @@ -120,7 +120,7 @@ def 
test_write_article(tmpdir, article): def test_article_metadata(tmpdir, metadata): - with ZimCreator( + with Creator( str(tmpdir / "test.zim"), main_page="welcome", index_language="eng", @@ -131,7 +131,7 @@ def test_article_metadata(tmpdir, metadata): def test_check_mandatory_metadata(tmpdir): - with ZimCreator( + with Creator( str(tmpdir / "test.zim"), main_page="welcome", index_language="eng",