diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 00000000..993c8aea
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,77 @@
+name: release
+on:
+  # the `release` event is filtered by `types` only; it takes no `tags` filter
+  release:
+    types: [published]
+
+env:
+  LIBZIM_RELEASE: libzim_linux-x86_64-6.1.1
+  LIBZIM_LIBRARY_PATH: lib/x86_64-linux-gnu/libzim.so.6.1.1
+  LIBZIM_INCLUDE_PATH: include/zim
+  CYTHON_VERSION: 0.29.6
+
+jobs:
+  release:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest]
+        # TODO: expand this to cross-platform builds (see V2 approach below)
+        # os: [ubuntu-latest, windows-latest, macos-latest]
+        # quoted so YAML keeps the versions as strings (an unquoted 3.10 would load as 3.1)
+        python-version: ['3.6', '3.7', '3.8']
+
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v1
+        with:
+          python-version: ${{ matrix.python-version }}
+          architecture: x64
+
+      - name: Cache libzim dylib & headers
+        uses: actions/cache@v1
+        id: cache-libzim
+        with:
+          path: libzim_linux
+          key: ${{ env.LIBZIM_RELEASE }}-libzim-cache
+
+      - name: Download libzim dylib & headers from OpenZIM.org releases
+        if: steps.cache-libzim.outputs.cache-hit != 'true'
+        run: |
+          wget -q https://download.openzim.org/release/libzim/$LIBZIM_RELEASE.tar.gz
+          tar --gunzip --extract --file=$LIBZIM_RELEASE.tar.gz
+          mv $LIBZIM_RELEASE libzim_linux
+
+      - name: Link libzim dylib & headers into workspace lib and include folders
+        run: |
+          cp -p $GITHUB_WORKSPACE/libzim_linux/$LIBZIM_LIBRARY_PATH lib/libzim.so
+          cp -p $GITHUB_WORKSPACE/libzim_linux/$LIBZIM_LIBRARY_PATH lib/
+          sudo ldconfig $GITHUB_WORKSPACE/lib
+          ln -s $GITHUB_WORKSPACE/libzim_linux/$LIBZIM_INCLUDE_PATH include/zim
+
+      - name: Build cython, sdist, and bdist_wheels
+        run: |
+          pip install --upgrade cython==$CYTHON_VERSION setuptools pip wheel cibuildwheel
+          python3 setup.py build_ext
+          python3 setup.py sdist bdist_wheel
+          python -m cibuildwheel --output-dir wheelhouse
+
+      - uses: actions/upload-artifact@v1
+        with:
+          name: wheels
+          path: ./wheelhouse
+
+      - name: Push release to PyPI
+        # this workflow is only triggered by `release` events, so that is the event to gate on
+        # NOTE(review): dist/ also holds the plain linux_x86_64 wheel from bdist_wheel; confirm PyPI accepts it
+        if: github.event_name == 'release'
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          user: __token__
+          password: ${{ secrets.PYPI_API_TOKEN }}
+          # each matrix job re-uploads the same sdist; skip files that already exist
+          skip-existing: true
+          # TODO: remove this line to upload to the real PyPI when ready
+          repository-url: https://test.pypi.org/legacy/
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 00000000..efc8f639
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,88 @@
+name: test
+on: [push]
+
+env:
+  LIBZIM_RELEASE: libzim_linux-x86_64-6.1.1
+  LIBZIM_LIBRARY_PATH: lib/x86_64-linux-gnu/libzim.so.6.1.1
+  LIBZIM_INCLUDE_PATH: include/zim
+  CYTHON_VERSION: 0.29.6
+  MAX_LINE_LENGTH: 110
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Set up Python 3.6
+        uses: actions/setup-python@v1
+        with:
+          python-version: '3.6'
+          architecture: x64
+
+      - name: Autoformat with black
+        run: |
+          pip install black
+          black --check --exclude=setup.py .
+
+      - name: Lint with flake8
+        run: |
+          pip install flake8
+          # one pass for show-stopper syntax errors or undefined names
+          flake8 . --count --select=E9,F63,F7,F82 --exclude=setup.py --show-source --statistics
+          # one pass for small stylistic things
+          flake8 . --count --exclude=setup.py --max-line-length=$MAX_LINE_LENGTH --statistics
+
+  test:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest]
+        # TODO: expand this once macos and windows libzim releases become available
+        # os: [ubuntu-latest, windows-latest, macos-latest]
+        # alternatively we can compile libzim in docker and use the container as an action
+        python: ['3.6', '3.7', '3.8']
+
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Set up Python ${{ matrix.python }}
+        uses: actions/setup-python@v1
+        with:
+          python-version: ${{ matrix.python }}
+          architecture: x64
+
+      - name: Cache libzim dylib & headers
+        uses: actions/cache@v1
+        id: cache-libzim
+        with:
+          path: libzim_linux
+          key: ${{ env.LIBZIM_RELEASE }}-libzim-cache
+
+      - name: Download libzim dylib & headers from OpenZIM.org releases
+        if: steps.cache-libzim.outputs.cache-hit != 'true'
+        run: |
+          wget -q https://download.openzim.org/release/libzim/$LIBZIM_RELEASE.tar.gz
+          tar --gunzip --extract --file=$LIBZIM_RELEASE.tar.gz
+          mv $LIBZIM_RELEASE libzim_linux
+
+      - name: Link libzim dylib & headers into workspace lib and include folders
+        run: |
+          cp -p $GITHUB_WORKSPACE/libzim_linux/$LIBZIM_LIBRARY_PATH lib/libzim.so
+          cp -p $GITHUB_WORKSPACE/libzim_linux/$LIBZIM_LIBRARY_PATH lib/
+          sudo ldconfig $GITHUB_WORKSPACE/lib
+          ln -s $GITHUB_WORKSPACE/libzim_linux/$LIBZIM_INCLUDE_PATH include/zim
+
+      - name: Build cython, sdist, and bdist_wheel
+        run: |
+          pip install --upgrade cython==$CYTHON_VERSION setuptools pip wheel
+          python3 setup.py build_ext
+          python3 setup.py sdist bdist_wheel
+
+      - name: Test built package with pytest
+        run: |
+          export LD_LIBRARY_PATH=$PWD/lib:$LD_LIBRARY_PATH
+          sudo ldconfig
+          pip install pytest
+          pip install -e .
+          pytest .
diff --git a/.gitignore b/.gitignore
index 020ff6f4..c1568563 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,27 @@
+# General cruft
+.DS_Store
+.venv
+.venv-docker
+.mypy_cache
 __pycache__
-build
+*.pyc
+.tox
+
+# Compiled binaries and package builds
+*.so
+build/
+dist/
+*.egg-info/
+
+# Dylibs and headers (keep the placeholder READMEs that explain what goes here)
+lib/*
+!lib/README.md
+include/*
+!include/README.md
+libzim_linux-*/
+
+# Autogenerated files
 libzim_wrapper.*.so
 libzim/libzim_wrapper.cpp
 libzim/libzim_wrapper.h
 libzim/libzim_wrapper_api.h
-*.egg-info
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 00000000..066d1e08
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,8 @@
+include LICENSE
+include README.md
+include tests/*.py
+include pyproject.toml
+
+recursive-include lib *
+recursive-include include *
+recursive-include libzim *
diff --git a/Pipfile b/Pipfile
new file mode 100644
index 00000000..fc479fd6
--- /dev/null
+++ b/Pipfile
@@ -0,0 +1,11 @@
+[[source]]
+name = "pypi"
+url = "https://pypi.org/simple"
+verify_ssl = true
+
+[dev-packages]
+pytest = "*"
+cython = "==0.29.6"
+libzim_wrapper = {editable = true, path = "."}
+
+[packages]
diff --git a/libzim/examples.py b/examples/basic.py
similarity index 100%
rename from libzim/examples.py
rename to examples/basic.py
diff --git a/include/README.md b/include/README.md
new file mode 100644
index 00000000..4d585623
--- /dev/null
+++ b/include/README.md
@@ -0,0 +1 @@
+Put your zim/*.h folder in here.
diff --git a/lib/README.md b/lib/README.md
new file mode 100644
index 00000000..a33fa528
--- /dev/null
+++ b/lib/README.md
@@ -0,0 +1 @@
+Put your libzim.so file in here.
diff --git a/libzim/reader.py b/libzim/reader.py
index 345f3b7d..645dcb92 100644
--- a/libzim/reader.py
+++ b/libzim/reader.py
@@ -1,2 +1,4 @@
+# flake8: noqa
+
 from libzim_wrapper import File
 from libzim_wrapper import ReadArticle as Article
diff --git a/libzim/writer.py b/libzim/writer.py
index 65a3633a..af8f2fbc 100644
--- a/libzim/writer.py
+++ b/libzim/writer.py
@@ -106,6 +106,11 @@ def get_data(self):
 ]
 
 
+def pascalize(keyword):
+    """Convert a snake_case keyword to PascalCase, e.g. long_description -> LongDescription."""
+    return "".join(keyword.title().split("_"))
+
+
 class Creator:
     """
     A class to represent a Zim Creator.
@@ -132,9 +137,7 @@ class Creator:
 
     def __init__(self, filename, main_page, index_language, min_chunk_size):
         print(filename)
-        self._creatorWrapper = libzim_wrapper.Creator(
-            filename, main_page, index_language, min_chunk_size
-        )
+        self._creatorWrapper = libzim_wrapper.Creator(filename, main_page, index_language, min_chunk_size)
         self.filename = filename
         self.main_page = main_page
         self.language = index_language
@@ -164,8 +167,6 @@ def mandatory_metadata_ok(self):
 
     def update_metadata(self, **kwargs):
         "Updates article metadata" ""
-        # Converts python case to pascal case. example: long_description-> LongDescription
-        pascalize = lambda keyword: "".join(keyword.title().split("_"))
         new_metadata = {pascalize(k): v for k, v in kwargs.items()}
         self._metadata.update(new_metadata)
 
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..fbc2f891
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,7 @@
+[build-system]
+requires = [ "setuptools >= 40.8.0", "wheel >= 0.29.0", "cython == 0.29.6" ]
+build-backend = "setuptools.build_meta"
+
+[tool.black]
+line-length = 110
+target-version = ['py36', 'py37', 'py38']
diff --git a/setup.py b/setup.py
index 8a05d11c..736e3054 100755
--- a/setup.py
+++ b/setup.py
@@ -1,30 +1,149 @@
-import os
-from distutils.core import setup
-from distutils.extension import Extension
+#!/usr/bin/env python3
+"""
+python-libzim (the openzim/libzim bindings for Python)
+
+The project is compiled in two steps:
+
+ 1. Cython: compile the cython format files (.pyx, .pxd) to C++ (.cpp and .h)
+ 2. Cythonize: compile the generated C++ to a python-importable binary extension .so
+
+The Cython and Cythonize compilation is done automatically during packaging with setup.py:
+
+ $ python3 setup.py build_ext
+ $ python3 setup.py sdist bdist_wheel
+
+
+To compile or run this project, you must first get the libzim headers & binary:
+
+ - You can get the headers here and build and install the binary from source:
+   https://github.com/openzim/libzim
+
+ - Or you can download a full prebuilt release (if one exists for your platform):
+   https://download.openzim.org/release/libzim/
+
+Either place the `libzim.so` and `zim/*.h` files in `./lib/` and `./include/`,
+  or set these environment variables to use custom libzim header and dylib paths:
+
+ $ export CFLAGS="-I/tmp/libzim_linux-x86_64-6.1.1/include"
+ $ export LDFLAGS="-L/tmp/libzim_linux-x86_64-6.1.1/lib/x86_64-linux-gnu"
+ $ export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/tmp/libzim_linux-x86_64-6.1.1/lib/x86_64-linux-gnu"
+"""
+from pathlib import Path
+from ctypes.util import find_library
+
+from setuptools import setup, Extension
 from Cython.Build import cythonize
 
-# Utility function to read the README file.
-# Used for the long_description. It's nice, because now 1) we have a top levegit checkout masterl
-# README file and 2) it's easier to type in the README file than to put a raw
-# string in below ...
-def read(fname):
-    return open(os.path.join(os.path.dirname(__file__), fname)).read()
+
+PACKAGE_NAME = "libzim_wrapper"
+VERSION = "0.0.1"  # version of these bindings; TODO: peg to the libzim release version once released together
+LICENSE = "GPLv3+"
+DESCRIPTION = "A python-facing API for creating and interacting with ZIM files"
+AUTHOR = "Monadical Inc."
+AUTHOR_EMAIL = "jdc@monadical.com"
+GITHUB_URL = "https://github.com/openzim/python-libzim"
+
+BASE_DIR = Path(__file__).parent
+BINDINGS_CYTHON_DIR = 'libzim'  # the cython binding source dir (containing .pyx, .pxd, etc.)
+LIBZIM_INCLUDE_DIR = 'include'  # the libzim C++ header src dir (containing zim/*.h)
+LIBZIM_LIBRARY_DIR = 'lib'  # the libzim .so binary lib dir (containing libzim.so)
+
+
+# Check for the CPP Libzim library headers in expected directory
+if not (BASE_DIR / LIBZIM_INCLUDE_DIR / 'zim/zim.h').exists():
+    print(
+        f"[!] Warning: Couldn't find zim/*.h in ./{LIBZIM_INCLUDE_DIR}!\n"
+        "    Hint: You can install them from source from https://github.com/openzim/libzim\n"
+        f"          or download a prebuilt release's headers into ./{LIBZIM_INCLUDE_DIR}/zim/*.h\n"
+        "          (or set CFLAGS='-I/tmp/libzim_linux-x86_64-6.1.1/include')"
+    )
+
+# Check for the CPP Libzim shared library in expected directory or system paths
+if not ((BASE_DIR / LIBZIM_LIBRARY_DIR / 'libzim.so').exists() or find_library('zim')):
+    print(
+        f"[!] Warning: Couldn't find libzim.so in ./{LIBZIM_LIBRARY_DIR} or system library paths!\n"
+        "    Hint: You can install it from source from https://github.com/openzim/libzim\n"
+        f"          or download a prebuilt libzim.so release into ./{LIBZIM_LIBRARY_DIR}.\n"
+        "          (or set LDFLAGS='-L/tmp/libzim_linux-x86_64-6.1.1/lib/x86_64-linux-gnu')"
+    )
 
 setup(
-    name = "python-libzim",
-    version = "0.0.1",
-    author = "Monadical SAS",
-    author_email = "hello@monadical.com",
-    description = ("A python-facing API for creating and interacting with ZIM files"),
-    license = "GPLv3+",
-    long_description=read('README.md'),
-    ext_modules = cythonize([
-        Extension("libzim_wrapper", ["libzim/*.pyx", "libzim/lib.cxx"],
-            include_dirs=["libzim"],
-            libraries=["zim"],
-            extra_compile_args=["-std=c++11"],
-            language="c++"),
+    name=PACKAGE_NAME,
+    version=VERSION,
+    url=GITHUB_URL,
+    project_urls={
+        'Source': GITHUB_URL,
+        'Bug Tracker': f'{GITHUB_URL}/issues',
+        'Changelog': f'{GITHUB_URL}/releases',
+        'Documentation': f'{GITHUB_URL}/blob/master/README.md',
+        'Donate': 'https://www.kiwix.org/en/support-us/',
+    },
+    author=AUTHOR,
+    author_email=AUTHOR_EMAIL,
+    license=LICENSE,
+    description=DESCRIPTION,
+    long_description=(BASE_DIR / 'README.md').read_text(encoding="utf-8"),
+    long_description_content_type="text/markdown",
+    python_requires='>=3.6',
+    include_package_data=True,
+    ext_modules=cythonize(
+        [
+            Extension(
+                "libzim_wrapper",
+                sources=[
+                    f"{BINDINGS_CYTHON_DIR}/*.pyx",
+                    f"{BINDINGS_CYTHON_DIR}/lib.cxx",
+                ],
+                include_dirs=[
+                    BINDINGS_CYTHON_DIR,
+                    LIBZIM_INCLUDE_DIR,
+                ],
+                libraries=[
+                    'zim',
+                ],
+                library_dirs=[
+                    LIBZIM_LIBRARY_DIR,
+                ],
+                extra_compile_args=[
+                    "-std=c++11",
+                    "-Wall",
+                    "-Wextra",
+                ],
+                language="c++",
+            )
     ],
-    compiler_directives={'language_level' : "3"}
+        compiler_directives={"language_level" : "3"},
     ),
+    zip_safe=False,
+    classifiers=[
+        "Development Status :: 3 - Alpha",
+
+        "Topic :: Utilities",
+        "Topic :: Software Development :: Libraries",
+        "Topic :: Software Development :: Libraries :: Python Modules",
+        "Topic :: System :: Archiving",
+        "Topic :: System :: Archiving :: Compression",
+        "Topic :: System :: Archiving :: Mirroring",
+        "Topic :: System :: Archiving :: Backup",
+        "Topic :: Internet :: WWW/HTTP",
+        "Topic :: Internet :: WWW/HTTP :: Indexing/Search",
+        "Topic :: Sociology :: History",
+
+        "Intended Audience :: Developers",
+        "Intended Audience :: Education",
+        "Intended Audience :: End Users/Desktop",
+        "Intended Audience :: Information Technology",
+        "Intended Audience :: System Administrators",
+
+        "Programming Language :: Cython",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
+        # "Typing :: Typed",
+
+        "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
+        "Natural Language :: English",
+        "Operating System :: OS Independent",
+    ],
 )
diff --git a/tests/test_libzim.py b/tests/test_libzim.py
index 011d8f55..eb47d107 100644
--- a/tests/test_libzim.py
+++ b/tests/test_libzim.py
@@ -36,7 +36,10 @@ def metadata():
         "Description": "All articles (without images) from the english Wikipedia",
         "Language": "eng",
         # Optional
-        "Longdescription": "This ZIM file contains all articles (without images) from the english Wikipedia by 2009-11-10. The topics are ...",
+        "Longdescription": (
+            "This ZIM file contains all articles (without images) from the english Wikipedia by 2009-11-10."
+            " The topics are ..."
+        ),
         "Licence": "CC-BY",
         "Tags": "wikipedia;_category:wikipedia;_pictures:no;_videos:no;_details:yes;_ftindex:yes",
         "Flavour": "nopic",
@@ -104,10 +107,7 @@ def article(article_content):
 
 def test_write_article(tmpdir, article):
     with Creator(
-        str(tmpdir / "test.zim"),
-        main_page="welcome",
-        index_language="eng",
-        min_chunk_size=2048,
+        str(tmpdir / "test.zim"), main_page="welcome", index_language="eng", min_chunk_size=2048,
     ) as zim_creator:
         zim_creator.add_article(article)
         zim_creator.update_metadata(
@@ -121,10 +121,7 @@ def test_write_article(tmpdir, article):
 
 def test_article_metadata(tmpdir, metadata):
     with Creator(
-        str(tmpdir / "test.zim"),
-        main_page="welcome",
-        index_language="eng",
-        min_chunk_size=2048,
+        str(tmpdir / "test.zim"), main_page="welcome", index_language="eng", min_chunk_size=2048,
     ) as zim_creator:
         zim_creator.update_metadata(**metadata)
         assert zim_creator._metadata == metadata
@@ -132,10 +129,7 @@ def test_article_metadata(tmpdir, metadata):
 
 def test_check_mandatory_metadata(tmpdir):
     with Creator(
-        str(tmpdir / "test.zim"),
-        main_page="welcome",
-        index_language="eng",
-        min_chunk_size=2048,
+        str(tmpdir / "test.zim"), main_page="welcome", index_language="eng", min_chunk_size=2048,
     ) as zim_creator:
         assert not zim_creator.mandatory_metadata_ok()
         zim_creator.update_metadata(
diff --git a/tests/test_libzim_file_reader.py b/tests/test_libzim_file_reader.py
index fefc1b1b..eb376e89 100644
--- a/tests/test_libzim_file_reader.py
+++ b/tests/test_libzim_file_reader.py
@@ -10,34 +10,33 @@
 
 ZIMFILES = [
     {
-        'filename': str(DATA_DIR/"wikipedia_es_physics_mini.zim"),
-        'checksum': u"99ea7a5598c6040c4f50b8ac0653b703",
-        'namespaces': u"-AIMX",
-        'article_count': 22027,
-        'main_page_url': u"A/index",
+        "filename": str(DATA_DIR / "wikipedia_es_physics_mini.zim"),
+        "checksum": "99ea7a5598c6040c4f50b8ac0653b703",
+        "namespaces": "-AIMX",
+        "article_count": 22027,
+        "main_page_url": "A/index",
     }
 ]
 
 
-
-
 @pytest.fixture(params=ZIMFILES)
 def zimdata(request):
     return request.param
 
+
 @pytest.fixture
 def reader(zimdata):
-    return File(zimdata['filename'])
+    return File(zimdata["filename"])
 
 
 @pytest.fixture
 def article_data():
     return {
-        'url': u"A/Albert_Einstein",
-        'title': u"Albert Einstein",
-        'mimetype':u"text/html",
-        'article_id': 663,
-        'size': 17343
+        "url": "A/Albert_Einstein",
+        "title": "Albert Einstein",
+        "mimetype": "text/html",
+        "article_id": 663,
+        "size": 17343,
     }
 
 
@@ -45,57 +44,68 @@ def test_zim_filename(reader, zimdata):
     for k, v in zimdata.items():
         assert getattr(reader, k) == v
 
+
 def test_zim_read(reader, article_data):
-    article = reader.get_article(article_data['url'])
+    article = reader.get_article(article_data["url"])
 
-    assert article.longurl == article_data['url']
-    assert article.title == article_data['title']
-    assert article.url == article_data['url'][2:]
-    assert article.mimetype == article_data['mimetype']
+    assert article.longurl == article_data["url"]
+    assert article.title == article_data["title"]
+    assert article.url == article_data["url"][2:]
+    assert article.mimetype == article_data["mimetype"]
     assert isinstance(article.content, memoryview)
-    assert len(article.content) == article_data['size']
+    assert len(article.content) == article_data["size"]
+
 
 def test_content_ref_keep(reader):
     """Get the memoryview on a content and loose the reference on the article.
     We try to load a lot of other articles to detect possible use of dandling pointer
     """
-    content =None
+    content = None
+
     def get_content():
         nonlocal content
-        article = reader.get_article(u"A/Albert_Einstein")
+        article = reader.get_article("A/Albert_Einstein")
         assert isinstance(article.content, memoryview)
         content = article.content
-    get_content() # Now we have a content but no reference to the article.
+
+    get_content()  # Now we have a content but no reference to the article.
     gc.collect()
 
     # Load a lot of content
     for i in range(0, reader.article_count, 2):
         article = reader.get_article_by_id(i)
         if not article.is_redirect:
-            c = article.content
+            _ = article.content
 
     # Check everything is ok
     assert len(content) == 17343
-    assert bytes(content[:100]) == b'<!DOCTYPE html>\n<html class="client-js"><head>\n  <meta charset="UTF-8">\n  <title>Albert Einstein</ti'
+    assert (
+        bytes(content[:100])
+        == b'<!DOCTYPE html>\n<html class="client-js"><head>\n  <meta charset="UTF-8">\n  <title>Albert Einstein</ti'  # noqa
+    )
+
 
 def test_get_article_by_id(reader, article_data):
     return
-    article = reader.get_article_by_id(article_data['article_id'])
+    article = reader.get_article_by_id(article_data["article_id"])
+
+    assert article.longurl == article_data["url"]
+    assert article.title == article_data["title"]
+    assert article.url == article_data["url"][2:]
+    assert article.mimetype == article_data["mimetype"]
 
-    assert article.longurl == article_data['url']
-    assert article.title == article_data['title']
-    assert article.url == article_data['url'][2:]
-    assert article.mimetype == article_data['mimetype']
 
 def test_namespace_count(reader):
     namespaces = reader.namespaces
     num_articles = sum(reader.get_namespaces_count(ns) for ns in namespaces)
     assert reader.article_count == num_articles
 
+
 def test_suggest(reader):
-    results = reader.suggest(u"Einstein")
-    assert u"A/Albert_Einstein" in list(results)
+    results = reader.suggest("Einstein")
+    assert "A/Albert_Einstein" in list(results)
+
 
 def test_search(reader):
-    results = reader.search(u"Einstein")
+    results = reader.search("Einstein")
     assert len(list(results)) == 10