diff --git a/.github/workflows/Publish.yaml b/.github/workflows/Publish.yaml new file mode 100644 index 0000000..76961dd --- /dev/null +++ b/.github/workflows/Publish.yaml @@ -0,0 +1,42 @@ +name: Build and upload to PyPI + +on: + release: + types: [published] + +jobs: + publish: + runs-on: ubuntu-22.04 + permissions: + id-token: write # mandatory for PyPI trusted publishing + + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version-file: pyproject.toml + architecture: x64 + + - name: Build packages + run: | + pip install -U pip build + python -m build --sdist --wheel + + - name: Upload to PyPI + uses: pypa/gh-action-pypi-publish@release/v1.8 + + - name: Build and push Docker image + uses: openzim/docker-publish-action@v10 + with: + image-name: openzim/gutenberg + tag-pattern: /^v([0-9.]+)$/ + latest-on-tag: true + restrict-to: openzim/gutenberg + registries: ghcr.io + credentials: + GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} + GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} + repo_description: auto + repo_overview: auto diff --git a/.github/workflows/docker.yml b/.github/workflows/PublishDockerDevImage.yaml similarity index 67% rename from .github/workflows/docker.yml rename to .github/workflows/PublishDockerDevImage.yaml index 753d639..5eab5ff 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/PublishDockerDevImage.yaml @@ -1,4 +1,4 @@ -name: Docker +name: Publish Docker dev image on: push: @@ -6,18 +6,18 @@ on: - main jobs: - build-and-push: - name: Deploy Docker Image + publish: runs-on: ubuntu-22.04 + steps: - - uses: actions/checkout@v3.4.0 - - name: Build and push + - uses: actions/checkout@v3 + + - name: Build and push Docker image uses: openzim/docker-publish-action@v10 with: image-name: openzim/gutenberg - on-master: dev - tag-pattern: /^v([0-9.]+)$/ - latest-on-tag: true + manual-tag: dev + latest-on-tag: false restrict-to: openzim/gutenberg registries: ghcr.io credentials: diff --git a/.github/workflows/QA.yaml b/.github/workflows/QA.yaml new file mode 100644 index 0000000..48ccee5 --- /dev/null +++ b/.github/workflows/QA.yaml @@ -0,0 +1,34 @@ +name: QA + +on: + pull_request: + push: + branches: + - main + +jobs: + check-qa: + runs-on: ubuntu-22.04 + + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version-file: pyproject.toml + architecture: x64 + + - name: Install dependencies (and project) + run: | + pip install -U pip + pip install -e .[lint,scripts,test,check] + + - name: Check black formatting + run: inv lint-black + + - name: Check ruff + run: inv lint-ruff + + - name: Check pyright + run: inv check-pyright diff --git a/.github/workflows/Tests.yaml b/.github/workflows/Tests.yaml new file mode 100644 index 0000000..b1979c7 --- /dev/null +++ b/.github/workflows/Tests.yaml @@ -0,0 +1,61 @@ +name: Tests + +on: + pull_request: + push: + branches: + - main + +jobs: + run-tests: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version-file: pyproject.toml + architecture: x64 + + - name: Install dependencies (and project) + run: | + pip install -U pip + pip install -e .[test,scripts] + + - name: Run the tests + run: inv coverage --args "-vvv" + + - name: Upload coverage report to codecov + uses: codecov/codecov-action@v3 + with: + token: ${{ secrets.CODECOV_TOKEN }} + + build_python: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v3 + + - name: 
Set up Python + uses: actions/setup-python@v4 + with: + python-version-file: pyproject.toml + architecture: x64 + + - name: Ensure we can build Python targets + run: | + pip install -U pip build + python3 -m build --sdist --wheel + + build_docker: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v3 + + - name: Ensure we can build the Docker image + run: | + docker build -t gutenberg . + + - name: Ensure we can start the Docker image + run: | + docker run --rm gutenberg diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml deleted file mode 100644 index 50d6d82..0000000 --- a/.github/workflows/release.yml +++ /dev/null @@ -1,49 +0,0 @@ -name: release -on: - release: - types: [published] - tags: - - v* - -env: - TWINE_USERNAME: __token__ - TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} - -jobs: - release: - environment: release - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3.3.0 - - name: Set up Python - uses: actions/setup-python@v4.5.0 - with: - python-version: "3.11" - architecture: x64 - - - name: Build sdist and wheel - run: | - pip install --upgrade setuptools pip wheel build - pip install -r requirements.pip - python3 -m build --no-isolation - - - name: Push release to PyPI - if: github.event_name == 'release' - run: | - pip install --upgrade twine - twine check dist/* - twine upload dist/* - - - name: Build and push - uses: openzim/docker-publish-action@v10 - with: - image-name: openzim/gutenberg - tag-pattern: /^v([0-9.]+)$/ - latest-on-tag: true - restrict-to: openzim/gutenberg - registries: ghcr.io - credentials: - GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} - GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} - repo_description: auto - repo_overview: auto diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..3b354ac --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,27 @@ +# See https://pre-commit.com for more information +# See https://pre-commit.com/hooks.html for more hooks +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer +- repo: https://github.com/psf/black + rev: "23.3.0" + hooks: + - id: black +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.0.272 + hooks: + - id: ruff +- repo: https://github.com/RobertCraigie/pyright-python + rev: v1.1.323 + hooks: + - id: pyright + name: pyright (system) + description: 'pyright static type checker' + entry: pyright + language: system + 'types_or': [python, pyi] + require_serial: true + minimum_pre_commit_version: '2.9.2' diff --git a/Dockerfile b/Dockerfile index 4d2b0b2..07e8a97 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,29 +1,40 @@ FROM python:3.11.4-bookworm # Install necessary packages -RUN apt-get update -y && \ - apt-get install -y --no-install-recommends advancecomp libxml2-dev libxslt-dev python3-pillow rsync libjpeg-dev libpng-dev libmagic1 locales jpegoptim pngquant gifsicle && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + advancecomp \ + libxml2-dev \ + libxslt-dev \ + python3-pillow \ + rsync \ + libjpeg-dev \ + libpng-dev \ + libmagic1 \ + locales \ + jpegoptim \ + pngquant \ + gifsicle \ + && rm -rf /var/lib/apt/lists/* \ + && python -m pip install --no-cache-dir -U \ + pip \ + && sed -i '/en_US.UTF-8/s/^# //g' /etc/locale.gen \ + && locale-gen "en_US.UTF-8" -# Install gutenberg (from source) -RUN sed -i '/en_US.UTF-8/s/^# //g' /etc/locale.gen && 
locale-gen "en_US.UTF-8" -COPY requirements.pip /src/ -RUN python3 -m pip install -r /src/requirements.pip -COPY LICENSE /src/ -COPY pypi-readme.rst /src/ -COPY MANIFEST.in /src/ -COPY setup.py /src/ -COPY get_js_deps.sh /src/ -COPY gutenberg2zim /src/ -COPY gutenbergtozim /src/gutenbergtozim -WORKDIR /src/ -RUN python3 ./setup.py install +# Copy code + associated artifacts +COPY src /src/src +COPY pyproject.toml *.md *.rst get_js_deps.sh LICENSE *.py /src/ -# Boot commands +# Install + cleanup +RUN pip install --no-cache-dir /src \ + && rm -rf /src + +# default output directory +RUN mkdir -p /output WORKDIR /output -ENV LANG=en_US.UTF-8 -ENV LANGUAGE=en_US:en -ENV LC_ALL=en_US.UTF-8 -CMD gutenberg2zim --help ; /bin/bash +ENV LANG=en_US.UTF-8 \ + LANGUAGE=en_US:en \ + LC_ALL=en_US.UTF-8 + +CMD ["gutenberg2zim", "--help"] \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 837f0c5..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,4 +0,0 @@ -include LICENSE -include pypi-readme.rst -include requirements.pip -recursive-include gutenbergtozim/templates * diff --git a/README.md b/README.md index d7f69d4..cd642be 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ storing content for offline usage. [![Docker](https://ghcr-badge.deta.dev/openzim/gutenberg/latest_tag?label=docker)](https://ghcr.io/openzim/gutenberg) [![CodeFactor](https://www.codefactor.io/repository/github/openzim/gutenberg/badge)](https://www.codefactor.io/repository/github/openzim/gutenberg) [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0) +[![codecov](https://codecov.io/gh/openzim/gutenberg/branch/main/graph/badge.svg)](https://codecov.io/gh/openzim/gutenberg) ## Coding guidelines Main coding guidelines comes from the [openZIM Wiki](https://github.com/openzim/overview/wiki) diff --git a/get_js_deps.sh b/get_js_deps.sh index 7a5ecab..2db5f12 100755 --- a/get_js_deps.sh +++ b/get_js_deps.sh @@ -14,7 +14,7 @@ fi # Absolute path this script is in. 
SCRIPT_PATH="$( cd "$(dirname "$0")" ; pwd -P )"
-ASSETS_PATH="${SCRIPT_PATH}/gutenbergtozim/templates"
+ASSETS_PATH="${SCRIPT_PATH}/src/gutenberg2zim/templates"
 
 echo "About to download JS assets to ${ASSETS_PATH}"
diff --git a/gutenbergtozim/__init__.py b/gutenbergtozim/__init__.py
deleted file mode 100644
index bb04394..0000000
--- a/gutenbergtozim/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-# vim: ai ts=4 sts=4 et sw=4 nu
-
-import logging
-import threading
-from pathlib import Path as path
-
-from zimscraperlib.logging import getLogger
-
-logger = getLogger(__name__, level=logging.DEBUG)
-
-TMP_FOLDER = "tmp"
-TMP_FOLDER_PATH = path(TMP_FOLDER)
-
-VERSION = "2.0.0"
-
-lock = threading.Lock()
-
-creator = None
diff --git a/hatch_build.py b/hatch_build.py
new file mode 100644
index 0000000..7b69608
--- /dev/null
+++ b/hatch_build.py
@@ -0,0 +1,37 @@
+import logging
+import subprocess
+from pathlib import Path
+
+from hatchling.builders.hooks.plugin.interface import BuildHookInterface
+
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger(__name__)
+
+# when modifying this list, update the list in constants.py as well
+JS_DEPS = [
+    "datatables/datatables.min.css",
+    "datatables/datatables.min.js",
+]
+
+
+class GetJsDepsHook(BuildHookInterface):
+    def initialize(self, version, build_data):
+        if self.deps_already_installed():
+            logger.info("JS dependencies are already installed, skipping it")
+            return
+        subprocess.run(
+            str(Path(self.root).joinpath("get_js_deps.sh")),
+            check=True,
+        )
+        return super().initialize(version, build_data)
+
+    def deps_already_installed(self) -> bool:
+        for dep in JS_DEPS:
+            if (
+                not Path(self.root)
+                .joinpath("src/gutenberg2zim/templates")
+                .joinpath(dep)
+                .exists()
+            ):
+                return False
+        return True
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..6cb351a
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,232 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "gutenberg2zim"
+authors = [{ name = "Kiwix", email = "dev@kiwix.org" }]
+keywords = ["kiwix", "zim", "offline", "gutenberg"]
+requires-python = ">=3.11"
+description = "Make ZIM file from Gutenberg books"
+readme = "pypi-readme.rst"
+license = { text = "GPL-3.0-or-later" }
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.11",
+    "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
+]
+dependencies = [
+    "beautifulsoup4==4.9.3",
+    "Jinja2==3.1.2",
+    "peewee==3.15.4",
+    "path.py==12.5.0",
+    "Babel==2.11.0",
+    "lxml==4.9.2",
+    "docopt==0.6.2",
+    "chardet==5.1.0",
+    "apsw==3.40.0.0",
+    "kiwixstorage==0.8.3",
+    "requests==2.31.0",
+    "pif==0.8.2",
+    "zimscraperlib==2.1.0",
+    "schedule==1.1.0",
+    "backoff==2.2.1",
+]
+dynamic = ["version"]
+
+[project.optional-dependencies]
+scripts = ["invoke==2.2.0"]
+lint = ["black==23.7.0", "ruff==0.0.280"]
+check = ["pyright==1.1.323"]
+test = ["pytest==7.4.0", "coverage==7.3.0"]
+dev = [
+    "pre-commit==3.3.3",
+    "debugpy==1.6.7",
+    "ipdb==0.13.13",
+    "ipython==8.14.0",
+    "gutenberg2zim[scripts]",
+    "gutenberg2zim[lint]",
+    "gutenberg2zim[test]",
+    "gutenberg2zim[check]",
+    # hatchling is a dev dependency only needed for hook development on developer machines
+    "hatchling==1.18.0",
+]
+
+[project.urls]
+Homepage = "https://github.com/openzim/gutenberg"
+Donate = "https://www.kiwix.org/en/support-us/"
+
+[project.scripts]
+gutenberg2zim = "gutenberg2zim.entrypoint:main"
+
+[tool.hatch.version]
+path = "src/gutenberg2zim/__about__.py" + +[tool.hatch.build] +exclude = ["/.github"] + +[tool.hatch.build.hooks.custom] +path = "hatch_build.py" +dependencies = ["zimscraperlib==3.1.1"] + +[tool.hatch.envs.default] +features = ["dev"] + +[tool.hatch.envs.test] +features = ["scripts", "test"] + +[tool.hatch.envs.test.scripts] +run = "inv test --args '{args}'" +run-cov = "inv test-cov --args '{args}'" +report-cov = "inv report-cov" +coverage = "inv coverage --args '{args}'" +html = "inv coverage --html --args '{args}'" + +[tool.hatch.envs.lint] +template = "lint" +python = "py311" +skip-install = false +features = ["scripts", "lint"] + +[tool.hatch.envs.lint.scripts] +black = "inv lint-black --args '{args}'" +ruff = "inv lint-ruff --args '{args}'" +all = "inv lintall --args '{args}'" +fix-black = "inv fix-black --args '{args}'" +fix-ruff = "inv fix-ruff --args '{args}'" +fixall = "inv fixall --args '{args}'" + +[tool.hatch.envs.check] +features = ["scripts", "check"] + +[tool.hatch.envs.check.scripts] +pyright = "inv check-pyright --args '{args}'" +all = "inv checkall --args '{args}'" + +[tool.black] +line-length = 88 +target-version = ['py311'] + +[tool.ruff] +target-version = "py311" +line-length = 88 +src = ["src"] +select = [ + "A", # flake8-builtins + # "ANN", # flake8-annotations + "ARG", # flake8-unused-arguments + # "ASYNC", # flake8-async + "B", # flake8-bugbear + # "BLE", # flake8-blind-except + "C4", # flake8-comprehensions + "C90", # mccabe + # "COM", # flake8-commas + # "D", # pydocstyle + # "DJ", # flake8-django + "DTZ", # flake8-datetimez + "E", # pycodestyle (default) + "EM", # flake8-errmsg + # "ERA", # eradicate + # "EXE", # flake8-executable + "F", # Pyflakes (default) + # "FA", # flake8-future-annotations + "FBT", # flake8-boolean-trap + # "FLY", # flynt + # "G", # flake8-logging-format + "I", # isort + "ICN", # flake8-import-conventions + # "INP", # flake8-no-pep420 + # "INT", # flake8-gettext + "ISC", # flake8-implicit-str-concat + "N", # pep8-naming + # "NPY", # NumPy-specific rules + # "PD", # pandas-vet + # "PGH", # pygrep-hooks + # "PIE", # flake8-pie + # "PL", # Pylint + "PLC", # Pylint: Convention + "PLE", # Pylint: Error + "PLR", # Pylint: Refactor + "PLW", # Pylint: Warning + # "PT", # flake8-pytest-style + # "PTH", # flake8-use-pathlib + # "PYI", # flake8-pyi + "Q", # flake8-quotes + # "RET", # flake8-return + # "RSE", # flake8-raise + "RUF", # Ruff-specific rules + "S", # flake8-bandit + # "SIM", # flake8-simplify + # "SLF", # flake8-self + "T10", # flake8-debugger + "T20", # flake8-print + # "TCH", # flake8-type-checking + # "TD", # flake8-todos + "TID", # flake8-tidy-imports + # "TRY", # tryceratops + "UP", # pyupgrade + "W", # pycodestyle + "YTT", # flake8-2020 +] +ignore = [ + # Allow non-abstract empty methods in abstract base classes + "B027", + # Allow use of date.today + "DTZ011", + # Remove flake8-errmsg since we consider they bloat the code and provide limited value + "EM", + # Allow boolean positional values in function calls, like `dict.get(... 
True)` + "FBT003", + # Ignore checks for possible passwords + "S105", + "S106", + "S107", + # Ignore warnings on subprocess.run / popen + "S603", + # Ignore complexity + "C901", + "PLR0911", + "PLR0912", + "PLR0913", + "PLR0915", +] +unfixable = [ + # Don't touch unused imports + "F401", +] + +[tool.ruff.isort] +known-first-party = ["gutenberg2zim"] + +[tool.ruff.flake8-tidy-imports] +ban-relative-imports = "all" + +[tool.ruff.per-file-ignores] +# Tests can use magic values, assertions, and relative imports +"tests/**/*" = ["PLR2004", "S101", "TID252"] + +[tool.pytest.ini_options] +minversion = "7.3" +testpaths = ["tests"] +pythonpath = [".", "src"] + +[tool.coverage.paths] +great_project = ["src/gutenberg2zim"] +tests = ["tests"] + +[tool.coverage.run] +source_pkgs = ["gutenberg2zim"] +branch = true +parallel = true +omit = ["src/gutenberg2zim/__about__.py"] + +[tool.coverage.report] +exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"] + +[tool.pyright] +include = ["src", "tests", "tasks.py"] +exclude = ["**/node_modules", "**/__pycache__", "src/gutenberg2zim/templates"] +extraPaths = ["src"] +pythonVersion = "3.11" +pythonPlatform = "All" +typeCheckingMode = "basic" diff --git a/requirements.pip b/requirements.pip deleted file mode 100644 index 90fbffc..0000000 --- a/requirements.pip +++ /dev/null @@ -1,17 +0,0 @@ -beautifulsoup4==4.9.3 -Jinja2==3.1.2 -ipdb==0.13.11 -ipython==8.8.0 -peewee==3.15.4 -path.py==12.5.0 -Babel==2.11.0 -lxml==4.9.2 -docopt==0.6.2 -chardet==5.1.0 -apsw==3.40.0.0 -kiwixstorage>=0.5,<1.0 -requests>=2.23,<3.0 -pif==0.8.2 -zimscraperlib>=2.1,<2.2 -schedule>=1.1.0,<1.2 -backoff==2.2.1 diff --git a/setup.py b/setup.py deleted file mode 100755 index a7bd296..0000000 --- a/setup.py +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# vim: ai ts=4 sts=4 et sw=4 nu - -""" Project Gutemberg ZIM creator for Offline Use """ - -import pathlib - -import subprocess - -from codecs import open - -from setuptools import setup, find_packages - -from gutenbergtozim import VERSION - -root_dir = pathlib.Path(__file__).parent - -print("Getting JS dependencies...") -subprocess.run([str(root_dir.joinpath("get_js_deps.sh").resolve())], check=True) - -with open("pypi-readme.rst", "r", "utf-8") as f: - readme = f.read() - -with open("requirements.pip", "r") as f: - requirements = [l.strip() for l in f.readlines() if len(l.strip())] - -setup( - name="gutenberg2zim", - version=VERSION, - description=__doc__, - long_description=readme, - author="Kiwix", - author_email="reg@kiwix.org", - url="http://github.com/openzim/gutenberg", - keywords="gutenberg zim kiwix openzim offline", - license="GPL-3.0", - packages=find_packages("."), - zip_safe=False, - platforms="any", - include_package_data=True, - data_files=["pypi-readme.rst", "LICENSE", "requirements.pip"], - package_dir={"gutenberg": "gutenberg"}, - install_requires=requirements, - scripts=["gutenberg2zim"], - classifiers=[ - "Intended Audience :: Developers", - "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", - "Programming Language :: Python", - "Programming Language :: Python :: 2.7", - "Programming Language :: Python :: 3.6", - ], -) diff --git a/gutenbergtozim/.jsbeautifyrc b/src/gutenberg2zim/.jsbeautifyrc similarity index 100% rename from gutenbergtozim/.jsbeautifyrc rename to src/gutenberg2zim/.jsbeautifyrc diff --git a/src/gutenberg2zim/__about__.py b/src/gutenberg2zim/__about__.py new file mode 100644 index 0000000..e3a24a5 --- /dev/null +++ 
b/src/gutenberg2zim/__about__.py
@@ -0,0 +1 @@
+__version__ = "2.1.0-dev0"
diff --git a/src/gutenberg2zim/__init__.py b/src/gutenberg2zim/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/gutenberg2zim/__main__.py b/src/gutenberg2zim/__main__.py
new file mode 100644
index 0000000..b119379
--- /dev/null
+++ b/src/gutenberg2zim/__main__.py
@@ -0,0 +1,4 @@
+from gutenberg2zim.entrypoint import main
+
+if __name__ == "__main__":
+    main()
diff --git a/gutenbergtozim/checkdeps.py b/src/gutenberg2zim/checkdeps.py
similarity index 78%
rename from gutenbergtozim/checkdeps.py
rename to src/gutenberg2zim/checkdeps.py
index 369ee61..c44d510 100644
--- a/gutenbergtozim/checkdeps.py
+++ b/src/gutenberg2zim/checkdeps.py
@@ -1,10 +1,6 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-# vim: ai ts=4 sts=4 et sw=4 nu
-
 import subprocess
 
-from gutenbergtozim import logger
+from gutenberg2zim.constants import logger
 
 
 def check_dependencies():
@@ -34,9 +30,9 @@ def bin_is_present(binary):
     all_good = True
     has_zimwriter = True
 
-    for bin, msg in all_bins.items():
-        if not bin_is_present(bin):
-            logger.error("\t*{}* binary missing. {}".format(bin, msg))
+    for binary, msg in all_bins.items():
+        if not bin_is_present(binary):
+            logger.error(f"\t*{binary}* binary missing. {msg}")
             all_good = False
 
     return all_good, has_zimwriter
diff --git a/src/gutenberg2zim/constants.py b/src/gutenberg2zim/constants.py
new file mode 100644
index 0000000..259ec3c
--- /dev/null
+++ b/src/gutenberg2zim/constants.py
@@ -0,0 +1,24 @@
+import logging
+import pathlib
+
+from zimscraperlib.logging import getLogger
+
+from gutenberg2zim.__about__ import __version__
+
+ROOT_DIR = pathlib.Path(__file__).parent
+NAME = ROOT_DIR.name
+
+VERSION = __version__
+
+SCRAPER = f"{NAME} {VERSION}"
+
+# when modifying this list, update the list in hatch_build.py as well
+JS_DEPS: list[str] = [
+    "datatables/datatables.min.css",
+    "datatables/datatables.min.js",
+]
+
+logger = getLogger(__name__, level=logging.DEBUG)
+
+TMP_FOLDER = "tmp"
+TMP_FOLDER_PATH = pathlib.Path(TMP_FOLDER)
diff --git a/gutenbergtozim/database.py b/src/gutenberg2zim/database.py
similarity index 82%
rename from gutenbergtozim/database.py
rename to src/gutenberg2zim/database.py
index 4d8368f..fa68bb1 100644
--- a/gutenbergtozim/database.py
+++ b/src/gutenberg2zim/database.py
@@ -1,10 +1,7 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-# vim: ai ts=4 sts=4 et sw=4 nu
-
 from peewee import (
     BooleanField,
     CharField,
+    DoesNotExist,
     ForeignKeyField,
     IntegerField,
     Model,
@@ -12,7 +9,7 @@
 )
 from playhouse.apsw_ext import APSWDatabase
 
-from gutenbergtozim import logger
+from gutenberg2zim.constants import logger
 
 timeout = 10
 db = APSWDatabase(
@@ -33,14 +30,14 @@ class BaseModel(Model):
     def get_or_none(cls, *query, **kwargs):
         try:
             return cls.get(*query, **kwargs)
-        except cls.DoesNotExist:
+        except DoesNotExist:
             return None
 
 
 class License(BaseModel):
     class Meta:
         database = db
 
-    fixtures = [
+    fixtures = (
         {"slug": "PD", "name": "Public domain in the USA."},
         {"slug": "None", "name": "None"},
         {
@@ -49,19 +46,19 @@ class Meta:
             "notice inside this book "
             "for details.",
         },
-    ]
+    )
 
     slug = CharField(max_length=20, primary_key=True)
     name = CharField()
 
-    def __unicode__(self):
+    def __str__(self):
         return self.name
 
 
 class Author(BaseModel):
     class Meta:
         database = db
 
-    fixtures = [
+    fixtures = (
         {
             "gut_id": "116",
             "last_name": "Various",
         },
@@ -70,7 +67,7 @@ class Meta:
             "gut_id": "216",
             "last_name": "Anonymous",
         },
-    ]
+    )
 
     gut_id = CharField(primary_key=True, max_length=100)
last_name = CharField(max_length=150) @@ -78,11 +75,11 @@ class Meta: birth_year = CharField(max_length=10, null=True) death_year = CharField(max_length=10, null=True) - def __unicode__(self): + def __str__(self): return self.name() def fname(self): - return "{name}.{id}".format(name=self.name(), id=self.gut_id) + return f"{self.name()}.{self.gut_id}" def name(self): def sanitize(text): @@ -97,7 +94,7 @@ def sanitize(text): if not self.last_name: return sanitize(self.first_names) - return sanitize("{fn} {ln}".format(ln=self.last_name, fn=self.first_names)) + return sanitize(f"{self.first_names} {self.last_name}") def to_dict(self): return { @@ -124,11 +121,11 @@ class Book(BaseModel): class Meta: database = db - id = IntegerField(primary_key=True) + id = IntegerField(primary_key=True) # noqa: A003 title = CharField(max_length=500) subtitle = CharField(max_length=500, null=True) author = ForeignKeyField(Author, related_name="books") - license = ForeignKeyField(License, related_name="books") + license = ForeignKeyField(License, related_name="books") # noqa: A003 language = CharField(max_length=10) downloads = IntegerField(default=0) bookshelf = CharField(max_length=500, null=True) @@ -138,8 +135,8 @@ class Meta: epub_etag = CharField(max_length=500, null=True) cover_etag = CharField(max_length=500, null=True) - def __unicode__(self): - return "{}/{}/{}".format(self.id, self.title, self.bookshelf) + def __str__(self): + return f"{self.id}/{self.title}/{self.bookshelf}" def to_dict(self): return { @@ -172,7 +169,7 @@ def to_array(self, all_requested_formats): ] def formats(self): - from gutenbergtozim.utils import main_formats_for + from gutenberg2zim.utils import main_formats_for return main_formats_for(self) @@ -192,8 +189,9 @@ class Meta: pattern = CharField(max_length=100) downloaded_from = CharField(max_length=300, null=True) - def __unicode__(self): - return "[{}] {}".format(self.mime, self.book.title) + def __str__(self): + return f"[{self.mime}] {self.book.title}" + class Url(BaseModel): class Meta: @@ -201,19 +199,19 @@ class Meta: url = TextField(index=True) - def __unicode__(self): + def __str__(self): return self.url def load_fixtures(model): - logger.info("Loading fixtures for {}".format(model._meta.name)) + logger.info(f"Loading fixtures for {model._meta.name}") for fixture in getattr(model._meta, "fixtures", []): f = model.create(**fixture) - logger.debug("[fixtures] Created {}".format(f)) + logger.debug(f"[fixtures] Created {f}") -def setup_database(wipe=False): +def setup_database(*, wipe=False): logger.info("Setting up the database") for model in (License, Author, Book, BookFormat, Url): @@ -221,7 +219,7 @@ def setup_database(wipe=False): model.drop_table(fail_silently=True) if not model.table_exists(): model.create_table() - logger.debug("Created table for {}".format(model._meta.name)) + logger.debug(f"Created table for {model._meta.name}") # type: ignore load_fixtures(model) else: - logger.debug("{} table already exists.".format(model._meta.name)) + logger.debug(f"{model._meta.name} table already exists.") # type: ignore diff --git a/gutenbergtozim/download.py b/src/gutenberg2zim/download.py similarity index 78% rename from gutenbergtozim/download.py rename to src/gutenberg2zim/download.py index 6907e2a..3a38c89 100644 --- a/gutenbergtozim/download.py +++ b/src/gutenberg2zim/download.py @@ -1,7 +1,3 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# vim: ai ts=4 sts=4 et sw=4 nu - import os import pathlib import shutil @@ -12,14 +8,15 @@ import apsw import backoff -from 
path import Path as path - -from gutenbergtozim import TMP_FOLDER, logger -from gutenbergtozim.database import Book, BookFormat -from gutenbergtozim.export import fname_for, get_list_of_filtered_books -from gutenbergtozim.s3 import download_from_cache -from gutenbergtozim.urls import get_urls -from gutenbergtozim.utils import ( +from kiwixstorage import KiwixStorage +from path import Path + +from gutenberg2zim.constants import TMP_FOLDER, logger +from gutenberg2zim.database import Book, BookFormat +from gutenberg2zim.export import fname_for, get_list_of_filtered_books +from gutenberg2zim.s3 import download_from_cache +from gutenberg2zim.urls import get_urls +from gutenberg2zim.utils import ( FORMAT_MATRIX, archive_name_for, download_file, @@ -39,15 +36,15 @@ # return False -def handle_zipped_epub(zippath, book, dst_dir): +def handle_zipped_epub(zippath, book, dst_dir: pathlib.Path): def clfn(fn): return os.path.join(*os.path.split(fn)[1:]) def is_safe(fname): fname = ensure_unicode(clfn(fname)) - if path(fname).basename() == fname: + if Path(fname).basename() == fname: return True - return fname == os.path.join("images", path(fname).splitpath()[-1]) + return fname == os.path.join("images", Path(fname).splitpath()[-1]) zipped_files = [] # create temp directory to extract to @@ -56,7 +53,7 @@ def is_safe(fname): with zipfile.ZipFile(zippath, "r") as zf: # check that there is no insecure data (absolute names) if sum([1 for n in zf.namelist() if not is_safe(ensure_unicode(n))]): - path(tmpd).rmtree_p() + Path(tmpd).rmtree_p() return False # zipped_files = [clfn(fn) for fn in zf.namelist()] zipped_files = zf.namelist() @@ -74,18 +71,18 @@ def is_safe(fname): sum([1 for f in zipped_files if f.endswith("html") or f.endswith(".htm")]) > 1 ) # move all extracted files to proper locations - for fname in zipped_files: + for zipped_file in zipped_files: # skip folders - if not path(fname).ext: + if not Path(zipped_file).ext: continue - src = os.path.join(tmpd, fname) + src = os.path.join(tmpd, zipped_file) if os.path.exists(src): - fname = path(fname).basename() + fname = Path(zipped_file).basename() if fname.endswith(".html") or fname.endswith(".htm"): if mhtml: - if fname.startswith("{}-h.".format(book.id)): + if fname.startswith(f"{book.id}-h."): dst = dst_dir.joinpath(f"{book.id}.html") else: dst = dst_dir.joinpath(f"{book.id}_{fname}") @@ -94,29 +91,34 @@ def is_safe(fname): else: dst = dst_dir.joinpath(f"{book.id}_{fname}") try: - path(src).move(dst) + Path(src).move(str(dst)) except Exception as e: import traceback - print(e) - print("".join(traceback.format_exc())) + print(e) # noqa: T201 + print("".join(traceback.format_exc())) # noqa: T201 raise - # import ipdb; ipdb.set_trace() # delete temp directory and zipfile - if path(zippath).exists(): + if Path(zippath).exists(): os.unlink(zippath) - path(tmpd).rmtree_p() + Path(tmpd).rmtree_p() def download_book( - book, download_cache, languages, formats, force, s3_storage, optimizer_version + book: Book, + download_cache: str, + formats: list[str], + *, + force: bool, + s3_storage: KiwixStorage | None, + optimizer_version: dict[str, str] | None, ): - logger.info("\tDownloading content files for Book #{id}".format(id=book.id)) + logger.info(f"\tDownloading content files for Book #{book.id}") # apply filters if not formats: - formats = FORMAT_MATRIX.keys() + formats = list(FORMAT_MATRIX.keys()) # HTML is our base for ZIM for add it if not present if "html" not in formats: @@ -127,7 +129,6 @@ def download_book( unoptimized_dir = 
book_dir.joinpath("unoptimized") unsuccessful_formats = [] for book_format in formats: - unoptimized_fpath = unoptimized_dir.joinpath(fname_for(book, book_format)) optimized_fpath = optimized_dir.joinpath(archive_name_for(book, book_format)) @@ -185,53 +186,37 @@ def download_book( bfso = bfs bfs = bfs.filter(BookFormat.pattern << patterns) if not bfs.count(): - pp( - list( - [ - (bf.mime, bf.images, bf.pattern) - for bf in bfs - ] - ) - ) - pp( - list( - [ - (bf.mime, bf.images, bf.pattern) - for bf in bfso - ] - ) - ) + pp([(bf.mime, bf.images, bf.pattern) for bf in bfs]) # noqa: T203 + pp([(bf.mime, bf.images, bf.pattern) for bf in bfso]) # noqa: T203 logger.error("html not found") unsuccessful_formats.append(book_format) continue else: - bfs = bfs.filter(mime=FORMAT_MATRIX.get(book_format)) + bfs = bfs.filter(mime=FORMAT_MATRIX.get(book_format)) # type: ignore if not bfs.count(): - logger.debug( - "[{}] not avail. for #{}# {}".format(book_format, book.id, book.title) - ) + logger.debug(f"[{book_format}] not avail. for #{book.id}# {book.title}") unsuccessful_formats.append(book_format) continue if bfs.count() > 1: try: - bf = bfs.filter(images).get() + bf = bfs.filter(bfs.images).get() except Exception: bf = bfs.get() else: bf = bfs.get() - logger.debug( - "[{}] Requesting URLs for #{}# {}".format(book_format, book.id, book.title) - ) + logger.debug(f"[{book_format}] Requesting URLs for #{book.id}# {book.title}") # retrieve list of URLs for format unless we have it in DB if bf.downloaded_from and not force: urls = [bf.downloaded_from] else: urld = get_urls(book) - urls = list(reversed(urld.get(FORMAT_MATRIX.get(book_format)))) + urls = list( + reversed(urld.get(FORMAT_MATRIX.get(book_format))) # type: ignore + ) import copy @@ -263,10 +248,10 @@ def download_book( downloaded_from_cache = True break if not download_file(url, zpath): - logger.error("ZIP file download failed: {}".format(zpath)) + logger.error(f"ZIP file download failed: {zpath}") continue # save etag - book.html_etag = etag + book.html_etag = etag # type: ignore book.save() # extract zipfile handle_zipped_epub(zippath=zpath, book=book, dst_dir=unoptimized_dir) @@ -293,7 +278,7 @@ def download_book( downloaded_from_cache = True break if not download_file(url, unoptimized_fpath): - logger.error("file donwload failed: {}".format(unoptimized_fpath)) + logger.error(f"file donwload failed: {unoptimized_fpath}") continue # save etag if html or epub if download is successful if ( @@ -302,11 +287,11 @@ def download_book( or url.endswith(".html.utf8") ): logger.debug(f"Saving html ETag for {book.id}") - book.html_etag = etag + book.html_etag = etag # type: ignore book.save() elif url.endswith(".epub"): logger.debug(f"Saving epub ETag for {book.id}") - book.epub_etag = etag + book.epub_etag = etag # type: ignore book.save() # store working URL in DB @@ -316,18 +301,18 @@ def download_book( break if not bf.downloaded_from and not downloaded_from_cache: - logger.error("NO FILE FOR #{}/{}".format(book.id, book_format)) + logger.error(f"NO FILE FOR #{book.id}/{book_format}") # delete instance from DB if download failed logger.info("Deleting instance from DB") bf.delete_instance() unsuccessful_formats.append(book_format) - pp(allurls) + pp(allurls) # noqa: T203 # delete book from DB if not downloaded in any format if len(unsuccessful_formats) == len(formats): logger.debug( f"Book #{book.id} could not be downloaded in any format. " - + "Deleting from DB ..." + "Deleting from DB ..." 
) book.delete_instance() if book_dir.exists(): @@ -340,10 +325,10 @@ def download_cover(book, book_dir, s3_storage, optimizer_version): has_cover = Book.select(Book.cover_page).where(Book.id == book.id) if has_cover: # try to download optimized cover from cache if s3_storage - url = "{}{}/pg{}.cover.medium.jpg".format(IMAGE_BASE, book.id, book.id) + url = f"{IMAGE_BASE}{book.id}/pg{book.id}.cover.medium.jpg" etag = get_etag_from_url(url) downloaded_from_cache = False - cover = "{}_cover_image.jpg".format(book.id) + cover = f"{book.id}_cover_image.jpg" if ( book_dir.joinpath("optimized").joinpath(cover).exists() or book_dir.joinpath("unoptimized").joinpath(cover).exists() @@ -363,30 +348,31 @@ def download_cover(book, book_dir, s3_storage, optimizer_version): optimizer_version=optimizer_version, ) if not downloaded_from_cache: - logger.debug("Downloading {}".format(url)) + logger.debug(f"Downloading {url}") if download_file(url, book_dir.joinpath("unoptimized").joinpath(cover)): book.cover_etag = etag book.save() else: - logger.debug("No Book Cover found for Book #{}".format(book.id)) + logger.debug(f"No Book Cover found for Book #{book.id}") def download_all_books( - download_cache, - concurrency, - languages=[], - formats=[], - only_books=[], - force=False, - s3_storage=None, - optimizer_version=None, + download_cache: str, + concurrency: int, + languages: list[str], + formats: list[str], + only_books: list[str], + *, + force: bool, + s3_storage: KiwixStorage | None, + optimizer_version: dict[str, str] | None, ): available_books = get_list_of_filtered_books( languages=languages, formats=formats, only_books=only_books ) # ensure dir exist - path(download_cache).mkdir_p() + Path(download_cache).mkdir_p() def backoff_busy_error_hdlr(details): logger.warning( @@ -403,7 +389,12 @@ def backoff_busy_error_hdlr(details): ) def dlb(b): return download_book( - b, download_cache, languages, formats, force, s3_storage, optimizer_version + book=b, + download_cache=download_cache, + formats=formats, + force=force, + s3_storage=s3_storage, + optimizer_version=optimizer_version, ) Pool(concurrency).map(dlb, available_books) diff --git a/gutenberg2zim b/src/gutenberg2zim/entrypoint.py similarity index 53% rename from gutenberg2zim rename to src/gutenberg2zim/entrypoint.py index bb28621..eb111bd 100755 --- a/gutenberg2zim +++ b/src/gutenberg2zim/entrypoint.py @@ -1,23 +1,19 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# vim: ai ts=4 sts=4 et sw=4 nu - import os import sys from docopt import docopt -from path import Path as path - -from gutenbergtozim import VERSION, logger -from gutenbergtozim.checkdeps import check_dependencies -from gutenbergtozim.database import setup_database -from gutenbergtozim.download import download_all_books -from gutenbergtozim.rdf import parse_and_fill, download_rdf_file, get_rdf_fpath -from gutenbergtozim.s3 import s3_credentials_ok -from gutenbergtozim.urls import setup_urls -from gutenbergtozim.zim import build_zimfile - -help = ( +from path import Path + +from gutenberg2zim.checkdeps import check_dependencies +from gutenberg2zim.constants import TMP_FOLDER_PATH, VERSION, logger +from gutenberg2zim.database import setup_database +from gutenberg2zim.download import download_all_books +from gutenberg2zim.rdf import download_rdf_file, get_rdf_fpath, parse_and_fill +from gutenberg2zim.s3 import s3_credentials_ok +from gutenberg2zim.urls import setup_urls +from gutenberg2zim.zim import build_zimfile + +help_info = ( """Usage: gutenberg2zim [-y] [-F] [-l LANGS] [-f FORMATS] 
""" """[-d CACHE_PATH] [-e STATIC_PATH] """ """[-z ZIM_PATH] [-u RDF_URL] [-b BOOKS] """ @@ -73,92 +69,93 @@ ) -def main(arguments): +def main(): + arguments = docopt(help_info, version=VERSION) + # optimizer version to use - OPTIMIZER_VERSION = {"html": "v1", "epub": "v1", "cover": "v1"} + optimizer_version = {"html": "v1", "epub": "v1", "cover": "v1"} # actions constants - DO_PREPARE = arguments.get("--prepare", False) - DO_PARSE = arguments.get("--parse", False) - DO_DOWNLOAD = arguments.get("--download", False) - DO_ZIM = arguments.get("--zim", False) - DO_CHECKDEPS = arguments.get("--check", False) - ONE_LANG_ONE_ZIM_FOLDER = arguments.get("--one-language-one-zim") or None - COMPLETE_DUMP = arguments.get("--complete", False) - - ZIM_NAME = arguments.get("--zim-file") - WIPE_DB = arguments.get("--wipe-db") or False - RDF_URL = ( + do_prepare = arguments.get("--prepare", False) + do_parse = arguments.get("--parse", False) + do_download = arguments.get("--download", False) + do_zim = arguments.get("--zim", False) + do_checkdeps = arguments.get("--check", False) + one_lang_one_zim_folder = arguments.get("--one-language-one-zim") or None + complete_dump = arguments.get("--complete", False) + + zim_name = arguments.get("--zim-file") + wipe_db = arguments.get("--wipe-db") or False + rdf_url = ( arguments.get("--rdf-url") or "http://www.gutenberg.org/cache/epub/feeds/rdf-files.tar.bz2" ) - DL_CACHE = arguments.get("--dl-folder") or os.path.join("dl-cache") - BOOKS = arguments.get("--books") or "" - ZIM_TITLE = arguments.get("--zim-title") - ZIM_DESC = arguments.get("--zim-desc") - CONCURRENCY = int(arguments.get("--concurrency") or 16) - DL_CONCURRENCY = int(arguments.get("--dlc") or CONCURRENCY) - FORCE = arguments.get("--force", False) - TITLE_SEARCH = arguments.get("--title-search", False) - BOOKSHELVES = arguments.get("--bookshelves", False) - OPTIMIZATION_CACHE = arguments.get("--optimization-cache") or None - USE_ANY_OPTIMIZED_VERSION = arguments.get("--use-any-optimized-version", False) - STATS_FILENAME = arguments.get("--stats-filename") or None + dl_cache = arguments.get("--dl-folder") or os.path.join("dl-cache") + books_csv = arguments.get("--books") or "" + zim_title = arguments.get("--zim-title") + zim_desc = arguments.get("--zim-desc") + concurrency = int(arguments.get("--concurrency") or 16) + dl_concurrency = int(arguments.get("--dlc") or concurrency) + force = arguments.get("--force", False) + title_search = arguments.get("--title-search", False) + bookshelves = arguments.get("--bookshelves", False) + optimization_cache = arguments.get("--optimization-cache") or None + use_any_optimized_version = arguments.get("--use-any-optimized-version", False) + stats_filename = arguments.get("--stats-filename") or None s3_storage = None - if OPTIMIZATION_CACHE: - s3_storage = s3_credentials_ok(OPTIMIZATION_CACHE) + if optimization_cache: + s3_storage = s3_credentials_ok(optimization_cache) if not s3_storage: raise ValueError("Unable to connect to Optimization Cache. Check its URL.") logger.info("S3 Credentials OK. Continuing ... 
") # create tmp dir - path("tmp").mkdir_p() + TMP_FOLDER_PATH.mkdir(parents=True) - LANGUAGES = [ + languages = [ x.strip().lower() for x in (arguments.get("--languages") or "").split(",") if x.strip() ] # special shortcuts for "all" + formats: list[str] if arguments.get("--formats") in ["all", None]: - FORMATS = ["epub", "pdf", "html"] + formats = ["epub", "pdf", "html"] else: - FORMATS = list( - set( - [ - x.strip().lower() - for x in (arguments.get("--formats") or "").split(",") - if x.strip() - ] - ) + formats = list( + { + x.strip().lower() + for x in (arguments.get("--formats") or "").split(",") + if x.strip() + } ) + books = [] try: - BOOKS = [bid for bid in BOOKS.split(",")] + books_csv = books_csv.split(",") def f(x): return list(map(int, [i for i in x.split("-") if i.isdigit()])) - books = [] - for i in BOOKS: + for i in books_csv: blst = f(i) if len(blst) > 1: blst = range(blst[0], blst[1] + 1) books.extend(blst) - BOOKS = list(set(books)) + books_csv = list(set(books)) except Exception as e: logger.error(e) - BOOKS = [] + books_csv = [] # no arguments, default to --complete - if not (DO_PREPARE + DO_PARSE + DO_DOWNLOAD + DO_ZIM): - COMPLETE_DUMP = True + if not (do_prepare + do_parse + do_download + do_zim): + complete_dump = True - if COMPLETE_DUMP: - DO_CHECKDEPS = DO_PREPARE = DO_PARSE = DO_DOWNLOAD = DO_ZIM = True + if complete_dump: + do_checkdeps = do_prepare = do_parse = do_download = do_zim = True - if DO_CHECKDEPS: + if do_checkdeps: logger.info("CHECKING for dependencies on the system") if not check_dependencies()[0]: logger.error("Exiting...") @@ -166,71 +163,65 @@ def f(x): rdf_path = get_rdf_fpath() - if DO_PREPARE: - logger.info("PREPARING rdf-files cache from {}".format(RDF_URL)) - download_rdf_file(rdf_url=RDF_URL, rdf_path=rdf_path) + if do_prepare: + logger.info(f"PREPARING rdf-files cache from {rdf_url}") + download_rdf_file(rdf_url=rdf_url, rdf_path=rdf_path) - if WIPE_DB: + if wipe_db: logger.info("RESETING DATABASE IF EXISTS") logger.info("SETTING UP DATABASE") - setup_database(wipe=WIPE_DB) + setup_database(wipe=wipe_db) - if DO_PARSE: - logger.info("PARSING rdf-files in {}".format(rdf_path)) - parse_and_fill(rdf_path=rdf_path, only_books=BOOKS, force=FORCE) + if do_parse: + logger.info(f"PARSING rdf-files in {rdf_path}") + parse_and_fill(rdf_path=rdf_path, only_books=books) logger.info("Add possible url to db") - setup_urls(force=FORCE) + setup_urls(force=force) - if DO_DOWNLOAD: + if do_download: logger.info("DOWNLOADING ebooks from mirror using filters") download_all_books( - download_cache=DL_CACHE, - concurrency=DL_CONCURRENCY, - languages=LANGUAGES, - formats=FORMATS, - only_books=BOOKS, - force=FORCE, + download_cache=dl_cache, + concurrency=dl_concurrency, + languages=languages, + formats=formats, + only_books=books, + force=force, s3_storage=s3_storage, - optimizer_version=OPTIMIZER_VERSION - if not USE_ANY_OPTIMIZED_VERSION + optimizer_version=optimizer_version + if not use_any_optimized_version else None, ) - if ONE_LANG_ONE_ZIM_FOLDER: - if LANGUAGES == []: + if one_lang_one_zim_folder: + if languages == []: zims = [] - from gutenbergtozim.database import Book + from gutenberg2zim.database import Book for book in Book.select(Book.language).distinct(): zims.append([book.language]) zims.append([]) else: - zims = [[lang] for lang in LANGUAGES] + [LANGUAGES] + zims = [[lang] for lang in languages] + [languages] else: - zims = [LANGUAGES] + zims = [languages] for zim_lang in zims: - - if DO_ZIM: - + if do_zim: logger.info("BUILDING ZIM 
dynamically") build_zimfile( - output_folder=path(ONE_LANG_ONE_ZIM_FOLDER or ".").abspath(), - download_cache=DL_CACHE, - concurrency=CONCURRENCY, + output_folder=Path(one_lang_one_zim_folder or ".").abspath(), + download_cache=dl_cache, + concurrency=concurrency, languages=zim_lang, - formats=FORMATS, - only_books=BOOKS, - force=FORCE, - title_search=TITLE_SEARCH, - add_bookshelves=BOOKSHELVES, + formats=formats, + only_books=books, + force=force, + title_search=title_search, + add_bookshelves=bookshelves, s3_storage=s3_storage, - optimizer_version=OPTIMIZER_VERSION, - zim_name=path(ZIM_NAME).name if ZIM_NAME else None, - title=ZIM_TITLE, - description=ZIM_DESC, - stats_filename=STATS_FILENAME, + optimizer_version=optimizer_version, + zim_name=Path(zim_name).name if zim_name else None, + title=zim_title, + description=zim_desc, + stats_filename=stats_filename, ) - - -if __name__ == "__main__": - main(docopt(help, version=VERSION)) diff --git a/gutenbergtozim/export.py b/src/gutenberg2zim/export.py similarity index 80% rename from gutenbergtozim/export.py rename to src/gutenberg2zim/export.py index bb8322d..20b0e63 100644 --- a/gutenbergtozim/export.py +++ b/src/gutenberg2zim/export.py @@ -1,34 +1,29 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# vim: ai ts=4 sts=4 et sw=4 nu - import json import os import pathlib import shutil import tempfile import traceback -import urllib +import urllib.parse import zipfile from multiprocessing.dummy import Pool import bs4 -import six from bs4 import BeautifulSoup from jinja2 import Environment, PackageLoader -from path import Path as path +from path import Path from schedule import every from six import text_type from zimscraperlib.image.transformation import resize_image -import gutenbergtozim -from gutenbergtozim import TMP_FOLDER, TMP_FOLDER_PATH, logger -from gutenbergtozim.database import Author, Book, BookFormat -from gutenbergtozim.iso639 import language_name -from gutenbergtozim.l10n import l10n_strings -from gutenbergtozim.s3 import upload_to_cache -from gutenbergtozim.shared import Global -from gutenbergtozim.utils import ( +import gutenberg2zim +from gutenberg2zim.constants import TMP_FOLDER, TMP_FOLDER_PATH, logger +from gutenberg2zim.database import Author, Book, BookFormat +from gutenberg2zim.iso639 import language_name +from gutenberg2zim.l10n import l10n_strings +from gutenberg2zim.s3 import upload_to_cache +from gutenberg2zim.shared import Global +from gutenberg2zim.utils import ( FORMAT_MATRIX, UTF8, archive_name_for, @@ -46,7 +41,9 @@ zip_epub, ) -jinja_env = Environment(loader=PackageLoader("gutenbergtozim", "templates")) +jinja_env = Environment( # noqa: S701 + loader=PackageLoader("gutenberg2zim", "templates") +) DEBUG_COUNT = [] NB_POPULARITY_STARS = 5 @@ -61,7 +58,6 @@ def get_ui_languages_for(books): def get_default_context(project_id, books): - return { "l10n_strings": json.dumps(l10n_strings), "ui_languages": get_ui_languages_for(books), @@ -70,28 +66,25 @@ def get_default_context(project_id, books): } -def fa_for_format(format): +def fa_for_format(book_format): return { "html": "", "info": "fa-info-circle", "epub": "fa-download", "pdf": "fa-file-pdf-o", - }.get(format, "fa-file-o") + }.get(book_format, "fa-file-o") -def zim_link_prefix(format): - return "../{}/".format({"html": "A", "epub": "I", "pdf": "I"}.get(format)) +def zim_link_prefix(book_format): + return "../{}/".format({"html": "A", "epub": "I", "pdf": "I"}.get(book_format)) def urlencode(url): - if six.PY2: - return urllib.quote(url.encode(UTF8)) - else: - 
return urllib.parse.quote(url) + return urllib.parse.quote(url) def save_bs_output(soup, fpath, encoding=UTF8): - save_file(soup if six.PY2 else str(soup), fpath, encoding) + save_file(str(soup), fpath, encoding) jinja_env.filters["book_name_for_fs"] = book_name_for_fs @@ -102,11 +95,11 @@ def save_bs_output(soup, fpath, encoding=UTF8): def tmpl_path(): - return os.path.join(path(gutenbergtozim.__file__).parent, "templates") + return os.path.join(Path(gutenberg2zim.__file__).parent, "templates") def get_list_of_all_languages(): - return list(set(list([b.language for b in Book.select(Book.language)]))) + return list({b.language for b in Book.select(Book.language)}) def export_illustration(): @@ -129,7 +122,6 @@ def export_skeleton( title_search, add_bookshelves, ): - context = get_default_context(project_id, books=books) context.update( { @@ -143,7 +135,7 @@ def export_skeleton( rendered = jinja_env.get_template("js/l10n.js").render(**context) Global.add_item_for( path="js/l10n.js", - content=rendered, + content=rendered, # type: ignore mimetype="text/javascript", is_front=True, ) @@ -179,7 +171,7 @@ def export_skeleton( rendered = template.render(**context) Global.add_item_for( path=tpl_path, - content=rendered, + content=rendered, # type: ignore mimetype="text/html", is_front=True, ) @@ -187,19 +179,18 @@ def export_skeleton( def export_all_books( project_id, - download_cache=None, - concurrency=None, - languages=[], - formats=[], - only_books=[], - force=False, - title_search=False, - add_bookshelves=False, - s3_storage=None, - optimizer_version=None, - stats_filename=None, + download_cache, + concurrency, + languages, + formats, + only_books, + force, + title_search, + add_bookshelves, + s3_storage, + optimizer_version, + stats_filename, ): - books = get_list_of_filtered_books( languages=languages, formats=formats, only_books=only_books ) @@ -237,10 +228,8 @@ def nb_by_fmt(fmt): # export to JSON helpers export_to_json_helpers( books=books, - languages=languages, formats=formats, project_id=project_id, - title_search=title_search, add_bookshelves=add_bookshelves, ) @@ -285,7 +274,6 @@ def dlb(b): export_book( b, book_dir=pathlib.Path(download_cache).joinpath(str(b.id)), - languages=languages, formats=formats, books=books, project_id=project_id, @@ -315,22 +303,21 @@ def report_progress(stats_filename=None): def html_content_for(book, src_dir): - html_fpath = src_dir.joinpath(fname_for(book, "html")) # is HTML file present? 
if not html_fpath.exists(): - logger.warn("Missing HTML content for #{} at {}".format(book.id, html_fpath)) + logger.warn(f"Missing HTML content for #{book.id} at {html_fpath}") return None, None try: return read_file(html_fpath) except UnicodeDecodeError: - logger.error("Unable to read HTML content: {}".format(html_fpath)) + logger.error(f"Unable to read HTML content: {html_fpath}") raise -def update_html_for_static(book, html_content, formats, epub=False): +def update_html_for_static(book, html_content, formats, *, epub=False): soup = BeautifulSoup(html_content, "lxml-html") # remove encoding as we're saving to UTF8 anyway @@ -344,7 +331,7 @@ def update_html_for_static(book, html_content, formats, epub=False): elif "content" in meta.attrs and "charset=" in meta.attrs.get("content"): try: ctype, _ = meta.attrs.get("content").split(";", 1) - except Exception: + except Exception: # noqa: S112 continue else: encoding_specified = True @@ -359,9 +346,7 @@ def update_html_for_static(book, html_content, formats, epub=False): if not epub: for img in soup.findAll("img"): if "src" in img.attrs: - img.attrs["src"] = img.attrs["src"].replace( - "images/", "{id}_".format(id=book.id) - ) + img.attrs["src"] = img.attrs["src"].replace("images/", f"{book.id}_") # update all links to internal HTML pages # should only apply to relative URLs to HTML files. @@ -376,7 +361,7 @@ def replacablement_link(book, url): return None if len(urlp.strip()): - nurl = "{id}_{url}".format(id=book.id, url=urlp) + nurl = f"{book.id}_{urlp}" else: nurl = "" @@ -399,9 +384,9 @@ def replacablement_link(book, url): head = soup.find("head") if not head: head = soup.new_tag("head") - soup.html.insert(0, head) + soup.html.insert(0, head) # type: ignore head.append(soup.new_tag("title")) - soup.title.string = book.title + soup.title.string = book.title # type: ignore patterns = [ ( @@ -413,8 +398,8 @@ def replacablement_link(book, url): "***END OF THE PROJECT GUTENBERG EBOOK", ), ( - "<><><><><><><><><><><><><><><><><><><><><><><><><><><><>" "<><><><><><>", - "<><><><><><><><><><><><><><><><><><><><><><><><><><><><>" "<><><><><><>", + "<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>", + "<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>", ), # ePub only ("*** START OF THIS PROJECT GUTENBERG EBOOK", "*** START: FULL LICENSE ***"), @@ -446,13 +431,19 @@ def replacablement_link(book, url): ), ("Project Gutenberg Etext", "End of Project Gutenberg Etext"), ("Text encoding is iso-8859-1", "Fin de Project Gutenberg Etext"), - ("—————————————————-", "Encode an ISO 8859/1 " "Etext into LaTeX or HTML"), + ("—————————————————-", "Encode an ISO 8859/1 Etext into LaTeX or HTML"), ] body = soup.find("body") try: is_encapsulated_in_div = ( - sum([1 for e in body.children if not isinstance(e, bs4.NavigableString)]) + sum( + [ + 1 + for e in body.children # type: ignore + if not isinstance(e, bs4.NavigableString) + ] + ) == 1 ) except Exception: @@ -463,45 +454,48 @@ def replacablement_link(book, url): if not is_encapsulated_in_div: for start_of_text, end_of_text in patterns: - if start_of_text not in body.text and end_of_text not in body.text: + if ( + start_of_text not in body.text # type: ignore + and end_of_text not in body.text # type: ignore + ): continue - if start_of_text in body.text and end_of_text in body.text: + if start_of_text in body.text and end_of_text in body.text: # type: ignore remove = True - for child in body.children: + for child in body.children: # type: ignore if isinstance(child, 
bs4.NavigableString): continue if end_of_text in getattr(child, "text", ""): remove = True if start_of_text in getattr(child, "text", ""): - child.decompose() + child.decompose() # type: ignore remove = False if remove: - child.decompose() + child.decompose() # type: ignore break - elif start_of_text in body.text: + elif start_of_text in body.text: # type: ignore # logger.debug("FOUND START: {}".format(start_of_text)) remove = True - for child in body.children: + for child in body.children: # type: ignore if isinstance(child, bs4.NavigableString): continue if start_of_text in getattr(child, "text", ""): - child.decompose() + child.decompose() # type: ignore remove = False if remove: - child.decompose() + child.decompose() # type: ignore break - elif end_of_text in body.text: + elif end_of_text in body.text: # type: ignore # logger.debug("FOUND END: {}".format(end_of_text)) remove = False - for child in body.children: + for child in body.children: # type: ignore if isinstance(child, bs4.NavigableString): continue if end_of_text in getattr(child, "text", ""): remove = True if remove: - child.decompose() + child.decompose() # type: ignore break # build infobox @@ -509,22 +503,22 @@ def replacablement_link(book, url): infobox = jinja_env.get_template("book_infobox.html") infobox_html = infobox.render({"book": book, "formats": formats}) info_soup = BeautifulSoup(infobox_html, "lxml-html") - body.insert(0, info_soup.find("div")) + body.insert(0, info_soup.find("div")) # type: ignore # if there is no charset, set it to utf8 if not epub: meta = BeautifulSoup( - '', + '', "lxml-html", ) head = soup.find("head") html = soup.find("html") if head: - head.insert(0, meta.head.contents[0]) + head.insert(0, meta.head.contents[0]) # type: ignore elif html: - html.insert(0, meta.head) + html.insert(0, meta.head) # type: ignore else: - soup.insert(0, meta.head) + soup.insert(0, meta.head) # type: ignore return html @@ -534,15 +528,15 @@ def replacablement_link(book, url): def cover_html_content_for( book, optimized_files_dir, books, project_id, title_search, add_bookshelves, formats ): - cover_img = "{id}_cover_image.jpg".format(id=book.id) + cover_img = f"{book.id}_cover_image.jpg" cover_img = cover_img if optimized_files_dir.joinpath(cover_img).exists() else None translate_author = ( - ' data-l10n-id="author-{id}"'.format(id=book.author.name().lower()) + f' data-l10n-id="author-{book.author.name().lower()}"' if book.author.name() in ["Anonymous", "Various"] else "" ) translate_license = ( - ' data-l10n-id="license-{id}"'.format(id=book.license.slug.lower()) + f' data-l10n-id="license-{book.license.slug.lower()}"' if book.license.slug in ["PD", "Copyright"] else "" ) @@ -569,11 +563,11 @@ def author_html_content_for(author, books, project_id): return template.render(**context) -def save_author_file(author, books, project_id, force=False): - logger.debug("\t\tSaving author file {} (ID {})".format(author.name(), author)) +def save_author_file(author, books, project_id): + logger.debug(f"\t\tSaving author file {author.name()} (ID {author})") Global.add_item_for( - path="{}.html".format(author.fname()), - content=author_html_content_for(author, books, project_id), + path=f"{author.fname()}.html", + content=author_html_content_for(author, books, project_id), # type: ignore mimetype="text/html", is_front=True, ) @@ -582,7 +576,6 @@ def save_author_file(author, books, project_id, force=False): def export_book( book, book_dir, - languages, formats, books, project_id, @@ -602,13 +595,8 @@ def export_book( 
handle_unoptimized_files( book=book, src_dir=unoptimized_files_dir, - languages=languages, formats=formats, - books=books, - project_id=project_id, force=force, - title_search=title_search, - add_bookshelves=add_bookshelves, s3_storage=s3_storage, optimizer_version=optimizer_version, ) @@ -628,24 +616,19 @@ def export_book( def handle_unoptimized_files( book, src_dir, - languages, formats, - books, - project_id, optimizer_version, - force=False, - title_search=False, - add_bookshelves=False, - s3_storage=None, + force, + s3_storage, ): def copy_file(src, dst): - logger.info("\t\tCopying from {} to {}".format(src, dst)) + logger.info(f"\t\tCopying from {src} to {dst}") try: shutil.copy2(src, dst) - except IOError: - logger.error("/!\\ Unable to copy missing file {}".format(src)) + except OSError: + logger.error(f"/!\\ Unable to copy missing file {src}") for line in traceback.format_stack(): - print(line.strip()) + print(line.strip()) # noqa: T201 return def update_download_cache(unoptimized_file, optimized_file): @@ -657,10 +640,10 @@ def update_download_cache(unoptimized_file, optimized_file): dst = optimized_dir.joinpath(optimized_file.name) os.unlink(unoptimized_file) copy_file(optimized_file.resolve(), dst.resolve()) - if not [fpath for fpath in unoptimized_dir.iterdir()]: + if not list(unoptimized_dir.iterdir()): unoptimized_dir.rmdir() - logger.info("\tExporting Book #{id}.".format(id=book.id)) + logger.info(f"\tExporting Book #{book.id}.") # actual book content, as HTML html, _ = html_content_for(book=book, src_dir=src_dir) @@ -669,7 +652,7 @@ def update_download_cache(unoptimized_file, optimized_file): article_name = article_name_for(book) article_fpath = TMP_FOLDER_PATH.joinpath(article_name) if not article_fpath.exists() or force: - logger.info("\t\tExporting to {}".format(article_fpath)) + logger.info(f"\t\tExporting to {article_fpath}") try: new_html = update_html_for_static( book=book, html_content=html, formats=formats @@ -684,14 +667,14 @@ def update_download_cache(unoptimized_file, optimized_file): if not src_dir.exists(): return else: - logger.info("\t\tSkipping HTML article {}".format(article_fpath)) + logger.info(f"\t\tSkipping HTML article {article_fpath}") Global.add_item_for(path=article_name, fpath=article_fpath) - def optimize_image(src, dst, force=False): + def optimize_image(src, dst, *, force=False): if dst.exists() and not force: - logger.info("\tSkipping image optimization for {}".format(dst)) + logger.info(f"\tSkipping image optimization for {dst}") return dst - logger.info("\tOptimizing image {}".format(dst)) + logger.info(f"\tOptimizing image {dst}") if src.suffix == ".png": return optimize_png(str(src.resolve()), str(dst.resolve())) if src.suffix in (".jpg", ".jpeg"): @@ -713,7 +696,7 @@ def optimize_jpeg(src, dst): exec_cmd(["jpegoptim", "--strip-all", "-m50", dst]) def optimize_epub(src, dst): - logger.info("\t\tCreating ePUB off {} at {}".format(src, dst)) + logger.info(f"\t\tCreating ePUB off {src} at {dst}") zipped_files = [] # create temp directory to extract to tmpd = tempfile.mkdtemp(dir=TMP_FOLDER) @@ -729,8 +712,7 @@ def optimize_epub(src, dst): remove_cover = False for fname in zipped_files: fnp = os.path.join(tmpd, fname) - if path(fname).ext in (".png", ".jpeg", ".jpg", ".gif"): - + if Path(fname).ext in (".png", ".jpeg", ".jpg", ".gif"): # special case to remove ugly cover if fname.endswith("cover.jpg") and is_bad_cover(fnp): zipped_files.remove(fname) @@ -738,14 +720,14 @@ def optimize_epub(src, dst): else: optimize_image(pathlib.Path(fnp), 
pathlib.Path(fnp), force=True) - if path(fname).ext in (".htm", ".html"): + if Path(fname).ext in (".htm", ".html"): html_content, _ = read_file(fnp) html = update_html_for_static( book=book, html_content=html_content, formats=formats, epub=True ) save_bs_output(html, fnp, UTF8) - if path(fname).ext == ".ncx": + if Path(fname).ext == ".ncx": pattern = "*** START: FULL LICENSE ***" ncx, _ = read_file(fnp) soup = BeautifulSoup(ncx, "lxml-xml") @@ -753,17 +735,16 @@ def optimize_epub(src, dst): if pattern in tag.text: s = tag.parent.parent s.decompose() - for s in s.next_siblings: + for s in s.next_siblings: # noqa: B020 s.decompose() - s.next_sibling + s.next_sibling # noqa: B018 save_bs_output(soup, fnp, UTF8) # delete {id}/cover.jpg if exist and update {id}/content.opf if remove_cover: - # remove cover - path(os.path.join(tmpd, text_type(book.id), "cover.jpg")).unlink_p() + Path(os.path.join(tmpd, text_type(book.id), "cover.jpg")).unlink_p() soup = None opff = os.path.join(tmpd, text_type(book.id), "content.opf") @@ -780,12 +761,13 @@ def optimize_epub(src, dst): # bundle epub as zip zip_epub(epub_fpath=dst, root_folder=tmpd, fpaths=zipped_files) - path(tmpd).rmtree_p() + Path(tmpd).rmtree_p() def handle_companion_file( fname, + book: Book, dstfname=None, - book=None, + *, force=False, as_ext=None, html_file_list=None, @@ -797,13 +779,13 @@ def handle_companion_file( dstfname = fname.name dst = TMP_FOLDER_PATH.joinpath(dstfname) if dst.exists() and not force: - logger.debug("\t\tSkipping already optimized companion {}".format(dstfname)) + logger.debug(f"\t\tSkipping already optimized companion {dstfname}") Global.add_item_for(path=dstfname, fpath=dst) return # optimization based on mime/extension if ext in (".png", ".jpg", ".jpeg", ".gif"): - logger.info("\tCopying and optimizing image companion {}".format(fname)) + logger.info(f"\tCopying and optimizing image companion {fname}") optimize_image(src, dst) Global.add_item_for(path=dstfname, fpath=dst) if dst.name == (f"{book.id}_cover_image.jpg"): @@ -821,7 +803,7 @@ def handle_companion_file( html_file_list.append(dst) update_download_cache(src, dst) elif ext == ".epub": - logger.info("\tCreating optimized EPUB file {}".format(fname)) + logger.info(f"\tCreating optimized EPUB file {fname}") tmp_epub = tempfile.NamedTemporaryFile(suffix=".epub", dir=TMP_FOLDER) tmp_epub.close() try: @@ -831,9 +813,15 @@ def handle_companion_file( "\t\tBad zip file. 
" "Copying as it might be working{}".format(fname) ) - handle_companion_file(fname, dstfname, book, force, as_ext=".zip") + handle_companion_file( + fname=fname, + dstfname=dstfname, + book=book, + force=force, + as_ext=".zip", + ) else: - path(tmp_epub.name).move(dst) + Path(tmp_epub.name).move(str(dst)) Global.add_item_for(path=dstfname, fpath=dst) if s3_storage: upload_to_cache( @@ -850,7 +838,7 @@ def handle_companion_file( if src.name.endswith("_Thumbs.db"): return # copy otherwise (PDF mostly) - logger.info("\tCopying companion file from {} to {}".format(src, dst)) + logger.info(f"\tCopying companion file from {src} to {dst}") copy_file(src, dst) Global.add_item_for(path=dstfname, fpath=dst) if ext != ".pdf" and ext != ".zip" and html_file_list: @@ -864,11 +852,11 @@ def handle_companion_file( src = fpath dst = TMP_FOLDER_PATH.joinpath(fpath.name) if dst.exists() and not force: - logger.debug("\t\tSkipping already optimized HTML {}".format(dst)) + logger.debug(f"\t\tSkipping already optimized HTML {dst}") Global.add_item_for(path=fpath.name, fpath=dst) continue - logger.info("\tExporting HTML file to {}".format(dst)) + logger.info(f"\tExporting HTML file to {dst}") html, _ = read_file(src) new_html = update_html_for_static( book=book, html_content=html, formats=formats @@ -879,7 +867,7 @@ def handle_companion_file( else: try: handle_companion_file( - fpath, + fname=fpath, force=force, html_file_list=html_book_optimized_files, s3_storage=s3_storage, @@ -887,9 +875,7 @@ def handle_companion_file( ) except Exception as e: logger.exception(e) - logger.error( - "\t\tException while handling companion file: {}".format(e) - ) + logger.error(f"\t\tException while handling companion file: {e}") if s3_storage and html_book_optimized_files: upload_to_cache( asset=html_book_optimized_files, @@ -901,24 +887,22 @@ def handle_companion_file( ) # other formats - for format in formats: - if format not in book.formats() or format == "html": + for other_format in formats: + if other_format not in book.formats() or other_format == "html": continue - book_file = src_dir.joinpath(fname_for(book, format)) + book_file = src_dir.joinpath(fname_for(book, other_format)) if book_file.exists(): try: handle_companion_file( - book_file, - archive_name_for(book, format), + fname=book_file, + dstfname=archive_name_for(book, other_format), force=force, book=book, s3_storage=s3_storage, ) except Exception as e: logger.exception(e) - logger.error( - "\t\tException while handling companion file: {}".format(e) - ) + logger.error(f"\t\tException while handling companion file: {e}") def write_book_presentation_article( @@ -934,7 +918,7 @@ def write_book_presentation_article( article_name = article_name_for(book=book, cover=True) cover_fpath = TMP_FOLDER_PATH.joinpath(article_name) if not cover_fpath.exists() or force: - logger.info("\t\tExporting article presentation to {}".format(cover_fpath)) + logger.info(f"\t\tExporting article presentation to {cover_fpath}") html = cover_html_content_for( book=book, optimized_files_dir=optimized_files_dir, @@ -945,12 +929,9 @@ def write_book_presentation_article( formats=formats, ) with open(cover_fpath, "w") as f: - if six.PY2: - f.write(html.encode(UTF8)) - else: - f.write(html) + f.write(html) else: - logger.info("\t\tSkipping already optimized cover {}".format(cover_fpath)) + logger.info(f"\t\tSkipping already optimized cover {cover_fpath}") Global.add_item_for(path=article_name, fpath=cover_fpath) @@ -995,13 +976,11 @@ def bookshelf_list_language(books, lang): ] -def 
export_to_json_helpers( - books, languages, formats, project_id, title_search, add_bookshelves -): +def export_to_json_helpers(books, formats, project_id, add_bookshelves): def dumpjs(col, fn, var="json_data"): Global.add_item_for( path=fn, - content="var {var} = {content};".format(var=var, content=json.dumps(col)), + content=f"var {var} = {json.dumps(col)};", # type: ignore mimetype="text/javascript", is_front=False, ) @@ -1031,16 +1010,16 @@ def dumpjs(col, fn, var="json_data"): all_filtered_authors = [] # language-specific collections - for lang_name, lang, lang_count in avail_langs: + for _lang_name, lang, _lang_count in avail_langs: lang_filtered_authors = list( - set([book.author.gut_id for book in books.filter(language=lang)]) + {book.author.gut_id for book in books.filter(language=lang)} ) for aid in lang_filtered_authors: if aid not in all_filtered_authors: all_filtered_authors.append(aid) # by popularity - logger.info("\t\tDumping lang_{}_by_popularity.js".format(lang)) + logger.info(f"\t\tDumping lang_{lang}_by_popularity.js") dumpjs( [ book.to_array(all_requested_formats=formats) @@ -1048,10 +1027,10 @@ def dumpjs(col, fn, var="json_data"): Book.downloads.desc() ) ], - "lang_{}_by_popularity.js".format(lang), + f"lang_{lang}_by_popularity.js", ) # by title - logger.info("\t\tDumping lang_{}_by_title.js".format(lang)) + logger.info(f"\t\tDumping lang_{lang}_by_title.js") dumpjs( [ book.to_array(all_requested_formats=formats) @@ -1059,14 +1038,14 @@ def dumpjs(col, fn, var="json_data"): Book.title.asc() ) ], - "lang_{}_by_title.js".format(lang), + f"lang_{lang}_by_title.js", ) authors = authors_from_ids(lang_filtered_authors) - logger.info("\t\tDumping authors_lang_{}.js".format(lang)) + logger.info(f"\t\tDumping authors_lang_{lang}.js") dumpjs( [author.to_array() for author in authors], - "authors_lang_{}.js".format(lang), + f"authors_lang_{lang}.js", "authors_json_data", ) @@ -1079,7 +1058,7 @@ def dumpjs(col, fn, var="json_data"): # dumpjs for bookshelf by popularity # this will allow the popularity button to use this js on the # particular bookshelf page - logger.info("\t\tDumping bookshelf_{}_by_popularity.js".format(bookshelf)) + logger.info(f"\t\tDumping bookshelf_{bookshelf}_by_popularity.js") dumpjs( [ book.to_array(all_requested_formats=formats) @@ -1087,11 +1066,11 @@ def dumpjs(col, fn, var="json_data"): .where(Book.bookshelf == bookshelf) .order_by(Book.downloads.desc()) ], - "bookshelf_{}_by_popularity.js".format(bookshelf), + f"bookshelf_{bookshelf}_by_popularity.js", ) # by title - logger.info("\t\tDumping bookshelf_{}_by_title.js".format(bookshelf)) + logger.info(f"\t\tDumping bookshelf_{bookshelf}_by_title.js") dumpjs( [ book.to_array(all_requested_formats=formats) @@ -1099,13 +1078,11 @@ def dumpjs(col, fn, var="json_data"): .where(Book.bookshelf == bookshelf) .order_by(Book.title.asc()) ], - "bookshelf_{}_by_title.js".format(bookshelf), + f"bookshelf_{bookshelf}_by_title.js", ) # by language - for lang_name, lang, lang_count in avail_langs: - logger.info( - "\t\tDumping bookshelf_{}_by_lang_{}.js".format(bookshelf, lang) - ) + for _lang_name, lang, _lang_count in avail_langs: + logger.info(f"\t\tDumping bookshelf_{bookshelf}_by_lang_{lang}.js") dumpjs( [ book.to_array(all_requested_formats=formats) @@ -1114,7 +1091,7 @@ def dumpjs(col, fn, var="json_data"): .where(Book.bookshelf == bookshelf) .order_by(Book.downloads.desc()) ], - "bookshelf_{}_lang_{}_by_popularity.js".format(bookshelf, lang), + f"bookshelf_{bookshelf}_lang_{lang}_by_popularity.js", ) dumpjs( 
@@ -1125,14 +1102,14 @@ def dumpjs(col, fn, var="json_data"): .where(Book.bookshelf == bookshelf) .order_by(Book.title.asc()) ], - "bookshelf_{}_lang_{}_by_title.js".format(bookshelf, lang), + f"bookshelf_{bookshelf}_lang_{lang}_by_title.js", ) # dump all bookshelves from any given language - for lang_name, lang, lang_count in avail_langs: - logger.info("\t\tDumping bookshelves_lang_{}.js".format(lang)) + for _lang_name, lang, _lang_count in avail_langs: + logger.info(f"\t\tDumping bookshelves_lang_{lang}.js") temp = bookshelf_list_language(books, lang) - dumpjs(temp, "bookshelves_lang_{}.js".format(lang)) + dumpjs(temp, f"bookshelves_lang_{lang}.js") logger.info("\t\tDumping bookshelves.js") dumpjs(bookshelves, "bookshelves.js", "bookshelves_json_data") @@ -1144,7 +1121,7 @@ def dumpjs(col, fn, var="json_data"): rendered = template.render(**context) Global.add_item_for( path="bookshelf_home.html", - content=rendered, + content=rendered, # type: ignore mimetype="text/html", is_front=False, ) @@ -1165,8 +1142,8 @@ def dumpjs(col, fn, var="json_data"): template = jinja_env.get_template("bookshelf.html") rendered = template.render(**context) Global.add_item_for( - path="{}.html".format(bookshelf), - content=rendered, + path=f"{bookshelf}.html", + content=rendered, # type: ignore mimetype="text/html", is_front=False, ) @@ -1174,10 +1151,9 @@ def dumpjs(col, fn, var="json_data"): # author specific collections authors = authors_from_ids(all_filtered_authors) for author in authors: - # all_filtered_authors.remove(author.gut_id) # by popularity - logger.info("\t\tDumping auth_{}_by_popularity.js".format(author.gut_id)) + logger.info(f"\t\tDumping auth_{author.gut_id}_by_popularity.js") dumpjs( [ book.to_array(all_requested_formats=formats) @@ -1185,10 +1161,10 @@ def dumpjs(col, fn, var="json_data"): Book.downloads.desc() ) ], - "auth_{}_by_popularity.js".format(author.gut_id), + f"auth_{author.gut_id}_by_popularity.js", ) # by title - logger.info("\t\tDumping auth_{}_by_title.js".format(author.gut_id)) + logger.info(f"\t\tDumping auth_{author.gut_id}_by_title.js") dumpjs( [ book.to_array(all_requested_formats=formats) @@ -1196,11 +1172,11 @@ def dumpjs(col, fn, var="json_data"): Book.title.asc() ) ], - "auth_{}_by_title.js".format(author.gut_id), + f"auth_{author.gut_id}_by_title.js", ) # by language - for lang_name, lang, lang_count in avail_langs: - logger.info("\t\tDumping auth_{}_by_lang_{}.js".format(author.gut_id, lang)) + for _lang_name, lang, _lang_count in avail_langs: + logger.info(f"\t\tDumping auth_{author.gut_id}_by_lang_{lang}.js") dumpjs( [ book.to_array(all_requested_formats=formats) @@ -1208,7 +1184,7 @@ def dumpjs(col, fn, var="json_data"): .where(Book.author == author) .order_by(Book.downloads.desc()) ], - "auth_{}_lang_{}_by_popularity.js".format(author.gut_id, lang), + f"auth_{author.gut_id}_lang_{lang}_by_popularity.js", ) dumpjs( @@ -1218,11 +1194,11 @@ def dumpjs(col, fn, var="json_data"): .where(Book.author == author) .order_by(Book.title.asc()) ], - "auth_{}_lang_{}_by_title.js".format(author.gut_id, lang), + f"auth_{author.gut_id}_lang_{lang}_by_title.js", ) # author HTML redirect file - save_author_file(author, books, project_id, force=True) + save_author_file(author, books, project_id) # authors list sorted by name logger.info("\t\tDumping authors.js") diff --git a/gutenbergtozim/iso639.py b/src/gutenberg2zim/iso639.py similarity index 97% rename from gutenbergtozim/iso639.py rename to src/gutenberg2zim/iso639.py index 453d2c9..2fedbfa 100644 --- 
a/gutenbergtozim/iso639.py +++ b/src/gutenberg2zim/iso639.py @@ -1,13 +1,9 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# vim: ai ts=4 sts=4 et sw=4 nu - import babel def language_name(code): try: - return babel.Locale(code).get_language_name(code).title() + return babel.Locale(code).get_language_name(code).title() # type: ignore except Exception: return other_language_names.get(code, code) @@ -26,7 +22,7 @@ def language_name(code): "yi": "ייִדיש", "ale": "Unangam Tunuu / Унáҥам Тунý", "ang": "Ænglisc", - "arp": "Hinónoʼeitíít", + "arp": "Hinónoʼeitíít", "bgi": "Giangan", "ceb": "Cebuano", "csb": "kaszëbsczi", diff --git a/gutenbergtozim/l10n.py b/src/gutenberg2zim/l10n.py similarity index 96% rename from gutenbergtozim/l10n.py rename to src/gutenberg2zim/l10n.py index 1ac4290..59c176a 100644 --- a/gutenbergtozim/l10n.py +++ b/src/gutenberg2zim/l10n.py @@ -1,7 +1,3 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# vim: ai ts=4 sts=4 et sw=4 nu - l10n_strings = { "default_locale": "en", "locales": { @@ -44,10 +40,10 @@ "table-previous": {"textContent": "Previous"}, "table-next": {"textContent": "Next"}, "about-1": { - "textContent": "Project Gutenberg offers over " "60,000 free ebooks" + "textContent": "Project Gutenberg offers over 60,000 free ebooks" }, "about-2": { - "textContent": "Choose among free epub books, " "free kindle books." + "textContent": "Choose among free epub books, free kindle books." }, "about-3": {"textContent": "Download them or read them online."}, "about-4": {"textContent": "We carry high quality ebooks"}, @@ -68,9 +64,9 @@ "fr": { "isocode": {"textContent": "fr"}, "autonym": {"textContent": "Français"}, - "homepage": {"alt": "Page d’accueil", "title": "Page d’accueil"}, + "homepage": {"alt": "Page d’accueil", "title": "Page d’accueil"}, "choose-language": {"placeholder": "Choisissez une langue..."}, - "ui-language-switcher": {"title": "Langue de l’interface"}, + "ui-language-switcher": {"title": "Langue de l’interface"}, "search": {"textContent": "Rechercher"}, "author": { "textContent": "Auteur", @@ -102,14 +98,14 @@ "license-pd": {"textContent": "Domaine public aux États-Unis."}, "nb-downloads": {"textContent": "{[ plural(nb) ]}"}, "nb-downloads[one]": { - "textContent": "1 téléchargement au cours des " "30 derniers jours." + "textContent": "1 téléchargement au cours des 30 derniers jours." }, "nb-downloads[other]": { "textContent": "{{nb}} téléchargements au cours des " "30 derniers jours." }, "top-title": {"textContent": "Bibliothèque du projet Gutenberg"}, - "sub-title": {"textContent": "Premier producteur d’ebooks gratuits"}, + "sub-title": {"textContent": "Premier producteur d’ebooks gratuits"}, "table-previous": {"textContent": "Prédédent"}, "table-next": {"textContent": "Suivant"}, "about-1": { @@ -117,11 +113,11 @@ "60 000 ebooks gratuits" }, "about-2": { - "textContent": "Choisissez parmi les livres epub ou " "kindle gratuits." + "textContent": "Choisissez parmi les livres epub ou kindle gratuits." }, "about-3": {"textContent": "Téléchargez-les ou lisez-les en ligne."}, "about-4": { - "textContent": "Nous attachons beaucoup d’importance " + "textContent": "Nous attachons beaucoup d’importance " "à la qualité des ebooks" }, "about-5": { @@ -131,7 +127,7 @@ }, "about-6": { "textContent": "Nous les numérisons et les relisons " - "soigneusement avec l’aide de milliers " + "soigneusement avec l’aide de milliers " "de volontaires."
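Aside: l10n_strings above is keyed by locale, then element id, then DOM attribute. A hypothetical lookup helper (not part of l10n.py) showing how such an entry is resolved with a fallback to the default locale:

def resolve_string(l10n_strings, locale, key, attr="textContent"):
    """Return the localized attribute value for key, falling back to default_locale."""
    locales = l10n_strings["locales"]
    default = l10n_strings["default_locale"]
    entry = locales.get(locale, {}).get(key) or locales[default].get(key, {})
    return entry.get(attr)

# resolve_string(l10n_strings, "fr", "sub-title") would yield
# "Premier producteur d’ebooks gratuits"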
}, "bookshelves_title": {"textContent": "Rayons (A-Z)"}, @@ -250,7 +246,7 @@ "60,000 ebooks gratuiti" }, "about-2": { - "textContent": "Scegli fra file epub gratuiti, " "file kindle gratuiti." + "textContent": "Scegli fra file epub gratuiti, file kindle gratuiti." }, "about-3": {"textContent": "Scaricali o leggili online."}, "about-4": {"textContent": "Noi forniamo ebook di alta qualità"}, @@ -314,13 +310,11 @@ "textContent": "مشروع Gutenberg يوفر أكثر من" "60,000 كتاب ألكتروني مجاني" }, - "about-2": { - "textContent": "أختر من ضمن كتب ال epub, " "كتب kindle مجانية." - }, + "about-2": {"textContent": "أختر من ضمن كتب ال epub, كتب kindle مجانية."}, "about-3": {"textContent": "قم بتنزيلهم او قرائتهم عبر الأنترنت."}, "about-4": {"textContent": "نحن لدينا كتب ألكترونية عالية الجودة"}, "about-5": { - "textContent": "كل كتبنا كانت مسبقا " "منشورة من قبل ناشرين bona fide." + "textContent": "كل كتبنا كانت مسبقا منشورة من قبل ناشرين bona fide." }, "about-6": { "textContent": "لقد رقمناهم و قرأناهم " @@ -380,7 +374,7 @@ "60,000 gratis eboeken" }, "about-2": { - "textContent": "Kies uit gratis epub boeken, " "gratis kindle boeken." + "textContent": "Kies uit gratis epub boeken, gratis kindle boeken." }, "about-3": {"textContent": "Download ze of lees ze online."}, "about-4": {"textContent": "Wij bieden eboeken van hoge kwaliteit"}, @@ -589,7 +583,7 @@ "title": "Projekt Gutenberg-Bibliothek - Bücher auf Deutsch", }, "ell": { - "description": "Ο πρώτος παραγωγός δωρεάν ηλεκτρονικών βιβλίων", + "description": "O πρώτος παραγωγός δωρεάν ηλεκτρονικών βιβλίων", "title": "Project Gutenberg Library - Βιβλία στα Ελληνικά", }, "eng": { @@ -606,7 +600,7 @@ }, "est": { "description": "Esimene tasuta e-raamatute tootja", - "title": "Projekt Gutenbergi raamatukogu – eestikeelsed raamatud", + "title": "Projekt Gutenbergi raamatukogu - eestikeelsed raamatud", }, "fas": { "description": "اولین تولید کننده کتاب های الکترونیکی رایگان", @@ -629,7 +623,7 @@ "title": "Project Gutenberg Library - Books in Friulian", }, "gla": { - "description": "A ' chiad neach-dèanaidh leabhraichean-d a saor an " "asgaidh", + "description": "A ' chiad neach-dèanaidh leabhraichean-d a saor an asgaidh", "title": "Leabharlann Pròiseact Gutenberg - Leabhraichean ann an " "Gàidhlig na h-Alba", }, @@ -678,7 +672,7 @@ "title": "Project Gutenberg Library - Books in Latin", }, "mri": { - "description": "Ko te kaihanga tuatahi o nga pukapuka pukapuka kore " "utu", + "description": "Ko te kaihanga tuatahi o nga pukapuka pukapuka kore utu", "title": "Project Gutenberg Library - Books in Māori", }, "myn": { @@ -738,7 +732,7 @@ "title": "Biblioteca Proyecto Gutenberg - Libros en español", }, "srp": { - "description": "Први произвођач бесплатних е-књига", + "description": "Први произвођач бесплатних e-књига", "title": "Пројекат Библиотека Гутенберг - Књиге на српском", }, "swe": { diff --git a/gutenbergtozim/rdf.py b/src/gutenberg2zim/rdf.py similarity index 84% rename from gutenbergtozim/rdf.py rename to src/gutenberg2zim/rdf.py index d91996a..ef29ee8 100644 --- a/gutenbergtozim/rdf.py +++ b/src/gutenberg2zim/rdf.py @@ -1,7 +1,3 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# vim: ai ts=4 sts=4 et sw=4 nu - import os import pathlib import re @@ -10,9 +6,9 @@ import peewee from bs4 import BeautifulSoup -from gutenbergtozim import logger -from gutenbergtozim.database import Author, Book, BookFormat, License -from gutenbergtozim.utils import ( +from gutenberg2zim.constants import logger +from gutenberg2zim.database import Author, Book, BookFormat, 
License +from gutenberg2zim.utils import ( BAD_BOOKS_FORMATS, FORMAT_MATRIX, download_file, @@ -29,24 +25,27 @@ def get_rdf_fpath(): def download_rdf_file(rdf_path, rdf_url): """Download rdf-files archive""" if rdf_path.exists(): - logger.info("\trdf-files archive already exists in {}".format(rdf_path)) + logger.info(f"\trdf-files archive already exists in {rdf_path}") return - logger.info("\tDownloading {} into {}".format(rdf_url, rdf_path)) + logger.info(f"\tDownloading {rdf_url} into {rdf_path}") download_file(rdf_url, rdf_path) -def parse_and_fill(rdf_path, only_books=[], force=False): - logger.info("\tLooping throught RDF files in {}".format(rdf_path)) +def parse_and_fill(rdf_path, only_books): + logger.info(f"\tLooping through RDF files in {rdf_path}") rdf_tarfile = tarfile.open(name=rdf_path, mode="r|bz2") for rdf_member in rdf_tarfile: rdf_member_path = pathlib.Path(rdf_member.name) # skip books outside of requested list - if only_books and int(rdf_member_path.stem.replace("pg", "").replace(".rdf", "")) not in only_books: + if ( + only_books + and int(rdf_member_path.stem.replace("pg", "").replace(".rdf", "")) + not in only_books + ): continue if rdf_member_path.name == "pg0.rdf": @@ -55,28 +54,25 @@ def parse_and_fill(rdf_path, only_books=[], force=False): if not str(rdf_member_path.name).endswith(".rdf"): continue - parse_and_process_file(rdf_tarfile, rdf_member, force) - + parse_and_process_file(rdf_tarfile, rdf_member) -def parse_and_process_file(rdf_tarfile, rdf_member, force=False): - gid = re.match(r".*/pg([0-9]+).rdf", rdf_member.name).groups()[0] +def parse_and_process_file(rdf_tarfile, rdf_member): + gid = re.match(r".*/pg([0-9]+).rdf", rdf_member.name).groups()[0] # type: ignore if Book.get_or_none(id=int(gid)): logger.info( - "\tSkipping already parsed file {} for book id {}".format( rdf_member.name, gid ) + f"\tSkipping already parsed file {rdf_member.name} for book id {gid}" ) return - logger.info("\tParsing file {} for book id {}".format(rdf_member.name, gid)) + logger.info(f"\tParsing file {rdf_member.name} for book id {gid}") parser = RdfParser(rdf_tarfile.extractfile(rdf_member).read(), gid).parse() if parser.license == "None": - logger.info("\tWARN: Unusable book without any information {}".format(gid)) + logger.info(f"\tWARN: Unusable book without any information {gid}") elif parser.title == "": - logger.info("\tWARN: Unusable book without title {}".format(gid)) + logger.info(f"\tWARN: Unusable book without title {gid}") else: save_rdf_in_database(parser) @@ -109,7 +105,7 @@ def parse(self): # Parsing for the bookshelf name self.bookshelf = soup.find("pgterms:bookshelf") if self.bookshelf: - self.bookshelf = self.bookshelf.find("rdf:value").text + self.bookshelf = self.bookshelf.find("rdf:value").text # type: ignore # Search rdf to see if the image exists at the hard link # https://www.gutenberg.ord/cache/epub/id/pg{id}.cover.medium.jpg @@ -130,14 +126,14 @@ def parse(self): if self.author: self.author_id = self.author.find("pgterms:agent") self.author_id = ( - self.author_id.attrs["rdf:about"].split("/")[-1] + self.author_id.attrs["rdf:about"].split("/")[-1] # type: ignore if "rdf:about" in getattr(self.author_id, "attrs", "") else None ) if self.author.find("pgterms:name"): self.author_name = self.author.find("pgterms:name") - self.author_name = self.author_name.text.split(",") + self.author_name = self.author_name.text.split(",") # type: ignore if len(self.author_name) > 1: self.first_name = " ".join(self.author_name[::-2]).strip() @@ -154,15 +150,17 
@@ def parse(self): self.death_year = get_formatted_number(self.death_year) # ISO 639-3 language codes that consist of 2 or 3 letters - self.language = soup.find("dcterms:language").find("rdf:value").text + self.language = ( + soup.find("dcterms:language").find("rdf:value").text # type: ignore + ) # The download count of the books on www.gutenberg.org. # This will be used to determine the popularity of the book. - self.downloads = soup.find("pgterms:downloads").text + self.downloads = soup.find("pgterms:downloads").text # type: ignore # The book might be licensed under GPL, public domain # or might be copyrighted - self.license = soup.find("dcterms:rights").text + self.license = soup.find("dcterms:rights").text # type: ignore # Finding out all the file types this book is available in file_types = soup.find_all("pgterms:file") @@ -177,7 +175,6 @@ def parse(self): def save_rdf_in_database(parser): - # Insert author, if it not exists if parser.author_id: try: @@ -218,7 +215,7 @@ def save_rdf_in_database(parser): try: book_record = Book.get(id=parser.gid) - except Book.DoesNotExist: + except peewee.DoesNotExist: book_record = Book.create( id=parser.gid, title=normalize(parser.title.strip()), @@ -250,7 +247,6 @@ def save_rdf_in_database(parser): # Insert formats for file_type in parser.file_types: - # Sanitize MIME mime = parser.file_types[file_type] if not mime.startswith("text/plain"): @@ -265,11 +261,9 @@ def save_rdf_in_database(parser): bid = int(book_record.id) if bid in BAD_BOOKS_FORMATS.keys() and mime in [ - FORMAT_MATRIX.get(f) for f in BAD_BOOKS_FORMATS.get(bid) + FORMAT_MATRIX.get(f) for f in BAD_BOOKS_FORMATS.get(bid) # type: ignore ]: - logger.error( - "\t**** EXCLUDING **** {} for book #{} from list.".format(mime, bid) - ) + logger.error(f"\t**** EXCLUDING **** {mime} for book #{bid} from list.") continue # Insert book format @@ -278,9 +272,10 @@ def save_rdf_in_database(parser): mime=mime, images=file_type.endswith(".images") or parser.file_types[file_type] == "application/pdf", - pattern=pattern, + pattern=pattern, ) + def get_formatted_number(num): """ Get a formatted string of a number from a not-predictable-string @@ -299,15 +294,15 @@ def get_formatted_number(num): if __name__ == "__main__": # Bacic Test with a sample rdf file - nums = ["{0:0=5d}".format(i) for i in range(21000, 40000)] + nums = [f"{i:0=5d}" for i in range(21000, 40000)] for num in nums: - print(num) + print(num) # noqa: T201 curd = os.path.dirname(os.path.realpath(__file__)) rdf = os.path.join(curd, "..", "rdf-files", num, "pg" + num + ".rdf") if os.path.isfile(rdf): data = "" - with open(rdf, "r") as f: + with open(rdf) as f: data = f.read() parser = RdfParser(data, num).parse() - print(parser.first_name, parser.last_name) + print(parser.first_name, parser.last_name) # noqa: T201 diff --git a/gutenbergtozim/s3.py b/src/gutenberg2zim/s3.py similarity index 93% rename from gutenbergtozim/s3.py rename to src/gutenberg2zim/s3.py index fbb3e7c..fad6b0c 100644 --- a/gutenbergtozim/s3.py +++ b/src/gutenberg2zim/s3.py @@ -1,7 +1,3 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# vim: ai ts=4 sts=4 et sw=4 nu - import os import pathlib import zipfile @@ -9,8 +5,8 @@ from kiwixstorage import KiwixStorage from pif import get_public_ip -from . 
import TMP_FOLDER, logger -from .utils import archive_name_for +from gutenberg2zim.constants import TMP_FOLDER, logger +from gutenberg2zim.utils import archive_name_for def s3_credentials_ok(s3_url_with_credentials): @@ -46,7 +42,7 @@ def download_from_cache( ): logger.error( f"optimizer version doesn't match for {key}. Expected " - + "{optimizer_version[book_format]}, got {meta.get('optimizer_version')}" + f"{optimizer_version[book_format]}, got {meta.get('optimizer_version')}" ) return False dest_dir.mkdir(parents=True, exist_ok=True) diff --git a/gutenbergtozim/shared.py b/src/gutenberg2zim/shared.py similarity index 72% rename from gutenbergtozim/shared.py rename to src/gutenberg2zim/shared.py index c1c260f..3f38fed 100644 --- a/gutenbergtozim/shared.py +++ b/src/gutenberg2zim/shared.py @@ -1,17 +1,16 @@ import pathlib import threading from datetime import date -from typing import Any, Optional, Tuple, Union from zimscraperlib.zim.creator import Creator -from gutenbergtozim import VERSION, logger +from gutenberg2zim.constants import VERSION, logger class Global: """Shared context accross all scraper components""" - creator = None + creator: Creator _lock = threading.Lock() total = 0 @@ -41,27 +40,27 @@ def setup(filename, language, title, description, name): workaround_nocancel=False, title=title, description=description, - creator="gutenberg.org", - publisher="Kiwix", + creator="gutenberg.org", # type: ignore + publisher="Kiwix", # type: ignore name=name, - tags="_category:gutenberg;gutenberg", - scraper="gutengergtozim-{v}".format(v=VERSION), - date=date.today(), + tags="_category:gutenberg;gutenberg", # type: ignore + scraper=f"gutenberg2zim-{VERSION}", # type: ignore + date=date.today(), # type: ignore ).config_verbose(True) @staticmethod def add_item_for( path: str, - title: Optional[str] = None, - fpath: Optional[pathlib.Path] = None, - content: Optional[bytes] = None, - mimetype: Optional[str] = None, - is_front: Optional[bool] = None, - should_compress: Optional[bool] = None, - delete_fpath: Optional[bool] = False, - callback: Optional[Union[callable, Tuple[callable, Any]]] = None, + title: str | None = None, + fpath: pathlib.Path | None = None, + content: bytes | None = None, + mimetype: str | None = None, + is_front: bool | None = None, + should_compress: bool | None = None, + *, + delete_fpath: bool | None = False, ): - logger.debug("\t\tAdding ZIM item at {}".format(path)) + logger.debug(f"\t\tAdding ZIM item at {path}") if not mimetype and path.endswith(".epub"): mimetype = "application/epub+zip" with Global._lock: @@ -74,7 +73,6 @@ def add_item_for( is_front=is_front, should_compress=should_compress, delete_fpath=delete_fpath, - callback=callback, ) @staticmethod diff --git a/gutenbergtozim/templates/Home.html b/src/gutenberg2zim/templates/Home.html similarity index 100% rename from gutenbergtozim/templates/Home.html rename to src/gutenberg2zim/templates/Home.html diff --git a/gutenbergtozim/templates/author.html b/src/gutenberg2zim/templates/author.html similarity index 100% rename from gutenbergtozim/templates/author.html rename to src/gutenberg2zim/templates/author.html diff --git a/gutenbergtozim/templates/base.html b/src/gutenberg2zim/templates/base.html similarity index 100% rename from gutenbergtozim/templates/base.html rename to src/gutenberg2zim/templates/base.html diff --git a/gutenbergtozim/templates/book_infobox.html b/src/gutenberg2zim/templates/book_infobox.html similarity index 100% rename from gutenbergtozim/templates/book_infobox.html rename to 
src/gutenberg2zim/templates/book_infobox.html diff --git a/gutenbergtozim/templates/bookshelf.html b/src/gutenberg2zim/templates/bookshelf.html similarity index 100% rename from gutenbergtozim/templates/bookshelf.html rename to src/gutenberg2zim/templates/bookshelf.html diff --git a/gutenbergtozim/templates/bookshelf_home.html b/src/gutenberg2zim/templates/bookshelf_home.html similarity index 100% rename from gutenbergtozim/templates/bookshelf_home.html rename to src/gutenberg2zim/templates/bookshelf_home.html diff --git a/gutenbergtozim/templates/cover_article.html b/src/gutenberg2zim/templates/cover_article.html similarity index 100% rename from gutenbergtozim/templates/cover_article.html rename to src/gutenberg2zim/templates/cover_article.html diff --git a/gutenbergtozim/templates/css/grids-responsive-min.css b/src/gutenberg2zim/templates/css/grids-responsive-min.css similarity index 100% rename from gutenbergtozim/templates/css/grids-responsive-min.css rename to src/gutenberg2zim/templates/css/grids-responsive-min.css diff --git a/gutenbergtozim/templates/css/logo_2.png b/src/gutenberg2zim/templates/css/logo_2.png similarity index 100% rename from gutenbergtozim/templates/css/logo_2.png rename to src/gutenberg2zim/templates/css/logo_2.png diff --git a/gutenbergtozim/templates/css/pure-min.css b/src/gutenberg2zim/templates/css/pure-min.css similarity index 100% rename from gutenbergtozim/templates/css/pure-min.css rename to src/gutenberg2zim/templates/css/pure-min.css diff --git a/gutenbergtozim/templates/css/pure-skin-gutenberg.css b/src/gutenberg2zim/templates/css/pure-skin-gutenberg.css similarity index 100% rename from gutenbergtozim/templates/css/pure-skin-gutenberg.css rename to src/gutenberg2zim/templates/css/pure-skin-gutenberg.css diff --git a/gutenbergtozim/templates/css/spinner.gif b/src/gutenberg2zim/templates/css/spinner.gif similarity index 100% rename from gutenbergtozim/templates/css/spinner.gif rename to src/gutenberg2zim/templates/css/spinner.gif diff --git a/gutenbergtozim/templates/css/style.css b/src/gutenberg2zim/templates/css/style.css similarity index 100% rename from gutenbergtozim/templates/css/style.css rename to src/gutenberg2zim/templates/css/style.css diff --git a/gutenbergtozim/templates/datatables/.gitignore b/src/gutenberg2zim/templates/datatables/.gitignore similarity index 100% rename from gutenbergtozim/templates/datatables/.gitignore rename to src/gutenberg2zim/templates/datatables/.gitignore diff --git a/gutenbergtozim/templates/favicon.ico b/src/gutenberg2zim/templates/favicon.ico similarity index 100% rename from gutenbergtozim/templates/favicon.ico rename to src/gutenberg2zim/templates/favicon.ico diff --git a/gutenbergtozim/templates/favicon.png b/src/gutenberg2zim/templates/favicon.png similarity index 100% rename from gutenbergtozim/templates/favicon.png rename to src/gutenberg2zim/templates/favicon.png diff --git a/gutenbergtozim/templates/fonts/Roboto-Black.ttf b/src/gutenberg2zim/templates/fonts/Roboto-Black.ttf similarity index 100% rename from gutenbergtozim/templates/fonts/Roboto-Black.ttf rename to src/gutenberg2zim/templates/fonts/Roboto-Black.ttf diff --git a/gutenbergtozim/templates/fonts/Roboto-Bold.ttf b/src/gutenberg2zim/templates/fonts/Roboto-Bold.ttf similarity index 100% rename from gutenbergtozim/templates/fonts/Roboto-Bold.ttf rename to src/gutenberg2zim/templates/fonts/Roboto-Bold.ttf diff --git a/gutenbergtozim/templates/fonts/Roboto-Light.ttf b/src/gutenberg2zim/templates/fonts/Roboto-Light.ttf similarity index 100% rename 
from gutenbergtozim/templates/fonts/Roboto-Light.ttf rename to src/gutenberg2zim/templates/fonts/Roboto-Light.ttf diff --git a/gutenbergtozim/templates/fonts/Roboto-Medium.ttf b/src/gutenberg2zim/templates/fonts/Roboto-Medium.ttf similarity index 100% rename from gutenbergtozim/templates/fonts/Roboto-Medium.ttf rename to src/gutenberg2zim/templates/fonts/Roboto-Medium.ttf diff --git a/gutenbergtozim/templates/fonts/Roboto-Regular.ttf b/src/gutenberg2zim/templates/fonts/Roboto-Regular.ttf similarity index 100% rename from gutenbergtozim/templates/fonts/Roboto-Regular.ttf rename to src/gutenberg2zim/templates/fonts/Roboto-Regular.ttf diff --git a/gutenbergtozim/templates/fonts/Roboto-Thin.ttf b/src/gutenberg2zim/templates/fonts/Roboto-Thin.ttf similarity index 100% rename from gutenbergtozim/templates/fonts/Roboto-Thin.ttf rename to src/gutenberg2zim/templates/fonts/Roboto-Thin.ttf diff --git a/gutenbergtozim/templates/fonts/RobotoCondensed-Bold.ttf b/src/gutenberg2zim/templates/fonts/RobotoCondensed-Bold.ttf similarity index 100% rename from gutenbergtozim/templates/fonts/RobotoCondensed-Bold.ttf rename to src/gutenberg2zim/templates/fonts/RobotoCondensed-Bold.ttf diff --git a/gutenbergtozim/templates/fonts/RobotoCondensed-Light.ttf b/src/gutenberg2zim/templates/fonts/RobotoCondensed-Light.ttf similarity index 100% rename from gutenbergtozim/templates/fonts/RobotoCondensed-Light.ttf rename to src/gutenberg2zim/templates/fonts/RobotoCondensed-Light.ttf diff --git a/gutenbergtozim/templates/fonts/RobotoCondensed-Regular.ttf b/src/gutenberg2zim/templates/fonts/RobotoCondensed-Regular.ttf similarity index 100% rename from gutenbergtozim/templates/fonts/RobotoCondensed-Regular.ttf rename to src/gutenberg2zim/templates/fonts/RobotoCondensed-Regular.ttf diff --git a/gutenbergtozim/templates/fonts/font-awesome/css/font-awesome.css b/src/gutenberg2zim/templates/fonts/font-awesome/css/font-awesome.css similarity index 100% rename from gutenbergtozim/templates/fonts/font-awesome/css/font-awesome.css rename to src/gutenberg2zim/templates/fonts/font-awesome/css/font-awesome.css diff --git a/gutenbergtozim/templates/fonts/font-awesome/fonts/FontAwesome.otf b/src/gutenberg2zim/templates/fonts/font-awesome/fonts/FontAwesome.otf similarity index 100% rename from gutenbergtozim/templates/fonts/font-awesome/fonts/FontAwesome.otf rename to src/gutenberg2zim/templates/fonts/font-awesome/fonts/FontAwesome.otf diff --git a/gutenbergtozim/templates/fonts/font-awesome/fonts/fontawesome-webfont.eot b/src/gutenberg2zim/templates/fonts/font-awesome/fonts/fontawesome-webfont.eot similarity index 100% rename from gutenbergtozim/templates/fonts/font-awesome/fonts/fontawesome-webfont.eot rename to src/gutenberg2zim/templates/fonts/font-awesome/fonts/fontawesome-webfont.eot diff --git a/gutenbergtozim/templates/fonts/font-awesome/fonts/fontawesome-webfont.svg b/src/gutenberg2zim/templates/fonts/font-awesome/fonts/fontawesome-webfont.svg similarity index 100% rename from gutenbergtozim/templates/fonts/font-awesome/fonts/fontawesome-webfont.svg rename to src/gutenberg2zim/templates/fonts/font-awesome/fonts/fontawesome-webfont.svg diff --git a/gutenbergtozim/templates/fonts/font-awesome/fonts/fontawesome-webfont.ttf b/src/gutenberg2zim/templates/fonts/font-awesome/fonts/fontawesome-webfont.ttf similarity index 100% rename from gutenbergtozim/templates/fonts/font-awesome/fonts/fontawesome-webfont.ttf rename to src/gutenberg2zim/templates/fonts/font-awesome/fonts/fontawesome-webfont.ttf diff --git 
a/gutenbergtozim/templates/fonts/font-awesome/fonts/fontawesome-webfont.woff b/src/gutenberg2zim/templates/fonts/font-awesome/fonts/fontawesome-webfont.woff similarity index 100% rename from gutenbergtozim/templates/fonts/font-awesome/fonts/fontawesome-webfont.woff rename to src/gutenberg2zim/templates/fonts/font-awesome/fonts/fontawesome-webfont.woff diff --git a/gutenbergtozim/templates/jquery-ui/images/animated-overlay.gif b/src/gutenberg2zim/templates/jquery-ui/images/animated-overlay.gif similarity index 100% rename from gutenbergtozim/templates/jquery-ui/images/animated-overlay.gif rename to src/gutenberg2zim/templates/jquery-ui/images/animated-overlay.gif diff --git a/gutenbergtozim/templates/jquery-ui/images/ui-bg_flat_75_aaaaaa_40x100.png b/src/gutenberg2zim/templates/jquery-ui/images/ui-bg_flat_75_aaaaaa_40x100.png similarity index 100% rename from gutenbergtozim/templates/jquery-ui/images/ui-bg_flat_75_aaaaaa_40x100.png rename to src/gutenberg2zim/templates/jquery-ui/images/ui-bg_flat_75_aaaaaa_40x100.png diff --git a/gutenbergtozim/templates/jquery-ui/images/ui-bg_glass_100_f5f0e5_1x400.png b/src/gutenberg2zim/templates/jquery-ui/images/ui-bg_glass_100_f5f0e5_1x400.png similarity index 100% rename from gutenbergtozim/templates/jquery-ui/images/ui-bg_glass_100_f5f0e5_1x400.png rename to src/gutenberg2zim/templates/jquery-ui/images/ui-bg_glass_100_f5f0e5_1x400.png diff --git a/gutenbergtozim/templates/jquery-ui/images/ui-bg_glass_25_cb842e_1x400.png b/src/gutenberg2zim/templates/jquery-ui/images/ui-bg_glass_25_cb842e_1x400.png similarity index 100% rename from gutenbergtozim/templates/jquery-ui/images/ui-bg_glass_25_cb842e_1x400.png rename to src/gutenberg2zim/templates/jquery-ui/images/ui-bg_glass_25_cb842e_1x400.png diff --git a/gutenbergtozim/templates/jquery-ui/images/ui-bg_glass_70_ede4d4_1x400.png b/src/gutenberg2zim/templates/jquery-ui/images/ui-bg_glass_70_ede4d4_1x400.png similarity index 100% rename from gutenbergtozim/templates/jquery-ui/images/ui-bg_glass_70_ede4d4_1x400.png rename to src/gutenberg2zim/templates/jquery-ui/images/ui-bg_glass_70_ede4d4_1x400.png diff --git a/gutenbergtozim/templates/jquery-ui/images/ui-bg_highlight-hard_100_f4f0ec_1x100.png b/src/gutenberg2zim/templates/jquery-ui/images/ui-bg_highlight-hard_100_f4f0ec_1x100.png similarity index 100% rename from gutenbergtozim/templates/jquery-ui/images/ui-bg_highlight-hard_100_f4f0ec_1x100.png rename to src/gutenberg2zim/templates/jquery-ui/images/ui-bg_highlight-hard_100_f4f0ec_1x100.png diff --git a/gutenbergtozim/templates/jquery-ui/images/ui-bg_highlight-hard_65_fee4bd_1x100.png b/src/gutenberg2zim/templates/jquery-ui/images/ui-bg_highlight-hard_65_fee4bd_1x100.png similarity index 100% rename from gutenbergtozim/templates/jquery-ui/images/ui-bg_highlight-hard_65_fee4bd_1x100.png rename to src/gutenberg2zim/templates/jquery-ui/images/ui-bg_highlight-hard_65_fee4bd_1x100.png diff --git a/gutenbergtozim/templates/jquery-ui/images/ui-bg_highlight-hard_75_f5f5b5_1x100.png b/src/gutenberg2zim/templates/jquery-ui/images/ui-bg_highlight-hard_75_f5f5b5_1x100.png similarity index 100% rename from gutenbergtozim/templates/jquery-ui/images/ui-bg_highlight-hard_75_f5f5b5_1x100.png rename to src/gutenberg2zim/templates/jquery-ui/images/ui-bg_highlight-hard_75_f5f5b5_1x100.png diff --git a/gutenbergtozim/templates/jquery-ui/images/ui-bg_inset-soft_100_f4f0ec_1x100.png b/src/gutenberg2zim/templates/jquery-ui/images/ui-bg_inset-soft_100_f4f0ec_1x100.png similarity index 100% rename from 
gutenbergtozim/templates/jquery-ui/images/ui-bg_inset-soft_100_f4f0ec_1x100.png rename to src/gutenberg2zim/templates/jquery-ui/images/ui-bg_inset-soft_100_f4f0ec_1x100.png diff --git a/gutenbergtozim/templates/jquery-ui/images/ui-icons_c47a23_256x240.png b/src/gutenberg2zim/templates/jquery-ui/images/ui-icons_c47a23_256x240.png similarity index 100% rename from gutenbergtozim/templates/jquery-ui/images/ui-icons_c47a23_256x240.png rename to src/gutenberg2zim/templates/jquery-ui/images/ui-icons_c47a23_256x240.png diff --git a/gutenbergtozim/templates/jquery-ui/images/ui-icons_cb672b_256x240.png b/src/gutenberg2zim/templates/jquery-ui/images/ui-icons_cb672b_256x240.png similarity index 100% rename from gutenbergtozim/templates/jquery-ui/images/ui-icons_cb672b_256x240.png rename to src/gutenberg2zim/templates/jquery-ui/images/ui-icons_cb672b_256x240.png diff --git a/gutenbergtozim/templates/jquery-ui/images/ui-icons_f08000_256x240.png b/src/gutenberg2zim/templates/jquery-ui/images/ui-icons_f08000_256x240.png similarity index 100% rename from gutenbergtozim/templates/jquery-ui/images/ui-icons_f08000_256x240.png rename to src/gutenberg2zim/templates/jquery-ui/images/ui-icons_f08000_256x240.png diff --git a/gutenbergtozim/templates/jquery-ui/images/ui-icons_f35f07_256x240.png b/src/gutenberg2zim/templates/jquery-ui/images/ui-icons_f35f07_256x240.png similarity index 100% rename from gutenbergtozim/templates/jquery-ui/images/ui-icons_f35f07_256x240.png rename to src/gutenberg2zim/templates/jquery-ui/images/ui-icons_f35f07_256x240.png diff --git a/gutenbergtozim/templates/jquery-ui/images/ui-icons_ff7519_256x240.png b/src/gutenberg2zim/templates/jquery-ui/images/ui-icons_ff7519_256x240.png similarity index 100% rename from gutenbergtozim/templates/jquery-ui/images/ui-icons_ff7519_256x240.png rename to src/gutenberg2zim/templates/jquery-ui/images/ui-icons_ff7519_256x240.png diff --git a/gutenbergtozim/templates/jquery-ui/images/ui-icons_ffffff_256x240.png b/src/gutenberg2zim/templates/jquery-ui/images/ui-icons_ffffff_256x240.png similarity index 100% rename from gutenbergtozim/templates/jquery-ui/images/ui-icons_ffffff_256x240.png rename to src/gutenberg2zim/templates/jquery-ui/images/ui-icons_ffffff_256x240.png diff --git a/gutenbergtozim/templates/jquery-ui/jquery-ui.css b/src/gutenberg2zim/templates/jquery-ui/jquery-ui.css similarity index 100% rename from gutenbergtozim/templates/jquery-ui/jquery-ui.css rename to src/gutenberg2zim/templates/jquery-ui/jquery-ui.css diff --git a/gutenbergtozim/templates/jquery-ui/jquery-ui.js b/src/gutenberg2zim/templates/jquery-ui/jquery-ui.js similarity index 100% rename from gutenbergtozim/templates/jquery-ui/jquery-ui.js rename to src/gutenberg2zim/templates/jquery-ui/jquery-ui.js diff --git a/gutenbergtozim/templates/jquery-ui/jquery-ui.min.css b/src/gutenberg2zim/templates/jquery-ui/jquery-ui.min.css similarity index 100% rename from gutenbergtozim/templates/jquery-ui/jquery-ui.min.css rename to src/gutenberg2zim/templates/jquery-ui/jquery-ui.min.css diff --git a/gutenbergtozim/templates/jquery-ui/jquery-ui.min.js b/src/gutenberg2zim/templates/jquery-ui/jquery-ui.min.js similarity index 100% rename from gutenbergtozim/templates/jquery-ui/jquery-ui.min.js rename to src/gutenberg2zim/templates/jquery-ui/jquery-ui.min.js diff --git a/gutenbergtozim/templates/jquery-ui/jquery-ui.structure.css b/src/gutenberg2zim/templates/jquery-ui/jquery-ui.structure.css similarity index 100% rename from gutenbergtozim/templates/jquery-ui/jquery-ui.structure.css rename to 
src/gutenberg2zim/templates/jquery-ui/jquery-ui.structure.css diff --git a/gutenbergtozim/templates/jquery-ui/jquery-ui.structure.min.css b/src/gutenberg2zim/templates/jquery-ui/jquery-ui.structure.min.css similarity index 100% rename from gutenbergtozim/templates/jquery-ui/jquery-ui.structure.min.css rename to src/gutenberg2zim/templates/jquery-ui/jquery-ui.structure.min.css diff --git a/gutenbergtozim/templates/jquery-ui/jquery-ui.theme.css b/src/gutenberg2zim/templates/jquery-ui/jquery-ui.theme.css similarity index 100% rename from gutenbergtozim/templates/jquery-ui/jquery-ui.theme.css rename to src/gutenberg2zim/templates/jquery-ui/jquery-ui.theme.css diff --git a/gutenbergtozim/templates/jquery-ui/jquery-ui.theme.min.css b/src/gutenberg2zim/templates/jquery-ui/jquery-ui.theme.min.css similarity index 100% rename from gutenbergtozim/templates/jquery-ui/jquery-ui.theme.min.css rename to src/gutenberg2zim/templates/jquery-ui/jquery-ui.theme.min.css diff --git a/gutenbergtozim/templates/jquery/jquery-1.11.1.min.js b/src/gutenberg2zim/templates/jquery/jquery-1.11.1.min.js similarity index 100% rename from gutenbergtozim/templates/jquery/jquery-1.11.1.min.js rename to src/gutenberg2zim/templates/jquery/jquery-1.11.1.min.js diff --git a/gutenbergtozim/templates/jquery/jquery.cookie.js b/src/gutenberg2zim/templates/jquery/jquery.cookie.js similarity index 100% rename from gutenbergtozim/templates/jquery/jquery.cookie.js rename to src/gutenberg2zim/templates/jquery/jquery.cookie.js diff --git a/gutenbergtozim/templates/jquery/jquery.persist.js b/src/gutenberg2zim/templates/jquery/jquery.persist.js similarity index 100% rename from gutenbergtozim/templates/jquery/jquery.persist.js rename to src/gutenberg2zim/templates/jquery/jquery.persist.js diff --git a/gutenbergtozim/templates/js/l10n.js b/src/gutenberg2zim/templates/js/l10n.js similarity index 100% rename from gutenbergtozim/templates/js/l10n.js rename to src/gutenberg2zim/templates/js/l10n.js diff --git a/gutenbergtozim/templates/js/tools.js b/src/gutenberg2zim/templates/js/tools.js similarity index 100% rename from gutenbergtozim/templates/js/tools.js rename to src/gutenberg2zim/templates/js/tools.js diff --git a/gutenbergtozim/urls.py b/src/gutenberg2zim/urls.py similarity index 87% rename from gutenbergtozim/urls.py rename to src/gutenberg2zim/urls.py index dc4fdeb..deada12 100644 --- a/gutenbergtozim/urls.py +++ b/src/gutenberg2zim/urls.py @@ -1,20 +1,10 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# vim: ai ts=4 sts=4 et sw=4 nu - import os +import urllib.parse as urlparse from collections import defaultdict -from path import Path as path - -from gutenbergtozim import logger -from gutenbergtozim.database import Book, BookFormat, Url -from gutenbergtozim.utils import FORMAT_MATRIX, exec_cmd - -try: - import urlparse -except ImportError: - import urllib.parse as urlparse +from gutenberg2zim.constants import TMP_FOLDER_PATH, logger +from gutenberg2zim.database import Book, BookFormat, Url +from gutenberg2zim.utils import FORMAT_MATRIX, exec_cmd class UrlBuilder: @@ -50,7 +40,7 @@ def build(self): """ if self.base == self.BASE_ONE: - if int(self.b_id) > 10: + if int(self.b_id) > 10: # noqa: PLR2004 base_url = os.path.join( os.path.join(*list(str(self.b_id))[:-1]), str(self.b_id) ) @@ -61,7 +51,7 @@ def build(self): url = os.path.join(self.base, str(self.b_id)) elif self.base == self.BASE_THREE: url = self.base - return url + return url # type: ignore def with_base(self, base): self.base = base @@ -69,8 +59,8 @@ def with_base(self, 
base): def with_id(self, b_id): self.b_id = b_id - def __unicode__(self): - return self.build_url() + def __str__(self): + return self.build_url() # type: ignore def get_urls(book): @@ -227,7 +217,7 @@ def build_html(files): etext_nums = [] etext_nums.extend(range(90, 100)) etext_nums.extend(range(0, 6)) - etext_names = ["{0:0=2d}".format(i) for i in etext_nums] + etext_names = [f"{i:0=2d}" for i in etext_nums] etext_urls = [] for i in etext_names: etext_urls.append(os.path.join(u.build() + i, file_name)) @@ -237,11 +227,10 @@ def build_html(files): return list(set(urls)) -def setup_urls(force=False): - - file_with_url = os.path.join("tmp", "file_on_{}".format(UrlBuilder.SERVER_NAME)) +def setup_urls(force): + file_with_url = TMP_FOLDER_PATH.joinpath(f"file_on_{UrlBuilder.SERVER_NAME}") - if path(file_with_url).exists() and not force: + if file_with_url.exists() and not force: logger.info( "\tUrls rsync result {} already exists, processing existing file".format( file_with_url @@ -251,20 +240,20 @@ def setup_urls(force=False): cmd = [ "bash", "-c", - "rsync -a --list-only {} > {}".format(UrlBuilder.RSYNC, file_with_url), + f"rsync -a --list-only {UrlBuilder.RSYNC} > {file_with_url}", ] exec_cmd(cmd) logger.info("\tLooking after relative path start in urls rsync result") # search for "GUTINDEX*" file, so that we known where starts the relative # path in rsync output - with open(file_with_url, "r", errors="replace") as src: + with open(file_with_url, errors="replace") as src: for line in src.readlines(): start_rel_path_idx = line.find("GUTINDEX") if start_rel_path_idx >= 0: break - if start_rel_path_idx == -1: + if start_rel_path_idx == -1: # type: ignore raise ValueError("Unable to find relative path start in urls file") logger.info("\tRemoving all urls already present in DB") @@ -273,11 +262,11 @@ def setup_urls(force=False): logger.info("\tAppending urls in DB from rsync result") # strip rsync file to only contain relative path - with open(file_with_url, "r", errors="replace") as src: + with open(file_with_url, errors="replace") as src: for line in src.readlines(): - Url.create(url=line[start_rel_path_idx:].strip()) + Url.create(url=line[start_rel_path_idx:].strip()) # type: ignore if __name__ == "__main__": book = Book.get(id=9) - print(get_urls(book)) + print(get_urls(book)) # noqa: T201 diff --git a/gutenbergtozim/utils.py b/src/gutenberg2zim/utils.py similarity index 76% rename from gutenbergtozim/utils.py rename to src/gutenberg2zim/utils.py index 4d40099..21bbabe 100644 --- a/gutenbergtozim/utils.py +++ b/src/gutenberg2zim/utils.py @@ -1,7 +1,3 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# vim: ai ts=4 sts=4 et sw=4 nu - import collections import hashlib import os @@ -13,12 +9,12 @@ import chardet import requests import six -from path import Path as path +from path import Path from zimscraperlib.download import save_large_file -from gutenbergtozim import logger -from gutenbergtozim.database import Book, BookFormat -from gutenbergtozim.iso639 import language_name +from gutenberg2zim.constants import logger +from gutenberg2zim.database import Book, BookFormat +from gutenberg2zim.iso639 import language_name UTF8 = "utf-8" FORMAT_MATRIX = collections.OrderedDict( @@ -42,10 +38,10 @@ def book_name_for_fs(book): return book.title.strip().replace("/", "-")[:230] -def article_name_for(book, cover=False): +def article_name_for(book, *, cover=False): cover = "_cover" if cover else "" title = book_name_for_fs(book) - return "{title}{cover}.{id}.html".format(title=title, cover=cover, 
id=book.id) + return f"{title}{cover}.{book.id}.html" def archive_name_for(book, book_format): @@ -58,7 +54,9 @@ def fname_for(book, book_format): def get_etag_from_url(url): try: - response_headers = requests.head(url=url, allow_redirects=True).headers + response_headers = requests.head( # noqa: S113 + url=url, allow_redirects=True + ).headers except Exception as e: logger.error(url + " > Problem while head request\n" + str(e) + "\n") return None @@ -67,7 +65,7 @@ def get_etag_from_url(url): def critical_error(message): - logger.critical("ERROR: {}".format(message)) + logger.critical(f"ERROR: {message}") sys.exit(1) @@ -75,8 +73,7 @@ def normalize(text=None): return None if text is None else unicodedata.normalize("NFC", text) -def get_project_id(languages=[], formats=[], only_books=[]): - +def get_project_id(languages, formats, only_books): parts = ["gutenberg"] parts.append("mul" if len(languages) > 1 else languages[0]) if len(formats) < len(FORMAT_MATRIX): @@ -86,15 +83,12 @@ def get_project_id(languages=[], formats=[], only_books=[]): def exec_cmd(cmd): - if isinstance(cmd, (tuple, list)): + if isinstance(cmd, tuple | list): args = cmd else: args = cmd.split(" ") logger.debug(" ".join(args)) - if six.PY3: - return subprocess.run(args).returncode - else: - return subprocess.call(args) + return subprocess.run(args).returncode def download_file(url, fpath): @@ -120,7 +114,7 @@ def main_formats_for(book): return [k for k, v in FORMAT_MATRIX.items() if v in fmts] -def get_list_of_filtered_books(languages, formats, only_books=[]): +def get_list_of_filtered_books(languages, formats, only_books): if len(formats): qs = ( Book.select() @@ -166,14 +160,14 @@ def get_lang_groups(books): def md5sum(fpath): - return hashlib.md5(read_file(fpath)[0].encode("utf-8")).hexdigest() + return hashlib.md5(read_file(fpath)[0].encode("utf-8")).hexdigest() # noqa: S324 def is_bad_cover(fpath): bad_sizes = [19263] bad_sums = ["a059007e7a2e86f2bf92e4070b3e5c73"] - if path(fpath).size not in bad_sizes: + if Path(fpath).size not in bad_sizes: return False return md5sum(fpath) in bad_sums @@ -181,12 +175,8 @@ def is_bad_cover(fpath): def read_file_as(fpath, encoding="utf-8"): # logger.debug("opening `{}` as `{}`".format(fpath, encoding)) - if six.PY2: - with open(fpath, "r") as f: - return f.read().decode(encoding) - else: - with open(fpath, "r", encoding=encoding) as f: - return f.read() + with open(fpath, encoding=encoding) as f: + return f.read() def guess_file_encoding(fpath): @@ -203,16 +193,12 @@ def read_file(fpath): # common encoding failed. 
try with chardet encoding = guess_file_encoding(fpath) - return read_file_as(fpath, encoding), encoding + return read_file_as(fpath, encoding), encoding # type: ignore def save_file(content, fpath, encoding=UTF8): - if six.PY2: - with open(fpath, "w") as f: - f.write(content.encode(encoding)) - else: - with open(fpath, "w", encoding=encoding) as f: - f.write(content) + with open(fpath, "w", encoding=encoding) as f: + f.write(content) def zip_epub(epub_fpath, root_folder, fpaths): @@ -222,6 +208,4 @@ def ensure_unicode(v): - if six.PY2 and isinstance(v, str): - v = v.decode("utf8") return six.text_type(v) diff --git a/gutenbergtozim/zim.py b/src/gutenberg2zim/zim.py similarity index 71% rename from gutenbergtozim/zim.py rename to src/gutenberg2zim/zim.py index b44cdba..1e163e7 100644 --- a/gutenbergtozim/zim.py +++ b/src/gutenberg2zim/zim.py @@ -1,37 +1,33 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# vim: ai ts=4 sts=4 et sw=4 nu - import datetime -from path import Path as path +from path import Path from peewee import fn -from gutenbergtozim import logger -from gutenbergtozim.database import Book -from gutenbergtozim.export import export_all_books -from gutenbergtozim.iso639 import ISO_MATRIX -from gutenbergtozim.l10n import metadata_translations -from gutenbergtozim.shared import Global -from gutenbergtozim.utils import get_project_id +from gutenberg2zim.constants import logger +from gutenberg2zim.database import Book +from gutenberg2zim.export import export_all_books +from gutenberg2zim.iso639 import ISO_MATRIX +from gutenberg2zim.l10n import metadata_translations +from gutenberg2zim.shared import Global +from gutenberg2zim.utils import get_project_id def build_zimfile( output_folder, - download_cache=None, - concurrency=None, - languages=[], - formats=[], - only_books=[], - force=False, - title_search=False, - add_bookshelves=False, - s3_storage=None, - optimizer_version=None, - zim_name=None, - title=None, - description=None, - stats_filename=None, + download_cache, + concurrency, + languages, + formats, + only_books, + force, + title_search, + add_bookshelves, + s3_storage, + optimizer_version, + zim_name, + title, + description, + stats_filename, ): # actual list of languages with books sorted by most used nb = fn.COUNT(Book.language).alias("nb") @@ -57,22 +53,22 @@ def build_zimfile( description = description or metadata_translations.get(iso_languages[0], {}).get( "description", "The first producer of Free Ebooks" ) - logger.info("\tWritting ZIM for {}".format(title)) + logger.info(f"\tWriting ZIM for {title}") project_id = get_project_id(languages, formats, only_books) if zim_name is None: zim_name = "{}_{}.zim".format( - project_id, datetime.datetime.now().strftime("%Y-%m") + project_id, datetime.datetime.now().strftime("%Y-%m") # noqa: DTZ005 ) zim_path = output_folder.joinpath(zim_name) - if path(zim_name).exists() and not force: - logger.info("ZIM file `{}` already exist.".format(zim_name)) + if Path(zim_name).exists() and not force: + logger.info(f"ZIM file `{zim_name}` already exists.") return - elif path(zim_name).exists(): + elif Path(zim_name).exists(): logger.info(f"Removing existing ZIM file {zim_name}") - path(zim_name).unlink() + Path(zim_name).unlink() Global.setup( filename=zim_path, diff --git a/tasks.py b/tasks.py new file mode 100644 index 0000000..90854e8 --- /dev/null +++ b/tasks.py @@ -0,0 +1,109 @@ +# pyright: strict, reportUntypedFunctionDecorator=false +import os + +from invoke.context import Context 
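Aside on the build_zimfile() hunk above: every keyword default is gone, including the mutable [] ones, so callers now pass each argument explicitly. An illustrative call with hypothetical values:

import pathlib

build_zimfile(
    output_folder=pathlib.Path("output"),
    download_cache=pathlib.Path("dl-cache"),
    concurrency=4,
    languages=["en"],
    formats=["epub", "pdf"],
    only_books=[],
    force=False,
    title_search=False,
    add_bookshelves=False,
    s3_storage=None,
    optimizer_version=None,
    zim_name=None,  # derived from project_id and the current month when None
    title=None,  # falls back to the translated metadata title
    description=None,
    stats_filename=None,
)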
+from invoke.tasks import task  # pyright: ignore [reportUnknownVariableType]
+
+use_pty = not os.getenv("CI", "")
+
+
+@task(optional=["args"], help={"args": "pytest additional arguments"})
+def test(ctx: Context, args: str = ""):
+    """run tests (without coverage)"""
+    ctx.run(f"pytest {args}", pty=use_pty)
+
+
+@task(optional=["args"], help={"args": "pytest additional arguments"})
+def test_cov(ctx: Context, args: str = ""):
+    """run tests with coverage"""
+    ctx.run(f"coverage run -m pytest {args}", pty=use_pty)
+
+
+@task(optional=["html"], help={"html": "flag to export html report"})
+def report_cov(ctx: Context, *, html: bool = False):
+    """report coverage"""
+    ctx.run("coverage combine", warn=True, pty=use_pty)
+    ctx.run("coverage report --show-missing", pty=use_pty)
+    if html:
+        ctx.run("coverage html", pty=use_pty)
+
+
+@task(
+    optional=["args", "html"],
+    help={
+        "args": "pytest additional arguments",
+        "html": "flag to export html report",
+    },
+)
+def coverage(ctx: Context, args: str = "", *, html: bool = False):
+    """run tests and report coverage"""
+    test_cov(ctx, args=args)
+    report_cov(ctx, html=html)
+
+
+@task(optional=["args"], help={"args": "black additional arguments"})
+def lint_black(ctx: Context, args: str = "."):
+    args = args or "."  # needed for hatch script
+    ctx.run("black --version", pty=use_pty)
+    ctx.run(f"black --check --diff {args}", pty=use_pty)
+
+
+@task(optional=["args"], help={"args": "ruff additional arguments"})
+def lint_ruff(ctx: Context, args: str = "."):
+    args = args or "."  # needed for hatch script
+    ctx.run("ruff --version", pty=use_pty)
+    ctx.run(f"ruff check {args}", pty=use_pty)
+
+
+@task(
+    optional=["args"],
+    help={
+        "args": "linting tools (black, ruff) additional arguments, typically a path",
+    },
+)
+def lintall(ctx: Context, args: str = "."):
+    """Check linting"""
+    args = args or "."  # needed for hatch script
+    lint_black(ctx, args)
+    lint_ruff(ctx, args)
+
+
+@task(optional=["args"], help={"args": "check tools (pyright) additional arguments"})
+def check_pyright(ctx: Context, args: str = ""):
+    """check static types with pyright"""
+    ctx.run("pyright --version")
+    ctx.run(f"pyright {args}", pty=use_pty)
+
+
+@task(optional=["args"], help={"args": "check tools (pyright) additional arguments"})
+def checkall(ctx: Context, args: str = ""):
+    """check static types"""
+    check_pyright(ctx, args)
+
+
+@task(optional=["args"], help={"args": "black additional arguments"})
+def fix_black(ctx: Context, args: str = "."):
+    """fix black formatting"""
+    args = args or "."  # needed for hatch script
+    ctx.run(f"black {args}", pty=use_pty)
+
+
+@task(optional=["args"], help={"args": "ruff additional arguments"})
+def fix_ruff(ctx: Context, args: str = "."):
+    """fix all ruff rules"""
+    args = args or "."  # needed for hatch script
+    ctx.run(f"ruff check --fix {args}", pty=use_pty)
+
+
+@task(
+    optional=["args"],
+    help={
+        "args": "linting tools (black, ruff) additional arguments, typically a path",
+    },
+)
+def fixall(ctx: Context, args: str = "."):
+    """Fix everything automatically"""
+    args = args or "."  # needed for hatch script
+    fix_black(ctx, args)
+    fix_ruff(ctx, args)
+    lintall(ctx, args)
diff --git a/tests/test_dummy.py b/tests/test_dummy.py
new file mode 100644
index 0000000..2efef7f
--- /dev/null
+++ b/tests/test_dummy.py
@@ -0,0 +1,7 @@
+from gutenberg2zim.constants import TMP_FOLDER
+
+
+# dummy test just to ensure that everything is ready to add real ones
+# to be removed once a real test is added
+def test_dummy():
+    assert TMP_FOLDER == "tmp"
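diff --git a/tests/test_utils.py b/tests/test_utils.py
new file mode 100644
--- /dev/null
+++ b/tests/test_utils.py
@@ -0,0 +1,14 @@
+# Editor's sketch, not part of the original changeset: one possible first
+# "real" test to replace test_dummy, exercising the pure normalize() helper
+# from gutenberg2zim.utils shown above. File name and content are hypothetical.
+from gutenberg2zim.utils import normalize
+
+
+def test_normalize_none():
+    # normalize() passes None through unchanged
+    assert normalize(None) is None
+
+
+def test_normalize_nfc():
+    # "e" + combining acute accent (NFD) normalizes to the single NFC "é"
+    assert normalize("e\u0301") == "\u00e9"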