diff --git a/.github/workflows/Publish.yaml b/.github/workflows/Publish.yaml index b7e1859b..507a6314 100644 --- a/.github/workflows/Publish.yaml +++ b/.github/workflows/Publish.yaml @@ -1,20 +1,101 @@ -name: Build and upload to PyPI +name: Build and publish to PyPI on: release: types: [published] jobs: - publish: - runs-on: ubuntu-22.04 + generate-rules: + runs-on: ubuntu-24.04 + + steps: + - name: Checkout repo + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version-file: pyproject.toml + architecture: x64 + + - name: Install dependencies (and project) + run: | + pip install -U pip + pip install -e .[scripts] + + - name: Generate rules + run: | + python rules/generate_rules.py + + - name: Save rules artifact + uses: actions/upload-artifact@v4 + with: + path: | + src/zimscraperlib/rewriting/rules.py + tests/rewriting/test_fuzzy_rules.py + javascript/src/fuzzyRules.js + javascript/test/fuzzyRules.js + name: rules + retention-days: 1 + + build-js: + runs-on: ubuntu-24.04 + needs: generate-rules + + steps: + - name: Checkout repo + uses: actions/checkout@v4 + + - name: Restore rules artifact + uses: actions/download-artifact@v4 + with: + name: rules + + - name: Setup Node.JS + uses: actions/setup-node@v4 + with: + node-version-file: 'javascript/package.json' + + - name: Install JS dependencies + run: yarn install + working-directory: javascript + + - name: Build production JS + run: yarn build-prod + working-directory: javascript + + - name: Save wombat-setup artifact + uses: actions/upload-artifact@v4 + with: + path: javascript/dist/wombatSetup.js + name: wombat-setup + retention-days: 1 + + publish-python: + runs-on: ubuntu-24.04 + needs: + - generate-rules # to have proper Python rules files (src and tests) + - build-js # to have proper wombatSetup.js (needs to be included in sdist) permissions: - id-token: write # mandatory for PyPI trusted publishing + id-token: write # mandatory for PyPI trusted publishing 
steps: - - uses: actions/checkout@v3 + - name: Checkout repo + uses: actions/checkout@v4 + + - name: Restore rules artifact + uses: actions/download-artifact@v4 + with: + name: rules + + - name: Restore wombat-setup artifact + uses: actions/download-artifact@v4 + with: + name: wombat-setup + path: src/zimscraperlib/rewriting/statics # destination directory: wombatSetup.js is extracted into it - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version-file: pyproject.toml architecture: x64 @@ -24,5 +105,44 @@ jobs: pip install -U pip build python -m build --sdist --wheel - - name: Upload to PyPI + - name: Publish to PyPI uses: pypa/gh-action-pypi-publish@release/v1.8 +# OPTIONAL PUBLICATION TO NPM, NOT NEEDED BY SCRAPERS IN THE END + +# publish-js: +# runs-on: ubuntu-24.04 +# needs: +# - generate-rules + +# steps: +# - name: Checkout repo +# uses: actions/checkout@v4 + +# - name: Restore rules artifact +# uses: actions/download-artifact@v4 +# with: +# name: rules + +# - name: Setup Node.JS +# uses: actions/setup-node@v4 +# with: +# node-version-file: 'javascript/package.json' +# registry-url: 'https://registry.npmjs.org' # Setup .npmrc file to publish to npm + +# - name: Install JS dependencies +# run: yarn install +# working-directory: javascript + +# - name: Build production JS +# run: yarn build-prod +# working-directory: javascript + +# - name: Build JS package +# run: yarn pack +# working-directory: javascript + +# - name: Publish to NPM +# run: npm publish $(ls *.tgz) --provenance --access public +# working-directory: javascript +# env: +# NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} diff --git a/.github/workflows/PublishDev.yaml b/.github/workflows/PublishDev.yaml new file mode 100644 index 00000000..8d3ef01b --- /dev/null +++ b/.github/workflows/PublishDev.yaml @@ -0,0 +1,47 @@ +name: Publish dev wombat-setup + +on: + push: + branches: + - main + +jobs: + publish-dev-wombat-setup: + runs-on: 
ubuntu-24.04 + + steps: + - name: Checkout repo + uses: 
actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version-file: pyproject.toml + architecture: x64 + + - name: Install dependencies (and project) + run: | + pip install -U pip + pip install -e .[scripts] + + - name: Generate rules + run: | + python rules/generate_rules.py + + - name: Setup Node.JS + uses: actions/setup-node@v4 + with: + node-version-file: 'javascript/package.json' + registry-url: 'https://registry.npmjs.org' + + - name: Install JS dependencies + run: yarn install + working-directory: javascript + + - name: Build production JS + run: yarn build-prod + working-directory: javascript + + - name: Upload wombatSetup.js to dev drive + run: | + curl -f -u "${{ secrets.DEV_DRIVE_WEBDAV_CREDENTIALS }}" -T javascript/dist/wombatSetup.js -sw '%{http_code}' "https://dev.kiwix.org/zimscraperlib/" diff --git a/.github/workflows/QA.yaml b/.github/workflows/QA.yaml index 48ccee5a..31064c2f 100644 --- a/.github/workflows/QA.yaml +++ b/.github/workflows/QA.yaml @@ -7,14 +7,54 @@ on: - main jobs: - check-qa: - runs-on: ubuntu-22.04 + generate-rules: + runs-on: ubuntu-24.04 steps: - - uses: actions/checkout@v3 + - name: Checkout repo + uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 + with: + python-version-file: pyproject.toml + architecture: x64 + + - name: Install dependencies (and project) + run: | + pip install -U pip + pip install -e .[scripts] + + - name: Generate rules + run: | + python rules/generate_rules.py + + - name: Save rules artifact + uses: actions/upload-artifact@v4 + with: + path: | + src/zimscraperlib/rewriting/rules.py + tests/rewriting/test_fuzzy_rules.py + javascript/src/fuzzyRules.js + javascript/test/fuzzyRules.js + name: rules + retention-days: 1 + + check-python-qa: + runs-on: ubuntu-24.04 + needs: generate-rules + + steps: + - name: Checkout repo + uses: actions/checkout@v4 + + - name: Restore rules artifact + uses: 
actions/download-artifact@v4 + with: + name: rules + + - name: Set up Python + uses: actions/setup-python@v5 with: python-version-file: pyproject.toml architecture: x64 @@ -32,3 +72,33 @@ jobs: - name: Check pyright run: inv check-pyright + + check-javascript-qa: + runs-on: ubuntu-24.04 + needs: generate-rules + + steps: + - name: Checkout repo + uses: actions/checkout@v4 + + - name: Restore rules artifact + uses: actions/download-artifact@v4 + with: + name: rules + + - name: Setup Node.JS + uses: actions/setup-node@v4 + with: + node-version-file: 'javascript/package.json' + + - name: Install JS dependencies + working-directory: javascript + run: yarn install + + - name: Check prettier formatting + working-directory: javascript + run: yarn prettier-check + + - name: Check eslint rules + working-directory: javascript + run: yarn eslint diff --git a/.github/workflows/Tests.yaml b/.github/workflows/Tests.yaml index 0fd2de44..66e647fc 100644 --- a/.github/workflows/Tests.yaml +++ b/.github/workflows/Tests.yaml @@ -7,23 +7,59 @@ on: - main jobs: - run-tests: - strategy: - matrix: - os: [ubuntu-22.04] - python: ["3.8", "3.9", "3.10", "3.11", "3.12"] - runs-on: ${{ matrix.os }} + generate-rules: + runs-on: ubuntu-24.04 steps: - - uses: actions/checkout@v3 + - name: Checkout repo + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version-file: pyproject.toml + architecture: x64 + + - name: Install dependencies (and project) + run: | + pip install -U pip + pip install -e .[scripts] + + - name: Generate rules + run: | + python rules/generate_rules.py + + - name: Save rules artifact + uses: actions/upload-artifact@v4 + with: + path: | + src/zimscraperlib/rewriting/rules.py + tests/rewriting/test_fuzzy_rules.py + javascript/src/fuzzyRules.js + javascript/test/fuzzyRules.js + name: rules + retention-days: 1 + + run-python-tests: + runs-on: ubuntu-24.04 + needs: generate-rules + + steps: + - name: Checkout repo + uses: 
actions/checkout@v4 + + - name: Restore rules artifact + uses: actions/download-artifact@v4 + with: + name: rules - name: install ffmpeg and gifsicle run: sudo apt update && sudo apt install ffmpeg gifsicle - - name: Set up Python ${{ matrix.python }} - uses: actions/setup-python@v4 + - name: Set up Python 3.12 + uses: actions/setup-python@v5 with: - python-version: ${{ matrix.python }} + python-version-file: pyproject.toml architecture: x64 - name: Install dependencies (and project) @@ -35,24 +71,50 @@ jobs: run: inv coverage --args "--runslow --runinstalled -vvv" - name: Upload coverage report to codecov - if: matrix.python == '3.12' - uses: codecov/codecov-action@v3 + uses: codecov/codecov-action@v4 with: fail_ci_if_error: true token: ${{ secrets.CODECOV_TOKEN }} - build_python: - runs-on: ubuntu-22.04 - steps: - - uses: actions/checkout@v3 - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version-file: pyproject.toml - architecture: x64 - - name: Ensure we can build Python targets run: | pip install -U pip build python3 -m build --sdist --wheel + + run-js-tests: + runs-on: ubuntu-24.04 + needs: generate-rules + + steps: + - name: Checkout repo + uses: actions/checkout@v4 + + - name: Restore rules artifact + uses: actions/download-artifact@v4 + with: + name: rules + + - name: Setup Node.JS + uses: actions/setup-node@v4 + with: + node-version-file: 'javascript/package.json' + + - name: Install JS dependencies + run: yarn install + working-directory: javascript + + - name: Run JS tests + working-directory: javascript + run: yarn test + + - name: Ensure we can build development JS + run: yarn build-dev + working-directory: javascript + + - name: Ensure we can build production JS + run: yarn build-prod + working-directory: javascript + + - name: Ensure we can build JS package + run: yarn pack + working-directory: javascript diff --git a/.gitignore b/.gitignore index 288bff6b..15586154 100644 --- a/.gitignore +++ b/.gitignore @@ -252,3 
+252,16 @@ $RECYCLE.BIN/ # ignore all vscode, this is not standard configuration in this place .vscode src/libzim-stubs +javascript/node_modules + +# rule files are generated by rules/generate_rules.py +src/zimscraperlib/rewriting/rules.py +tests/rewriting/test_fuzzy_rules.py +javascript/src/fuzzyRules.js +javascript/test/fuzzyRules.js + +# wombatSetup.js is generated with rollup +src/zimscraperlib/rewriting/statics/wombatSetup.js + +# wombat.js is installed from online source +src/zimscraperlib/rewriting/statics/wombat.js diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e527d87f..8302a4ae 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,27 +2,34 @@ # See https://pre-commit.com/hooks.html for more hooks exclude: ^tests/files # these are raw test files, no need to mess with them repos: -- repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.5.0 - hooks: - - id: trailing-whitespace - - id: end-of-file-fixer -- repo: https://github.com/psf/black - rev: "24.4.2" - hooks: - - id: black -- repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.4.9 - hooks: - - id: ruff -- repo: https://github.com/RobertCraigie/pyright-python - rev: v1.1.368 - hooks: - - id: pyright - name: pyright (system) - description: 'pyright static type checker' - entry: pyright - language: system - 'types_or': [python, pyi] - require_serial: true - minimum_pre_commit_version: '2.9.2' + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - repo: https://github.com/psf/black + rev: '24.10.0' + hooks: + - id: black + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.7.0 + hooks: + - id: ruff + - repo: https://github.com/RobertCraigie/pyright-python + rev: v1.1.385 + hooks: + - id: pyright + name: pyright (system) + description: 'pyright static type checker' + entry: pyright + language: system + 'types_or': [python, pyi] + require_serial: true + 
minimum_pre_commit_version: '2.9.2' + - repo: https://github.com/pre-commit/mirrors-prettier + rev: v3.1.0 + hooks: + - id: prettier + args: + - --config + - javascript/.prettierrc.json diff --git a/README.md b/README.md index 835832d2..dcf4f1be 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,4 @@ -zimscraperlib -============= +# zimscraperlib [![Build Status](https://github.com/openzim/python-scraperlib/workflows/CI/badge.svg?query=branch%3Amain)](https://github.com/openzim/python-scraperlib/actions?query=branch%3Amain) [![CodeFactor](https://www.codefactor.io/repository/github/openzim/python-scraperlib/badge)](https://www.codefactor.io/repository/github/openzim/python-scraperlib) @@ -12,24 +11,26 @@ Collection of python code to re-use across python-based scrapers # Usage -* This library is meant to be installed via PyPI ([`zimscraperlib`](https://pypi.org/project/zimscraperlib/)). -* Make sure to reference it using a version code as the API is subject to frequent changes. -* API should remain the same only within the same *minor* version. +- This library is meant to be installed via PyPI ([`zimscraperlib`](https://pypi.org/project/zimscraperlib/)). +- Make sure to reference it using a version code as the API is subject to frequent changes. +- API should remain the same only within the same _minor_ version. Example usage: -``` pip +```pip zimscraperlib>=1.1,<1.2 ``` +See [functional architecture](docs/functional_architecture.md), [software architecture](docs/software_architecture.md) and [technical architecture](docs/technical_architecture.md) for more details on scraperlib (not all aspects are covered yet, this is a WIP). 
+ # Dependencies -* libmagic -* wget -* libzim (auto-installed, not available on Windows) -* Pillow -* FFmpeg -* gifsicle (>=1.92) +- libmagic +- wget +- libzim (auto-installed, not available on Windows) +- Pillow +- FFmpeg +- gifsicle (>=1.92) ## macOS @@ -47,6 +48,7 @@ sudo apt install libmagic1 wget ffmpeg \ ``` ## Alpine + ``` apk add ffmpeg gifsicle libmagic wget libjpeg ``` @@ -69,15 +71,15 @@ invoke coverage Non-exhaustive list of scrapers using it (check status when updating API): -* [openzim/freecodecamp](https://github.com/openzim/freecodecamp) -* [openzim/gutenberg](https://github.com/openzim/gutenberg) -* [openzim/ifixit](https://github.com/openzim/ifixit) -* [openzim/kolibri](https://github.com/openzim/kolibri) -* [openzim/nautilus](https://github.com/openzim/nautilus) -* [openzim/nautilus](https://github.com/openzim/nautilus) -* [openzim/openedx](https://github.com/openzim/openedx) -* [openzim/sotoki](https://github.com/openzim/sotoki) -* [openzim/ted](https://github.com/openzim/ted) -* [openzim/warc2zim](https://github.com/openzim/warc2zim) -* [openzim/wikihow](https://github.com/openzim/wikihow) -* [openzim/youtube](https://github.com/openzim/youtube) +- [openzim/freecodecamp](https://github.com/openzim/freecodecamp) +- [openzim/gutenberg](https://github.com/openzim/gutenberg) +- [openzim/ifixit](https://github.com/openzim/ifixit) +- [openzim/kolibri](https://github.com/openzim/kolibri) +- [openzim/nautilus](https://github.com/openzim/nautilus) +- [openzim/nautilus](https://github.com/openzim/nautilus) +- [openzim/openedx](https://github.com/openzim/openedx) +- [openzim/sotoki](https://github.com/openzim/sotoki) +- [openzim/ted](https://github.com/openzim/ted) +- [openzim/warc2zim](https://github.com/openzim/warc2zim) +- [openzim/wikihow](https://github.com/openzim/wikihow) +- [openzim/youtube](https://github.com/openzim/youtube) diff --git a/docs/functional_architecture.md b/docs/functional_architecture.md new file mode 100644 index 
00000000..c6f99564 --- /dev/null +++ b/docs/functional_architecture.md @@ -0,0 +1,92 @@ +# Functional Architecture + +## Enrich libzim functions + +zimscraperlib has primitives to enrich libzim functions with some operations which are known to be shared across scrapers. See `zim` module. + +## Handle videos + +zimscraperlib has primitives to manipulate videos with some operations which are known to be shared across scrapers. See `video` module. + +## Handle pictures + +zimscraperlib has primitives to manipulate pictures with some operations which are known to be shared across scrapers. See `image` module. + +## Store and rewrite mostly unmodified HTML, CSS and JS from online website + +zimscraperlib also contains primitives to rewrite HTML, CSS and JS fetched online, to properly operate within a ZIM without heavy modifications. While originally developed for warc2zim, some of these primitives are now also used for mindtouch scraper and others might follow, so they are shared in zimscraperlib. See `rewriting` module. + +### ZIM storage + +While storing web resources in a ZIM is mostly straightforward (we just transfer the raw bytes, after some modification for URL rewriting if needed), the decision of the path where the resource will be stored is very important. + +This is purely conventional, even if ZIM specification has to be respected for proper operation in readers. + +This function is responsible for computing the ZIM path where a given web resource is going to be stored. + +While the URL is the only driver of this computation for now, zimscraperlib might have to consider other contextual data in the future. E.g. the resource to serve might be dynamic, depending not only on URL query parameters but also header(s) value(s). + +### Fuzzy rules + +Unfortunately, it is not always possible / desirable to store the resource with a simple transformation. 
+ +A typical situation is that some query parameters are dynamically computed by some Javascript code to include user tracking identifier, current datetime information, ... + +When running again the same javascript code inside the ZIM, the URL will hence be slightly different because context has changed, but the same content needs to be retrieved. + +zimscraperlib hence relies on fuzzy rules to transform/simplify some URLs when computing the ZIM path. + +### URL Rewriting + +zimscraperlib transforms (rewrites) URLs found in documents (HTML, CSS, JS, ...) so that they are usable inside the ZIM. + +#### General case + +One simple example is that we might have the following code in an HTML document to load an image with an absolute URL: + +``` +<img src="https://en.wikipedia.org/wiki/File:Kiwix_logo_v3.svg"></img> +``` + +The URL `https://en.wikipedia.org/wiki/File:Kiwix_logo_v3.svg` has to be transformed to a URL that is usable inside the ZIM. + +For proper reader operation, openZIM prohibits using absolute URLs, so this has to be a relative URL. This relative URL is hence dependent on the location of the resource currently being rewritten. + +The table below gives some examples of what the rewritten URL is going to be, depending on the URL of the rewritten document. 
+ +| HTML document URL | image URL rewritten for usage inside the ZIM | +| ------------------------------------- | ---------------------------------------------------- | +| `https://en.wikipedia.org/wiki/Kiwix` | `./File:Kiwix_logo_v3.svg` | +| `https://en.wikipedia.org/wiki` | `./wiki/File:Kiwix_logo_v3.svg` | +| `https://en.wikipedia.org/waka/Kiwix` | `../wiki/File:Kiwix_logo_v3.svg` | +| `https://fr.wikipedia.org/wiki/Kiwix` | `../../en.wikipedia.org/wiki/File:Kiwix_logo_v3.svg` | + +As can be seen on the last line (but this is true for all URLs), this rewriting has to take into account the convention saying at which ZIM path a given web resource will be stored. 
+ +#### Dynamic case + +The explanation above more or less assumed that the transformations can be done statically, i.e. zimscraperlib can open every known document, find existing URLs and replace them with their counterpart inside the ZIM. + +While this is possible for HTML and CSS documents typically, it is not possible when the URL is dynamically computed. This is typically the case for JS documents, where in the general case the URL is not statically stored inside the JS code but computed on-the-fly by aggregating various strings and values. + +Rewriting these computations is not deemed feasible due to the huge variety of situations which might be encountered. + +A specific function is hence needed to rewrite URL **live in client browser**, intercept any function triggering a web request, transform the URL according to conventions (where we expect the resource to be located in the general case) and fuzzy rules. + +_Spoiler: this is where we will rely on wombat.js from webrecorder team, since this dynamic interception is quite complex and already done quite neatly by them_ + +#### Fuzzy rules + +The same fuzzy rules that have been used to compute the ZIM path from a resource URL have to be applied again when rewriting URLs. + +While this is expected to serve mostly for the dynamic case, we still apply them on both sides (statically and dynamically) for coherency. + +### Documents rewritten statically + +For now zimscraperlib rewrites HTML, CSS and JS documents. For CSS and JS, this mainly consists in replacing URLs. For HTML, we also have more specific rewriting necessary (e.g. to handle base href or redirects with meta). + +No domain specific (DS) rules are applied like it is done in wabac.JS because these rules are already applied in Browsertrix Crawler. 
For the same reason, JSON is not rewritten anymore (URLs do not need to be rewritten in JSON because these URLs will be used by JS, intercepted by wombat and dynamically rewritten). 
+ +JSONP callbacks are supposed to be rewritten but this has not been heavily tested. + +Other types of documents are supposed to be either not feasible / not worth it (e.g. URLs inside PDF documents), meaningless (e.g. images, fonts) or planned for later due to limited usage in the wild (e.g. XML). diff --git a/docs/software_architecture.md b/docs/software_architecture.md new file mode 100644 index 00000000..03dd49a2 --- /dev/null +++ b/docs/software_architecture.md @@ -0,0 +1,27 @@ +# Software architecture + +Currently only HTML, CSS and JS rewriting is described in this document. + +## HTML rewriting + +HTML rewriting is purely static (i.e. before resources are written to the ZIM). HTML code is parsed with the [HTML parser from Python standard library](https://docs.python.org/3/library/html.parser.html). + +A small header script is inserted in HTML code to initialize wombat.js which will wrap all JS APIs to dynamically rewrite URLs coming from JS. + +This header script is generated using [Jinja2](https://pypi.org/project/Jinja2/) template since it needs to populate some JS context variables needed by wombat.js operations (original scheme, original url, ...). + +## CSS rewriting + +CSS rewriting is purely static (i.e. before resources are written to the ZIM). CSS code is parsed with the [tinycss2 Python library](https://pypi.org/project/tinycss2/). + +## JS rewriting + +### Static + +Static JS rewriting is simply a matter of pure textual manipulation with regular expressions. No parsing is done at all. + +### Dynamic + +Dynamic JS rewriting is done with [wombat JS library](https://github.com/webrecorder/wombat). The same fuzzy rules that are used for static rewriting are injected into wombat configuration. Code to rewrite URLs is an adapted version of the code used to compute ZIM paths. + +For wombat setup, including the URL rewriting part, we need to pass wombat configuration info. This code is developed in the `javascript` folder. 
For URL parsing, it relies on the [uri-js library](https://www.npmjs.com/package/uri-js). This javascript code is bundled into a single `wombatSetup.js` file with [rollup bundler](https://rollupjs.org), the same bundler used by webrecorder team to bundle wombat. diff --git a/docs/technical_architecture.md b/docs/technical_architecture.md new file mode 100644 index 00000000..d3b9a394 --- /dev/null +++ b/docs/technical_architecture.md @@ -0,0 +1,56 @@ +# Technical architecture + +Currently only HTML, CSS and JS rewriting is described in this document. + +## Fuzzy rules + +Fuzzy rules are stored in `rules/rules.yaml`. This configuration file is then used by `rules/generate_rules.py` to generate Python and JS code. + +Should you update these fuzzy rules, you hence have to: + +- regenerate Python and JS files by running `python rules/generate_rules.py` +- bundle again Javascript `wombatSetup.js` (see below). + +## Wombat configuration + +Wombat configuration contains some static configuration and the dynamic URL rewriting, including fuzzy rules. + +It is bundled by rollup with `cd javascript && yarn build-prod` and the result is pushed to proper scraper location for inclusion at build time. + +Tests are available and run with `cd javascript && yarn test`. + +## Transformation of URL into ZIM path + +Transforming a URL into a ZIM path has to respect the ZIM specification: path must not be url-encoded (i.e. it must be decoded) and it must be stored as UTF-8. + +WARC record stores the items URL inside a header named "WARC-Target-URI". The value inside this header is encoded, or more exactly it is "exactly what the browser sent at the HTTP level" (see https://github.com/webrecorder/browsertrix-crawler/issues/492 for more details). + +It has been decided (by convention) that we will drop the scheme, the port, the username and password from the URL. Headers are also not considered in this computation. 
+ +Computation of the ZIM path is hence mostly straightforward: + +- decode the hostname which is puny-encoded +- decode the path and query parameter which might be url-encoded + +## URL rewriting + +In addition to the computation of the relative path from the current document URL to the URL to rewrite, URL rewriting also consists in computing the proper ZIM path (with same operation as above) and properly encoding it so that the resulting URL respects [RFC 3986](https://datatracker.ietf.org/doc/html/rfc3986). Some important stuff has to be noted in this encoding. + +- since the original hostname is now part of the path, it will now be url-encoded +- since the `?` and following query parameters are also part of the path (we do not want readers to drop them like kiwix-serve would do), they are also url-encoded + +Below is an example case of the rewrite operation on an image URL found in an HTML document. + +- Document original URL: `https://kiwix.org/a/article/document.html` +- Document ZIM path: `kiwix.org/a/article/document.html` +- Image original URL: `//xn--exmple-cva.com/a/resource/image.png?foo=bar` +- Image rewritten URL: `../../../ex%C3%A9mple.com/a/resource/image.png%3Ffoo%3Dbar` +- Image ZIM Path: `exémple.com/a/resource/image.png?foo=bar` + +## JS Rewriting + +JS Rewriting is a bit special because rules to apply are different wether we are using "classic" Javascript or "module" Javascript. + +Detection of Javascript modules starts at the HTML level where we have a ` + + +{% endautoescape %} + + diff --git a/src/zimscraperlib/rewriting/url_rewriting.py b/src/zimscraperlib/rewriting/url_rewriting.py new file mode 100644 index 00000000..fbf0147d --- /dev/null +++ b/src/zimscraperlib/rewriting/url_rewriting.py @@ -0,0 +1,424 @@ +""" URL rewriting tools + +This module is about url and entry path rewriting. 
+ +The global scheme is the following: + +Entries are stored in the ZIM file using their fully decoded path: +- The full path is the full url without the scheme, username, password, port, fragment + (ie : "/(? None: + HttpUrl.check_validity(value) + self._value = value + + def __eq__(self, __value: object) -> bool: + return isinstance(__value, HttpUrl) and __value.value == self.value + + def __hash__(self) -> int: + return self.value.__hash__() + + def __str__(self) -> str: + return f"HttpUrl({self.value})" + + def __repr__(self) -> str: + return f"{self!s} - {super().__repr__()}" # pragma: no cover + + @property + def value(self) -> str: + return self._value + + @classmethod + def check_validity(cls, value: str) -> None: + parts = urlsplit(value) + + if parts.scheme.lower() not in ["http", "https"]: + raise ValueError( + f"Incorrect HttpUrl scheme in value: {value} {parts.scheme}" + ) + + if not parts.hostname: + raise ValueError(f"Unsupported empty hostname in value: {value}") + + if parts.hostname.lower() not in value: + raise ValueError(f"Unsupported upper-case chars in hostname : {value}") + + +class ZimPath: + """A utility class representing a ZIM path, useful to pass this data around + + Includes a basic validation, ensuring that path has no scheme, hostname,... 
+ """ + + def __init__(self, value: str) -> None: + ZimPath.check_validity(value) + self._value = value + + def __eq__(self, __value: object) -> bool: + return isinstance(__value, ZimPath) and __value.value == self.value + + def __hash__(self) -> int: + return self.value.__hash__() + + def __str__(self) -> str: + return f"ZimPath({self.value})" + + def __repr__(self) -> str: + return f"{self!s} - {super().__repr__()}" # pragma: no cover + + @property + def value(self) -> str: + return self._value + + @classmethod + def check_validity(cls, value: str) -> None: + parts = urlsplit(value) + + if parts.scheme: + raise ValueError(f"Unexpected scheme in value: {value} {parts.scheme}") + + if parts.hostname: + raise ValueError(f"Unexpected hostname in value: {value} {parts.hostname}") + + if parts.username: + raise ValueError(f"Unexpected username in value: {value} {parts.username}") + + if parts.password: + raise ValueError(f"Unexpected password in value: {value} {parts.password}") + + +class ArticleUrlRewriter: + """ + Rewrite urls in article. + + This is typically used to rewrite urls found in an HTML document, but can be used + beyond that usage. 
+ """ + + additional_rules: ClassVar[list[AdditionalRule]] = COMPILED_FUZZY_RULES + + def __init__( + self, + *, + article_url: HttpUrl, + article_path: ZimPath | None = None, + existing_zim_paths: set[ZimPath] | None = None, + missing_zim_paths: set[ZimPath] | None = None, + ): + """ + Initialise the rewriter + + Args: + article_url: URL where the original document was located, used to resolve + relative URLS which will be passed + existing_zim_paths: list of ZIM paths which are known to exist, useful if one + wants to rewrite the URL to a local one only if item exists in the ZIM + missing_zim_paths: list of ZIM paths which are known to already be missing + from the existing_zim_paths ; usefull only in complement with this variable ; + new missing entries will be added as URLs are normalized in this function + + Results: + items_to_download: populated with the list of rewritten URLs, so that one + might use it to download items after rewriting the document + """ + self.article_path = article_path or ArticleUrlRewriter.normalize(article_url) + self.article_url = article_url + self.existing_zim_paths = existing_zim_paths + self.missing_zim_paths = missing_zim_paths + self.items_to_download: dict[ZimPath, HttpUrl] = {} + + def get_item_path(self, item_url: str, base_href: str | None) -> ZimPath: + """Utility to transform an item URL into a ZimPath""" + + item_absolute_url = urljoin( + urljoin(self.article_url.value, base_href), item_url + ) + return ArticleUrlRewriter.normalize(HttpUrl(item_absolute_url)) + + def __call__( + self, + item_url: str, + base_href: str | None, + *, + rewrite_all_url: bool = True, + ) -> str: + """Rewrite a url contained in a article. 
+ + The url is "fully" rewrited to point to a normalized entry path + """ + + try: + item_url = item_url.strip() + + # Make case of standalone fragments more straightforward + if item_url.startswith("#"): + return item_url + + item_scheme = urlsplit(item_url).scheme + if item_scheme and item_scheme not in ("http", "https"): + return item_url + + item_absolute_url = urljoin( + urljoin(self.article_url.value, base_href), item_url + ) + + item_fragment = urlsplit(item_absolute_url).fragment + + item_path = ArticleUrlRewriter.normalize(HttpUrl(item_absolute_url)) + + if rewrite_all_url or ( + self.existing_zim_paths and item_path in self.existing_zim_paths + ): + if item_path not in self.items_to_download: + self.items_to_download[item_path] = HttpUrl(item_absolute_url) + return self.get_document_uri(item_path, item_fragment) + else: + if ( + self.missing_zim_paths is not None + and item_path not in self.missing_zim_paths + ): + logger.debug(f"WARNING {item_path} ({item_url}) not in archive.") + # maintain a collection of missing Zim Path to not fill the logs + # with duplicate messages + self.missing_zim_paths.add(item_path) + # The url doesn't point to a known entry + return item_absolute_url + + except Exception as exc: # pragma: no cover + item_scheme = ( + item_scheme # pyright: ignore[reportPossiblyUnboundVariable] + if "item_scheme" in locals() + else "" + ) + item_absolute_url = ( + item_absolute_url # pyright: ignore[reportPossiblyUnboundVariable] + if "item_absolute_url" in locals() + else "" + ) + item_fragment = ( + item_fragment # pyright: ignore[reportPossiblyUnboundVariable] + if "item_fragment" in locals() + else "" + ) + item_path = ( + item_path # pyright: ignore[reportPossiblyUnboundVariable] + if "item_path" in locals() + else "" + ) + logger.debug( + f"Invalid URL value found in {self.article_url.value}, kept as-is. 
" + f"(item_url: {item_url}, " + f"item_scheme: {item_scheme}, " + f"item_absolute_url: {item_absolute_url}, " + f"item_fragment: {item_fragment}, " + f"item_path: {item_path}, " + f"rewrite_all_url: {rewrite_all_url}", + exc_info=exc, + ) + return item_url + + def get_document_uri(self, item_path: ZimPath, item_fragment: str) -> str: + """Given an ZIM item path and its fragment, get the URI to use in document + + This function transforms the path of a ZIM item we want to adress from current + document (HTML / JS / ...) and returns the corresponding URI to use. + + It computes the relative path based on current document location and escape + everything which needs to be to transform the ZIM path into a valid RFC 3986 URI + + It also append a potential trailing item fragment at the end of the resulting + URI. + + """ + item_parts = urlsplit(item_path.value) + + # item_path is both path + querystring, both will be url-encoded in the document + # so that readers consider them as a whole and properly pass them to libzim + item_url = item_parts.path + if item_parts.query: + item_url += "?" + item_parts.query + relative_path = str( + PurePosixPath(item_url).relative_to( + ( + PurePosixPath(self.article_path.value) + if self.article_path.value.endswith("/") + else PurePosixPath(self.article_path.value).parent + ), + walk_up=True, + ) + ) + # relative_to removes a potential last '/' in the path, we add it back + if item_path.value.endswith("/"): + relative_path += "/" + + return ( + f"{quote(relative_path, safe='/')}" + f"{'#' + item_fragment if item_fragment else ''}" + ) + + @classmethod + def apply_additional_rules(cls, uri: HttpUrl | str) -> str: + """Apply additional rules on a URL or relative path + + First matching additional rule matching the input value is applied and its + result is returned. + + If no additional rule is matching, the input is returned as-is. 
+ """ + value = uri.value if isinstance(uri, HttpUrl) else uri + for rule in cls.additional_rules: + if match := rule.match.match(value): + return match.expand(rule.replace) + return value + + @classmethod + def normalize(cls, url: HttpUrl) -> ZimPath: + """Transform an HTTP URL into a ZIM path to use as an entry's key. + + According to RFC 3986, a URL allows only a very limited set of characters, so we + assume by default that the url is encoded to match this specification. + + The transformation rewrites the hostname, the path and the querystring. + + The transformation drops the URL scheme, username, password, port and fragment: + - we suppose there is no conflict of URL scheme or port: there are no two + resources with same hostname, path and querystring but different URL scheme or + port leading to different content + - we consider username/password are purely authentication mechanisms which + have no impact on the content to serve + - we know that the fragment is never passed to the server, it stays in the + User-Agent, so if we encounter a fragment while normalizing a URL found in a + document, it won't make its way to the ZIM anyway and will stay client-side + + The transformation consists mainly in decoding the three components so that ZIM + path is not encoded at all, as required by the ZIM specification. + + Decoding is done differently for the hostname (decoded with puny encoding) and + the path and querystring (both decoded with url decoding). + + The final transformation is the application of fuzzy rules (sourced from wabac) + to transform some URLs into replay URLs and drop some useless stuff. + + Returned value is a ZIM path, without any puny/url encoding applied, ready to be + passed to python-libzim for UTF-8 encoding.
+ """ + + if not isinstance(url, HttpUrl): + raise ValueError("Bad argument type passed, HttpUrl expected") + + url_parts = urlsplit(url.value) + + if not url_parts.hostname: + # cannot happen because of the HttpUrl checks, but important to please the + # type checker + raise Exception("Hostname is missing") # pragma: no cover + + # decode the hostname if it is puny-encoded + hostname = ( + idna.decode(url_parts.hostname) + if url_parts.hostname.startswith("xn--") + else url_parts.hostname + ) + + path = url_parts.path + + if path: + # unquote the path so that it is stored unencoded in the ZIM as required by + # ZIM specification + path = unquote(path) + else: + # if path is empty, we need a "/" to remove ambiguities, e.g. + # https://example.com and https://example.com/ must all lead to the same ZIM + # entry to match RFC 3986 section 6.2.3: + # https://www.rfc-editor.org/rfc/rfc3986#section-6.2.3 + path = "/" + + query = url_parts.query + + # if query is missing, we do not add it at all, not even a trailing ? without + # anything after it + if url_parts.query: + # `+` in query parameter must be decoded as space first to remove + # ambiguities between a space (encoded as `+` in url query parameter) and a + # real plus sign (encoded as %2B but soon decoded in ZIM path) + query = query.replace("+", " ") + # unquote the query so that it is stored unencoded in the ZIM as required by + # ZIM specification + query = "?"
+ unquote(query) + else: + query = "" + + fuzzified_url = ArticleUrlRewriter.apply_additional_rules( + f"{hostname}{ArticleUrlRewriter._remove_subsequent_slashes(path)}{ArticleUrlRewriter._remove_subsequent_slashes(query)}" + ) + + return ZimPath(fuzzified_url) + + @classmethod + def _remove_subsequent_slashes(cls, value: str) -> str: + """Remove all successive occurrences of a slash `/` in a given string + + E.g `val//ue` or `val///ue` or `val////ue` (and so on) are transformed into + `val/ue` + """ + return re.sub(r"//+", "/", value) diff --git a/src/zimscraperlib/zim/_libkiwix.py b/src/zimscraperlib/zim/_libkiwix.py index c20357c8..02cae889 100644 --- a/src/zimscraperlib/zim/_libkiwix.py +++ b/src/zimscraperlib/zim/_libkiwix.py @@ -16,10 +16,9 @@ import io from collections import namedtuple -from typing import Dict MimetypeAndCounter = namedtuple("MimetypeAndCounter", ["mimetype", "value"]) -CounterMap = Dict[ +CounterMap = dict[ type(MimetypeAndCounter.mimetype), type(MimetypeAndCounter.value) # pyright: ignore ] diff --git a/src/zimscraperlib/zim/creator.py b/src/zimscraperlib/zim/creator.py index 0dab5029..a4558b8d 100644 --- a/src/zimscraperlib/zim/creator.py +++ b/src/zimscraperlib/zim/creator.py @@ -264,7 +264,7 @@ def convert_and_check_metadata( Also checks that final type is appropriate for libzim (str or bytes) """ - if name == "Date" and isinstance(value, (datetime.date, datetime.datetime)): + if name == "Date" and isinstance(value, datetime.date | datetime.datetime): value = value.strftime("%Y-%m-%d") if ( name == "Tags" diff --git a/src/zimscraperlib/zim/items.py b/src/zimscraperlib/zim/items.py index a7625b07..e1f9e9b2 100644 --- a/src/zimscraperlib/zim/items.py +++ b/src/zimscraperlib/zim/items.py @@ -129,7 +129,7 @@ def get_contentprovider(self) -> libzim.writer.ContentProvider: # content was set manually content = getattr(self, "content", None) if content is not None: - if not isinstance(content, (str, bytes)): + if not isinstance(content, str |
bytes): raise AttributeError(f"Unexpected type for content: {type(content)}") return StringProvider(content=content, ref=self) @@ -155,7 +155,7 @@ def _get_auto_index(self): # content was set manually content = getattr(self, "content", None) if content is not None: - if not isinstance(content, (str, bytes)): + if not isinstance(content, str | bytes): raise RuntimeError( f"Unexpected type for content: {type(content)}" ) # pragma: no cover diff --git a/src/zimscraperlib/zim/metadata.py b/src/zimscraperlib/zim/metadata.py index 3db12c7b..411e42f6 100644 --- a/src/zimscraperlib/zim/metadata.py +++ b/src/zimscraperlib/zim/metadata.py @@ -60,7 +60,7 @@ def validate_title(name: str, value: str): def validate_date(name: str, value: datetime.datetime | datetime.date | str): """ensures Date metadata can be casted to an ISO 8601 string""" if name == "Date": - if not isinstance(value, (datetime.datetime, datetime.date, str)): + if not isinstance(value, datetime.datetime | datetime.date | str): raise ValueError(f"Invalid type for {name}: {type(value)}") elif isinstance(value, str): match = re.match(r"(?P\d{4})-(?P\d{2})-(?P\d{2})", value) diff --git a/src/zimscraperlib/zim/providers.py b/src/zimscraperlib/zim/providers.py index 2c384ddb..a4748cbb 100644 --- a/src/zimscraperlib/zim/providers.py +++ b/src/zimscraperlib/zim/providers.py @@ -13,7 +13,7 @@ import io import pathlib -from typing import Generator +from collections.abc import Generator import libzim.writer # pyright: ignore import requests diff --git a/tests/rewriting/__init__.py b/tests/rewriting/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/rewriting/conftest.py b/tests/rewriting/conftest.py new file mode 100644 index 00000000..390dd471 --- /dev/null +++ b/tests/rewriting/conftest.py @@ -0,0 +1,100 @@ +from collections.abc import Callable, Generator + +import pytest + +from zimscraperlib.rewriting.css import CssRewriter +from zimscraperlib.rewriting.js import JsRewriter +from 
zimscraperlib.rewriting.url_rewriting import ( + ArticleUrlRewriter, + HttpUrl, + ZimPath, +) + + +@pytest.fixture(scope="module") +def no_js_notify(): + """Fixture to not care about notification of detection of a JS file""" + + def no_js_notify_handler(_: str): + pass + + yield no_js_notify_handler + + +class SimpleUrlRewriter(ArticleUrlRewriter): + """Basic URL rewriter mocking most calls""" + + def __init__(self, article_url: HttpUrl, suffix: str = ""): + self.article_url = article_url + self.suffix = suffix + + def __call__( + self, + item_url: str, + base_href: str | None, # noqa: ARG002 + *, + rewrite_all_url: bool = True, # noqa: ARG002 + ) -> str: + return item_url + self.suffix + + def get_item_path( + self, item_url: str, base_href: str | None # noqa: ARG002 + ) -> ZimPath: + return ZimPath("") + + def get_document_uri( + self, item_path: ZimPath, item_fragment: str # noqa: ARG002 + ) -> str: + return "" + + +@pytest.fixture(scope="module") +def simple_url_rewriter_gen() -> ( + Generator[Callable[[str], ArticleUrlRewriter], None, None] +): + """Fixture to create a basic url rewriter returning URLs as-is""" + + def get_simple_url_rewriter(url: str, suffix: str = "") -> ArticleUrlRewriter: + return SimpleUrlRewriter(HttpUrl(url), suffix=suffix) + + yield get_simple_url_rewriter + + +@pytest.fixture(scope="module") +def js_rewriter_gen() -> Generator[ + Callable[[ArticleUrlRewriter, str | None, Callable[[ZimPath], None]], JsRewriter], + None, + None, +]: + """Fixture to create a basic url rewriter returning URLs as-is""" + + def get_js_rewriter( + url_rewriter: ArticleUrlRewriter, + base_href: str | None, + notify_js_module: Callable[[ZimPath], None], + ) -> JsRewriter: + return JsRewriter( + url_rewriter=url_rewriter, + base_href=base_href, + notify_js_module=notify_js_module, + ) + + yield get_js_rewriter + + +@pytest.fixture(scope="module") +def css_rewriter_gen() -> ( + Generator[Callable[[ArticleUrlRewriter, str | None], CssRewriter], None, None] +): + 
"""Fixture to create a basic url rewriter returning URLs as-is""" + + def get_css_rewriter( + url_rewriter: ArticleUrlRewriter, + base_href: str | None, + ) -> CssRewriter: + return CssRewriter( + url_rewriter=url_rewriter, + base_href=base_href, + ) + + yield get_css_rewriter diff --git a/tests/rewriting/test_css_rewriting.py b/tests/rewriting/test_css_rewriting.py new file mode 100644 index 00000000..a43ce849 --- /dev/null +++ b/tests/rewriting/test_css_rewriting.py @@ -0,0 +1,218 @@ +from textwrap import dedent + +import pytest + +from zimscraperlib.rewriting.css import CssRewriter +from zimscraperlib.rewriting.url_rewriting import ArticleUrlRewriter, HttpUrl + +from .utils import ContentForTests + + +@pytest.fixture( + params=[ + ContentForTests(input_=b"p { color: red; }"), + ContentForTests(input_=b"p {\n color: red;\n}"), + ContentForTests(input_=b"p { background: blue; }"), + ContentForTests(input_=b"p { background: rgb(15, 0, 52); }"), + ContentForTests( + input_=b"/* See bug issue at http://exemple.com/issue/link */ " + b"p { color: blue; }" + ), + ContentForTests( + input_=b"p { width= } div { background: url(http://exemple.com/img.png)}", + expected=b"p { width= } div { background: url(../exemple.com/img.png)}", + ), + ContentForTests( + input_=b"p { width= } div { background: url('http://exemple.com/img.png')}", + expected=b'p { width= } div { background: url("../exemple.com/img.png")}', + ), + ContentForTests( + input_=b'p { width= } div { background: url("http://exemple.com/img.png")}', + expected=b'p { width= } div { background: url("../exemple.com/img.png")}', + ), + ] +) +def no_rewrite_content(request: pytest.FixtureRequest): + yield request.param + + +def test_no_rewrite(no_rewrite_content: ContentForTests): + assert ( + CssRewriter( + ArticleUrlRewriter( + article_url=HttpUrl(f"http://{no_rewrite_content.article_url}") + ), + base_href=None, + ).rewrite(no_rewrite_content.input_bytes) + == no_rewrite_content.expected_bytes.decode() + ) + + +def 
test_no_rewrite_str(): + test_css = "p {\n color: red;\n}" + assert ( + CssRewriter( + ArticleUrlRewriter(article_url=HttpUrl("http://kiwix.org")), + base_href=None, + ).rewrite(test_css) + == test_css + ) + + +@pytest.fixture( + params=[ + ContentForTests(input_='"border:'), + ContentForTests(input_="border: solid 1px #c0c0c0; width= 100%"), + # Despite being invalid, tinycss parse it as "width" property without value. + ContentForTests(input_="width:", expected="width:;"), + ContentForTests( + input_="border-bottom-width: 1px;border-bottom-color: #c0c0c0;w" + ), + ContentForTests( + input_='background: url("http://exemple.com/foo.png"); width=', + expected='background: url("../exemple.com/foo.png"); width=', + ), + ] +) +def invalid_content_inline_with_fallback(request: pytest.FixtureRequest): + yield request.param + + +def test_invalid_css_inline_with_fallback( + invalid_content_inline_with_fallback: ContentForTests, +): + assert ( + CssRewriter( + ArticleUrlRewriter( + article_url=HttpUrl( + f"http://{invalid_content_inline_with_fallback.article_url}" + ) + ), + base_href=None, + ).rewrite_inline(invalid_content_inline_with_fallback.input_str) + == invalid_content_inline_with_fallback.expected_str + ) + + +@pytest.fixture( + params=[ + ContentForTests(input_='"border:', expected=""), + ContentForTests( + input_="border: solid 1px #c0c0c0; width= 100%", + expected="border: solid 1px #c0c0c0; ", + ), + # Despite being invalid, tinycss parse it as "width" property without value. 
+ ContentForTests(input_="width:", expected="width:;"), + ContentForTests( + input_="border-bottom-width: 1px;border-bottom-color: #c0c0c0;w", + expected="border-bottom-width: 1px;border-bottom-color: #c0c0c0;", + ), + ContentForTests( + input_='background: url("http://exemple.com/foo.png"); width=', + expected='background: url("../exemple.com/foo.png"); ', + ), + ] +) +def invalid_content_inline_no_fallback(request: pytest.FixtureRequest): + yield request.param + + +def test_invalid_css_inline_no_fallback( + invalid_content_inline_no_fallback: ContentForTests, +): + assert ( + CssRewriter( + ArticleUrlRewriter( + article_url=HttpUrl( + f"http://{invalid_content_inline_no_fallback.article_url}" + ) + ), + base_href=None, + remove_errors=True, + ).rewrite_inline(invalid_content_inline_no_fallback.input_str) + == invalid_content_inline_no_fallback.expected_str + ) + + +@pytest.fixture( + params=[ + # Tinycss parse `"border:}` as a string with an unexpected eof in string. + # At serialization, tiny try to recover and close the opened rule + ContentForTests(input_=b'p {"border:}', expected=b'p {"border:}}'), + ContentForTests(input_=b'"p {border:}'), + ContentForTests(input_=b"p { border: solid 1px #c0c0c0; width= 100% }"), + ContentForTests(input_=b"p { width: }"), + ContentForTests( + input_=b"p { border-bottom-width: 1px;border-bottom-color: #c0c0c0;w }" + ), + ContentForTests( + input_=b'p { background: url("http://exemple.com/foo.png"); width= }', + expected=b'p { background: url("../exemple.com/foo.png"); width= }', + ), + ] +) +def invalid_content(request: pytest.FixtureRequest): + yield request.param + + +def test_invalid_cssl(invalid_content: ContentForTests): + assert ( + CssRewriter( + ArticleUrlRewriter( + article_url=HttpUrl(f"http://{invalid_content.article_url}") + ), + base_href=None, + ).rewrite(invalid_content.input_bytes) + == invalid_content.expected_bytes.decode() + ) + + +def test_rewrite(): + content = b""" +/* A comment with a link : 
http://foo.com */ +@import url(//fonts.googleapis.com/icon?family=Material+Icons); + +p, input { + color: rbg(1, 2, 3); + background: url('http://kiwix.org/super/img'); + background-image:url('http://exemple.com/no_space_before_url'); +} + +@font-face { + src: url(https://f.gst.com/s/qa/v31/6xKtdSZaE8KbpRA_hJFQNcOM.woff2) format('woff2'); +} + +@media only screen and (max-width: 40em) { + p, input { + background-image:url(); + } +}""" + + expected = """ + /* A comment with a link : http://foo.com */ + @import url(../fonts.googleapis.com/icon%3Ffamily%3DMaterial%20Icons); + + p, input { + color: rbg(1, 2, 3); + background: url("super/img"); + background-image:url("../exemple.com/no_space_before_url"); + } + + @font-face { + src: url(../f.gst.com/s/qa/v31/6xKtdSZaE8KbpRA_hJFQNcOM.woff2) format("woff2"); + } + + @media only screen and (max-width: 40em) { + p, input { + background-image:url(); + } + }""" + expected = dedent(expected) + + assert ( + CssRewriter( + ArticleUrlRewriter(article_url=HttpUrl("http://kiwix.org/article")), + base_href=None, + ).rewrite(content) + == expected + ) diff --git a/tests/rewriting/test_html_rewriting.py b/tests/rewriting/test_html_rewriting.py new file mode 100644 index 00000000..bd59b497 --- /dev/null +++ b/tests/rewriting/test_html_rewriting.py @@ -0,0 +1,1555 @@ +from collections.abc import Callable +from textwrap import dedent + +import pytest + +from zimscraperlib.rewriting.css import CssRewriter +from zimscraperlib.rewriting.html import ( + AttrNameAndValue, + AttrsList, + HtmlRewriter, + HTMLRewritingRules, + extract_base_href, + format_attr, + get_attr_value_from, + rewrite_meta_http_equiv_redirect, +) +from zimscraperlib.rewriting.js import JsRewriter +from zimscraperlib.rewriting.url_rewriting import ( + ArticleUrlRewriter, + HttpUrl, + ZimPath, +) + +from .utils import ContentForTests + + +@pytest.fixture( + params=[ + ContentForTests(input_="A simple string without url"), + ContentForTests( + input_="" + "

This is a sentence with a http://exemple.com/path link

" + "" + ), + ContentForTests( + input_='A link not to rewrite' + ), + ContentForTests( + input_='

A url (relative) in a ' + "inline style

" + ), + ContentForTests(input_="

"), + ContentForTests( + input_="' + ), + ContentForTests(input_=""), + ContentForTests( + input_="""This is a sample attribute with a quote """ + "in its value and which is not a URL" + ), + ContentForTests(input_=""), + ContentForTests(input_="<script>"), + ContentForTests( + input_="

This is a smiley (🙂) and it html hex value (🙂)

" + ), + ContentForTests( + input_='' + ), + ContentForTests( + input_='" + ), + ContentForTests( + input_='' + ), + ContentForTests(input_="A simple string with doctype"), + ContentForTests(input_="A simple string with comment"), + ContentForTests(input_="A simple string with pi"), + ] +) +def no_rewrite_content(request: pytest.FixtureRequest): + yield request.param + + +def test_no_rewrite( + no_rewrite_content: ContentForTests, no_js_notify: Callable[[ZimPath], None] +): + assert ( + HtmlRewriter( + ArticleUrlRewriter( + article_url=HttpUrl(f"http://{no_rewrite_content.article_url}"), + ), + "", + "", + no_js_notify, + ) + .rewrite(no_rewrite_content.input_str) + .content + == no_rewrite_content.expected_str + ) + + +@pytest.fixture( + params=[ + ContentForTests( + "

A link in a inline style" + "

", + '

' + "A link in a inline style

", + ), + ContentForTests( + "

" + "A link in a inline style

", + '

' + "A link in a inline style

", + ), + ContentForTests( + "
    ", + '
      ', + ), + ] +) +def escaped_content(request: pytest.FixtureRequest): + yield request.param + + +def test_escaped_content( + escaped_content: ContentForTests, no_js_notify: Callable[[ZimPath], None] +): + transformed = ( + HtmlRewriter( + ArticleUrlRewriter( + article_url=HttpUrl(f"http://{escaped_content.article_url}") + ), + "", + "", + no_js_notify, + ) + .rewrite(escaped_content.input_str) + .content + ) + assert transformed == escaped_content.expected_str + + +@pytest.fixture( + params=[ + ContentForTests( + '', + ( + "" + ), + ), + ContentForTests( + '', + ( + """" + ), + ), + ContentForTests( + '', + ( + """" + ), + ), + ContentForTests( + '', + '', + ), + ContentForTests( + '', + '', + ), + ContentForTests( + '', + '', + ), + ] +) +def js_rewrites(request: pytest.FixtureRequest): + yield request.param + + +def test_js_rewrites( + js_rewrites: ContentForTests, no_js_notify: Callable[[ZimPath], None] +): + transformed = ( + HtmlRewriter( + ArticleUrlRewriter( + article_url=HttpUrl(f"http://{js_rewrites.article_url}") + ), + "", + "", + no_js_notify, + ) + .rewrite(js_rewrites.input_str) + .content + ) + assert transformed == js_rewrites.expected_str + + +def long_path_replace_test_content(input_: str, rewriten_url: str, article_url: str): + expected = input_.replace("http://exemple.com/a/long/path", rewriten_url) + return ContentForTests(input_, expected, article_url) + + +lprtc = long_path_replace_test_content + + +@pytest.fixture( + params=[ + # Normalized path is "exemple.com/a/long/path" + lprtc( + 'A link to rewrite', + "a/long/path", + "exemple.com", + ), + lprtc( + 'A link to rewrite', + "../exemple.com/a/long/path", + "kiwix.org", + ), + lprtc( + 'A link to rewrite', + "../exemple.com/a/long/path", + "kiwix.org/", + ), + lprtc( + 'A link to rewrite', + "a/long/path", + "exemple.com/", + ), + lprtc( + 'A link to rewrite', + "a/long/path", + "exemple.com/a", + ), + lprtc( + 'A link to rewrite', + "long/path", + "exemple.com/a/", + ), + lprtc( + 'A 
link to rewrite', + "long/path", + "exemple.com/a/long", + ), + lprtc( + 'A link to rewrite', + "path", + "exemple.com/a/long/", + ), + lprtc( + 'A link to rewrite', + "path", + "exemple.com/a/long/path", + ), + lprtc( + 'A link to rewrite', + ".", + "exemple.com/a/long/path/yes", + ), + lprtc( + 'A link to rewrite', + "../../long/path", + "exemple.com/a/very/long/path", + ), + lprtc( + 'A link to rewrite', + "../../exemple.com/a/long/path", + "kiwix.org/another/path", + ), + ] +) +def rewrite_url(request: pytest.FixtureRequest): + yield request.param + + +def test_rewrite(rewrite_url: ContentForTests, no_js_notify: Callable[[ZimPath], None]): + assert ( + HtmlRewriter( + ArticleUrlRewriter( + article_url=HttpUrl(f"http://{rewrite_url.article_url}"), + existing_zim_paths={ZimPath("exemple.com/a/long/path")}, + ), + "", + "", + no_js_notify, + ) + .rewrite(rewrite_url.input_str) + .content + == rewrite_url.expected_str + ) + + +def test_extract_title(no_js_notify: Callable[[ZimPath], None]): + content = """ + + Page title + + + Wrong page title + + """ + + assert ( + HtmlRewriter( + ArticleUrlRewriter( + article_url=HttpUrl("http://example.com"), + existing_zim_paths={ZimPath("exemple.com/a/long/path")}, + ), + "", + "", + no_js_notify, + ) + .rewrite(content) + .title + == "Page title" + ) + + +def test_rewrite_attributes(no_js_notify: Callable[[ZimPath], None]): + rewriter = HtmlRewriter( + ArticleUrlRewriter( + article_url=HttpUrl("http://kiwix.org/"), + existing_zim_paths={ZimPath("kiwix.org/foo")}, + ), + "", + "", + no_js_notify, + ) + + assert ( + rewriter.rewrite("A link").content + == 'A link' + ) + + assert ( + rewriter.rewrite("").content + == '' + ) + + assert ( + rewriter.rewrite( + "" + ).content + == '' + ) + + +def test_rewrite_css(no_js_notify: Callable[[ZimPath], None]): + output = ( + HtmlRewriter( + ArticleUrlRewriter(article_url=HttpUrl("http://kiwix.org/")), + "", + "", + no_js_notify, + ) + .rewrite( + "", + ) + .content + ) + assert ( + 
output == "' + ) + + +def test_head_insert(no_js_notify: Callable[[ZimPath], None]): + content = """ + + A test content + + + """ + + content = dedent(content) + + url_rewriter = ArticleUrlRewriter(article_url=HttpUrl("http://kiwix.org/")) + assert ( + HtmlRewriter(url_rewriter, "", "", no_js_notify).rewrite(content).content + == content + ) + + assert HtmlRewriter(url_rewriter, "PRE_HEAD_INSERT", "", no_js_notify).rewrite( + content + ).content == content.replace("", "PRE_HEAD_INSERT") + assert HtmlRewriter(url_rewriter, "", "POST_HEAD_INSERT", no_js_notify).rewrite( + content + ).content == content.replace("", "POST_HEAD_INSERT") + assert HtmlRewriter( + url_rewriter, "PRE_HEAD_INSERT", "POST_HEAD_INSERT", no_js_notify + ).rewrite(content).content == content.replace( + "", "PRE_HEAD_INSERT" + ).replace( + "", "POST_HEAD_INSERT" + ) + + +@pytest.mark.parametrize( + "js_src,expected_js_module_path", + [ + ("my-module-script.js", "kiwix.org/my_folder/my-module-script.js"), + ("./my-module-script.js", "kiwix.org/my_folder/my-module-script.js"), + ("../my-module-script.js", "kiwix.org/my-module-script.js"), + ("../../../my-module-script.js", "kiwix.org/my-module-script.js"), + ("/my-module-script.js", "kiwix.org/my-module-script.js"), + ("//myserver.com/my-module-script.js", "myserver.com/my-module-script.js"), + ( + "https://myserver.com/my-module-script.js", + "myserver.com/my-module-script.js", + ), + ], +) +def test_js_module_detected_script(js_src: str, expected_js_module_path: str): + + js_modules = [] + + def custom_notify(zim_path: ZimPath): + js_modules.append(zim_path) + + HtmlRewriter( + url_rewriter=ArticleUrlRewriter( + article_url=HttpUrl("http://kiwix.org/my_folder/my_article.html") + ), + pre_head_insert="", + post_head_insert="", + notify_js_module=custom_notify, + ).rewrite(f'') + + assert len(js_modules) == 1 + assert js_modules[0].value == expected_js_module_path + + +@pytest.mark.parametrize( + "js_src,expected_js_module_path", + [ + 
("my-module-script.js", "kiwix.org/my_folder/my-module-script.js"), + ("./my-module-script.js", "kiwix.org/my_folder/my-module-script.js"), + ("../my-module-script.js", "kiwix.org/my-module-script.js"), + ("../../../my-module-script.js", "kiwix.org/my-module-script.js"), + ("/my-module-script.js", "kiwix.org/my-module-script.js"), + ("//myserver.com/my-module-script.js", "myserver.com/my-module-script.js"), + ( + "https://myserver.com/my-module-script.js", + "myserver.com/my-module-script.js", + ), + ], +) +def test_js_module_detected_module_preload(js_src: str, expected_js_module_path: str): + + js_modules = [] + + def custom_notify(zim_path: ZimPath): + js_modules.append(zim_path) + + HtmlRewriter( + url_rewriter=ArticleUrlRewriter( + article_url=HttpUrl("http://kiwix.org/my_folder/my_article.html") + ), + pre_head_insert="", + post_head_insert="", + notify_js_module=custom_notify, + ).rewrite(f'') + + assert len(js_modules) == 1 + assert js_modules[0].value == expected_js_module_path + + +@pytest.mark.parametrize( + "script_src", + [ + (''), + (''), + (''), + ], +) +def test_no_js_module_detected(script_src: str): + + js_modules = [] + + def custom_notify(zim_path: ZimPath): + js_modules.append(zim_path) + + HtmlRewriter( + url_rewriter=ArticleUrlRewriter( + article_url=HttpUrl("http://kiwix.org/my_folder/my_article.html") + ), + pre_head_insert="", + post_head_insert="", + notify_js_module=custom_notify, + ).rewrite(script_src) + + assert len(js_modules) == 0 + + +def test_js_module_base_href_src(): + + js_modules = [] + + def custom_notify(zim_path: ZimPath): + js_modules.append(zim_path) + + HtmlRewriter( + url_rewriter=ArticleUrlRewriter( + article_url=HttpUrl("http://kiwix.org/my_folder/my_article.html") + ), + pre_head_insert="", + post_head_insert="", + notify_js_module=custom_notify, + ).rewrite( + """ + + """ + ) + + assert len(js_modules) == 1 + assert js_modules[0].value == "kiwix.org/my_other_folder/my-module-script.js" + + +def 
test_js_module_base_href_inline(): + + js_modules = [] + + def custom_notify(zim_path: ZimPath): + js_modules.append(zim_path) + + HtmlRewriter( + url_rewriter=ArticleUrlRewriter( + article_url=HttpUrl("http://kiwix.org/my_folder/my_article.html") + ), + pre_head_insert="", + post_head_insert="", + notify_js_module=custom_notify, + ).rewrite( + """ + + """ + ) + + assert len(js_modules) == 1 + assert js_modules[0].value == "kiwix.org/my_other_folder/my-module-script.js" + + +@pytest.mark.parametrize( + "html_content, expected_base_href", + [ + pytest.param("", None, id="empty_content"), + pytest.param("", None, id="empty_html"), + pytest.param( + "Foo", None, id="no_base" + ), + pytest.param( + '', "../..", id="standard_case" + ), + pytest.param( + '', "../..", id="malformed_head" + ), # malformed HTML is OK + pytest.param( + '', "../..", id="very_malformed_head" + ), # even very malformed HTML is OK + pytest.param( + '', "../..", id="base_at_root" + ), # even very malformed HTML is OK + pytest.param( + '', None, id="base_in_body" + ), # but base in body is ignored + pytest.param( + '', + "../..", + id="base_with_target_before", + ), + pytest.param( + '', + "../..", + id="base_with_target_after", + ), + pytest.param( + '', + "../..", + id="base_with_two_href", + ), + pytest.param( + '', + "../..", + id="two_bases_with_href", + ), + pytest.param( + '', + "../..", + id="href_in_second_base", + ), + pytest.param( + '' + "", + "../..", + id="href_in_second_base_second_href_ignored", + ), + ], +) +def test_extract_base_href(html_content: str, expected_base_href: str): + assert extract_base_href(html_content) == expected_base_href + + +@pytest.fixture( + params=[ + ContentForTests( + input_='' + '', + expected='', + ), + ContentForTests( + '' + '', + '', + "kiwix.org/a/index.html", + ), + ContentForTests( + '' + '', + '' + '', + ), + ContentForTests( + '' + '', + '' + '', + ), + ContentForTests( + '' + '', + '' + '', + ), + ContentForTests( + '' + '', + '' + '', + ), + 
ContentForTests( + '' + "" + '', + '' + '', + ), + ContentForTests( + '' + '', + '', + "kiwix.org/a/index.html", + ), + ContentForTests( + '' + '', + '' + "", + "kiwix.org/a/index.html", + ), + ContentForTests( + ' ' + '', + ' ' + "", + "kiwix.org/a/index.html", + ), + ] +) +def rewrite_base_href_content(request): + yield request.param + + +def test_rewrite_base_href( + rewrite_base_href_content: ContentForTests, no_js_notify: Callable[[ZimPath], None] +): + assert ( + HtmlRewriter( + ArticleUrlRewriter( + article_url=HttpUrl(f"http://{rewrite_base_href_content.article_url}"), + existing_zim_paths={ + ZimPath("kiwix.org/foo.html"), + ZimPath("kiwix.org/foo.js"), + ZimPath("kiwix.org/foo.css"), + ZimPath("kiwix.org/foo.png"), + ZimPath("kiwix.org/favicon.png"), + }, + ), + "", + "", + no_js_notify, + ) + .rewrite(rewrite_base_href_content.input_str) + .content + == rewrite_base_href_content.expected_str + ) + + +@pytest.mark.parametrize( + "input_content,expected_output", + [ + pytest.param( + """""", + """""", + id="double_quoted_attr", + ), + pytest.param( + "", + """""", + id="single_quoted_attr", + ), + pytest.param( + """""", + """""", + id="uppercase_named_reference_in_attr", + ), + pytest.param( + """""", + """""", + id="numeric_reference_in_attr", + ), + pytest.param( + """""", + """""", + id="numeric_reference_in_attr", + ), + pytest.param( + """""", + """""", + id="badly_escaped_src", + ), + ], +) +def test_simple_rewrite( + input_content: str, expected_output: str, no_js_notify: Callable[[ZimPath], None] +): + assert ( + HtmlRewriter( + ArticleUrlRewriter(article_url=HttpUrl("http://example.com")), + "", + "", + no_js_notify, + ) + .rewrite(input_content) + .content + == expected_output + ) + + +@pytest.fixture( + params=[ + ContentForTests( + """""", + ), + ContentForTests( + """""", + ), + ContentForTests( + """""", + ), + ContentForTests( + """""", + ), + ContentForTests( + """""", + ( + """""" + ), # NOTA: quotes and ampersand are escaped since we are 
inside HTML attr + ), + ] +) +def rewrite_onxxx_content(request: pytest.FixtureRequest): + yield request.param + + +def test_rewrite_onxxx_event( + rewrite_onxxx_content: ContentForTests, no_js_notify: Callable[[ZimPath], None] +): + assert ( + HtmlRewriter( + ArticleUrlRewriter( + article_url=HttpUrl(f"http://{rewrite_onxxx_content.article_url}"), + existing_zim_paths={ + ZimPath("kiwix.org/foo.html"), + ZimPath("kiwix.org/foo.js"), + ZimPath("kiwix.org/foo.css"), + ZimPath("kiwix.org/foo.png"), + ZimPath("kiwix.org/favicon.png"), + }, + ), + "", + "", + no_js_notify, + ) + .rewrite(rewrite_onxxx_content.input_str) + .content + == rewrite_onxxx_content.expected_str + ) + + +@pytest.fixture( + params=[ + ContentForTests( + 'whatever', + ), + ContentForTests( + '' + "whatever", + 'whatever', + ), + ContentForTests( + "" + '' + "whatever", + ), + ContentForTests( + "" + '' + "whatever", + "" + '' + "whatever", + ), + ContentForTests( + 'whatever', + ), # do not rewrite other tags mentionning a charset + ContentForTests( + "" + '' + "whatever", + ), # do not rewrite other http-equiv mentionning a charset + ] +) +def rewrite_meta_charset_content(request: pytest.FixtureRequest): + yield request.param + + +def test_rewrite_meta_charset( + rewrite_meta_charset_content: ContentForTests, + no_js_notify: Callable[[ZimPath], None], +): + assert ( + HtmlRewriter( + ArticleUrlRewriter( + article_url=HttpUrl( + f"http://{rewrite_meta_charset_content.article_url}" + ) + ), + "", + "", + no_js_notify, + ) + .rewrite(rewrite_meta_charset_content.input_str) + .content + == rewrite_meta_charset_content.expected_str + ) + + +@pytest.fixture( + params=[ + ContentForTests( + '' + "whatever", + '' + "whatever", + ), + ] +) +def rewrite_meta_http_equiv_redirect_full_content(request: pytest.FixtureRequest): + yield request.param + + +def test_rewrite_meta_http_equiv_redirect_full( + rewrite_meta_http_equiv_redirect_full_content: ContentForTests, + no_js_notify: Callable[[ZimPath], None], 
+): + assert ( + HtmlRewriter( + ArticleUrlRewriter( + article_url=HttpUrl( + f"http://{rewrite_meta_http_equiv_redirect_full_content.article_url}" + ), + existing_zim_paths={ZimPath("kiwix.org/somepage")}, + ), + "", + "", + no_js_notify, + ) + .rewrite(rewrite_meta_http_equiv_redirect_full_content.input_str) + .content + == rewrite_meta_http_equiv_redirect_full_content.expected_str + ) + + +rules = HTMLRewritingRules() + + +@rules.drop_attribute() +def drop_all_named_attribute(attr_name: str): + return attr_name == "all_named" + + +@rules.drop_attribute() +def drop_all_tag_name_attribute(tag: str): + return tag == "all_tag" + + +@rules.drop_attribute() +def drop_tag_name_attribute(tag: str, attr_name: str): + return tag == "drop_tag" and attr_name == "drop_name" + + +@rules.drop_attribute() +def drop_attr_name_and_value_attribute(attr_name: str, attr_value: str | None): + return ( + attr_name == "drop_value" + and attr_value is not None + and attr_value.startswith("drop") + ) + + +@rules.drop_attribute() +def drop_if_other_attribute(attr_name: str, attrs: AttrsList): + return attr_name == "drop_if_other" and any( + other_name == "other" for other_name, _ in attrs + ) + + +@pytest.mark.parametrize( + "tag, attr_name, attr_value, attrs, should_drop", + [ + pytest.param("all_tag", "foo", "bar", [], True, id="drop_by_tag_name"), + pytest.param("other_tag", "foo", "bar", [], False, id="dont_drop_by_tag_name"), + pytest.param("foo", "all_named", "bar", [], True, id="drop_by_attr_name"), + pytest.param( + "foo", "other_name", "bar", [], False, id="dont_drop_by_attr_name" + ), + pytest.param( + "drop_tag", "drop_name", "bar", [], True, id="drop_by_tag_and_attr_name" + ), + pytest.param( + "drop_tag", "foo", "bar", [], False, id="dont_drop_by_tag_and_attr_name" + ), + pytest.param("foo", "drop_value", "drop_me", [], True, id="drop_by_attr_value"), + pytest.param( + "foo", "drop_value", "dont_drop", [], False, id="dont_drop_by_attr_value" + ), + pytest.param( + "foo", 
"drop_value", "dont_drop", [], False, id="dont_drop_by_attr_value" + ), + pytest.param( + "foo", + "drop_if_other", + "bar", + [("foo", None), ("other", "foo"), ("bar", "foo")], + True, + id="drop_if_other", + ), + pytest.param( + "foo", + "drop_if_other", + "bar", + [("foo", None), ("bar", "foo")], + False, + id="dont_drop_if_not_other", + ), + ], +) +def test_html_drop_rules( + tag: str, + attr_name: str, + attr_value: str | None, + attrs: AttrsList, + *, + should_drop: bool, +): + assert ( + rules.do_drop_attribute( + tag=tag, attr_name=attr_name, attr_value=attr_value, attrs=attrs + ) + is should_drop + ) + + +def test_bad_html_drop_rules_argument_name(): + bad_rules = HTMLRewritingRules() + + with pytest.raises(TypeError, match="Parameter .* is unsupported in function"): + + @bad_rules.drop_attribute() + def bad_signature(foo: str) -> bool: + return foo == "bar" + + +def test_bad_html_drop_rules_argument_type(): + bad_rules = HTMLRewritingRules() + + with pytest.raises(TypeError, match="Parameter .* in function .* must be of type"): + + @bad_rules.drop_attribute() + def bad_signature( # pyright: ignore[reportUnusedFunction] + attr_name: int, + ) -> bool: + return attr_name == 123 + + +@rules.rewrite_attribute() +def rewrite_tag_value(attr_name: str) -> AttrNameAndValue | None: + if attr_name != "aname": + return + return (attr_name, "foo") + + +@rules.rewrite_attribute() +def rewrite_tag_name(attr_name: str, attr_value: str | None) -> AttrNameAndValue | None: + if attr_name != "bad_name": + return + return ("good_name", attr_value) + + +@rules.rewrite_attribute() +def rewrite_call_notify( + attr_name: str, + notify_js_module: Callable[[ZimPath], None], +) -> AttrNameAndValue | None: + if attr_name != "call_notify": + return + notify_js_module(ZimPath("foo")) + return + + +@rules.rewrite_attribute() +def rewrite_value_with_base_href( + attr_name: str, + base_href: str | None, +) -> AttrNameAndValue | None: + if attr_name != "get_base_href": + return + return 
(attr_name, base_href) + + +@rules.rewrite_attribute() +def rewrite_attr2_value_with_attr1_value( + attr_name: str, + attrs: AttrsList, +) -> AttrNameAndValue | None: + if attr_name != "attr2": + return + return (attr_name, get_attr_value_from(attrs, "attr1")) + + +@pytest.mark.parametrize( + "tag, attr_name, attr_value, attrs, base_href, expected_result, should_notify", + [ + pytest.param( + "foo", + "aname", + "bar", + [], + "", + ("aname", "foo"), + False, + id="rewrite_tag_value", + ), + pytest.param( + "foo", + "bad_name", + "bar", + [], + "", + ("good_name", "bar"), + False, + id="rewrite_tag_name", + ), + pytest.param( + "foo", + "call_notify", + "bar", + [], + "", + ("call_notify", "bar"), + True, + id="call_notify", + ), + pytest.param( + "foo", + "get_base_href", + "bar", + [], + "base_href_value", + ("get_base_href", "base_href_value"), + False, + id="rewrite_value_with_base_href", + ), + pytest.param( + "foo", + "attr2", + "bar", + [("attr1", "value1")], + "base_href_value", + ("attr2", "value1"), + False, + id="rewrite_attr2_value_with_attr1_value", + ), + ], +) +def test_html_attribute_rewrite_rules( + tag: str, + attr_name: str, + attr_value: str | None, + attrs: AttrsList, + base_href: str, + expected_result: AttrNameAndValue, + *, + should_notify: bool, + simple_url_rewriter_gen: Callable[[str], ArticleUrlRewriter], + js_rewriter_gen: Callable[ + [ArticleUrlRewriter, str | None, Callable[[ZimPath], None]], JsRewriter + ], + css_rewriter_gen: Callable[[ArticleUrlRewriter, str | None], CssRewriter], +): + notified_paths: list[ZimPath] = [] + + def notify(path: ZimPath): + notified_paths.append(path) + + url_rewriter = simple_url_rewriter_gen("http://www.example.com") + js_rewriter = js_rewriter_gen(url_rewriter, base_href, notify) + css_rewriter = css_rewriter_gen(url_rewriter, base_href) + + assert ( + rules.do_attribute_rewrite( + tag=tag, + attr_name=attr_name, + attr_value=attr_value, + attrs=attrs, + js_rewriter=js_rewriter, + 
css_rewriter=css_rewriter, + url_rewriter=url_rewriter, + base_href=base_href, + notify_js_module=notify, + ) + == expected_result + ) + assert (len(notified_paths) > 0) == should_notify + + +def test_bad_html_attribute_rewrite_rules_argument_name(): + bad_rules = HTMLRewritingRules() + + with pytest.raises(TypeError, match="Parameter .* is unsupported in function"): + + @bad_rules.rewrite_attribute() + def bad_signature( # pyright: ignore[reportUnusedFunction] + foo: str, + ) -> AttrNameAndValue | None: + return (foo, "bar") + + +def test_bad_html_attribute_rewrite_rules_argument_type(): + bad_rules = HTMLRewritingRules() + + with pytest.raises(TypeError, match="Parameter .* in function .* must be of type"): + + @bad_rules.rewrite_attribute() + def bad_signature( # pyright: ignore[reportUnusedFunction] + attr_name: int, + ) -> AttrNameAndValue | None: + return (f"{attr_name}", "bar") + + +@rules.rewrite_tag() +def rewrite1_tag( + tag: str, +) -> str | None: + if tag != "rewrite1": + return + return "" + + +@rules.rewrite_tag() +def rewrite2_tag( + tag: str, + attrs: AttrsList, + *, + auto_close: bool, +) -> str | None: + if tag != "rewrite2": + return + + return ( + f"' if auto_close else '>'}" + ) + + +@pytest.mark.parametrize( + "tag, attrs, auto_close, expected_result", + [ + pytest.param( + "foo", + [], + False, + None, + id="do_not_rewrite_foo_tag", + ), + pytest.param( + "rewrite1", + [("attr2", "value2")], + False, + "", + id="rewrite1_tag", + ), + pytest.param( + "rewrite2", + [("attr2", "value2")], + False, + '', + id="rewrite2_tag_no_close", + ), + pytest.param( + "rewrite2", + [("attr2", "value2")], + True, + '', + id="rewrite2_tag_auto_close", + ), + ], +) +def test_html_tag_rewrite_rules( + tag: str, + attrs: AttrsList, + *, + auto_close: bool, + expected_result: str | None, +): + assert ( + rules.do_tag_rewrite( + tag=tag, + attrs=attrs, + auto_close=auto_close, + ) + == expected_result + ) + + +def test_bad_html_tag_rewrite_rules_argument_name(): + 
bad_rules = HTMLRewritingRules() + + with pytest.raises(TypeError, match="Parameter .* is unsupported in function"): + + @bad_rules.rewrite_tag() + def bad_signature(foo: str) -> str: # pyright: ignore[reportUnusedFunction] + return foo + + +def test_bad_html_tag_rewrite_rules_argument_type(): + bad_rules = HTMLRewritingRules() + + with pytest.raises(TypeError, match="Parameter .* in function .* must be of type"): + + @bad_rules.rewrite_tag() + def bad_signature(attrs: int) -> str: # pyright: ignore[reportUnusedFunction] + return f"{attrs}" + + +@rules.rewrite_data() +def rewrite_data_html_rewrite_context( + html_rewrite_context: str | None, +) -> str | None: + if html_rewrite_context != "rewrite": + return + return "rewritten data" + + +@pytest.mark.parametrize( + "html_rewrite_context, base_href, data, expected_result", + [ + pytest.param( + "foo", + "bar", + "something", + None, + id="do_not_rewrite_foo_context", + ), + pytest.param( + None, + "bar", + "something", + None, + id="do_not_rewrite_none_context", + ), + pytest.param( + "rewrite", + "bar", + "something", + "rewritten data", + id="rewrite_data_html_rewrite_context", + ), + ], +) +def test_html_data_rewrite_rules( + html_rewrite_context: str | None, + base_href: str, + data: str, + *, + expected_result: str | None, + simple_url_rewriter_gen: Callable[[str], ArticleUrlRewriter], + js_rewriter_gen: Callable[ + [ArticleUrlRewriter, str | None, Callable[[ZimPath], None]], JsRewriter + ], + css_rewriter_gen: Callable[[ArticleUrlRewriter, str | None], CssRewriter], +): + notified_paths: list[ZimPath] = [] + + def notify(path: ZimPath): + notified_paths.append(path) + + url_rewriter = simple_url_rewriter_gen("http://www.example.com") + js_rewriter = js_rewriter_gen(url_rewriter, base_href, notify) + css_rewriter = css_rewriter_gen(url_rewriter, base_href) + + assert ( + rules.do_data_rewrite( + html_rewrite_context=html_rewrite_context, + data=data, + css_rewriter=css_rewriter, + js_rewriter=js_rewriter, + 
url_rewriter=url_rewriter, + ) + == expected_result + ) + + +def test_bad_html_data_rewrite_rules_argument_name(): + bad_rules = HTMLRewritingRules() + + with pytest.raises(TypeError, match="Parameter .* is unsupported in function"): + + @bad_rules.rewrite_data() + def bad_signature( # pyright: ignore[reportUnusedFunction] + foo: str, + ) -> str | None: + return foo + + +def test_bad_html_data_rewrite_rules_argument_type(): + bad_rules = HTMLRewritingRules() + + with pytest.raises(TypeError, match="Parameter .* in function .* must be of type"): + + @bad_rules.rewrite_data() + def bad_signature( # pyright: ignore[reportUnusedFunction] + data: int, + ) -> str | None: + return f"{data}" + + +@pytest.mark.parametrize( + "tag, attr_name, attr_value, attrs, expected_result", + [ + pytest.param( + "meta", + "content", + "1;url=http://www.example.com/somewhere", + [("http-equiv", "refresh")], + ("content", "1;url=http://www.example.com/somewhererewritten"), + id="nomimal_case", + ), + pytest.param( + "meta", + "content", + " 1 ; url = http://www.example.com/somewhere ", + [("http-equiv", "refresh")], + ("content", "1;url=http://www.example.com/somewhererewritten"), + id="nomimal_case_with_spaces", + ), + pytest.param( + "foo", + "content", + "1;url=http://www.example.com/somewhere", + [("http-equiv", "refresh")], + None, + id="do_not_rewrite_foo_tag", + ), + pytest.param( + "meta", + "foo", + "1;url=http://www.example.com/somewhere", + [("http-equiv", "refresh")], + None, + id="do_not_rewrite_foo_attribute", + ), + pytest.param( + "meta", + "content", + "1;url=http://www.example.com/somewhere", + [("http-equiv", "foo")], + None, + id="do_not_rewrite_http_equiv_not_refresh", + ), + pytest.param( + "meta", + "content", + "1;url=http://www.example.com/somewhere", + [], + None, + id="do_not_rewrite_no_http_equiv", + ), + pytest.param( + "meta", + "content", + None, + [("http-equiv", "refresh")], + None, + id="do_not_rewrite_missing_attribute", + ), + pytest.param( + "meta", + 
"content", + "", + [("http-equiv", "refresh")], + None, + id="do_not_rewrite_empty_attribute", + ), + pytest.param( + "meta", + "content", + "1", + [("http-equiv", "refresh")], + None, + id="do_not_rewrite_attribute_without_url", + ), + pytest.param( + "meta", + "content", + "1;foo=http://www.example.com/somewhere", + [("http-equiv", "refresh")], + None, + id="do_not_rewrite_bad_attribute", + ), + ], +) +def test_rewrite_meta_http_equiv_redirect_rule( + tag: str, + attr_name: str, + attr_value: str | None, + attrs: AttrsList, + expected_result: AttrNameAndValue | None, + simple_url_rewriter_gen: Callable[[str, str], ArticleUrlRewriter], +): + url_rewriter = simple_url_rewriter_gen("http://www.example.com", "rewritten") + + assert ( + rewrite_meta_http_equiv_redirect( + tag=tag, + attr_name=attr_name, + attr_value=attr_value, + attrs=attrs, + url_rewriter=url_rewriter, + base_href=None, + ) + == expected_result + ) diff --git a/tests/rewriting/test_js_rewriting.py b/tests/rewriting/test_js_rewriting.py new file mode 100644 index 00000000..1ee7ceea --- /dev/null +++ b/tests/rewriting/test_js_rewriting.py @@ -0,0 +1,402 @@ +from collections.abc import Callable + +import pytest + +from zimscraperlib.rewriting.js import JsRewriter +from zimscraperlib.rewriting.url_rewriting import ( + ArticleUrlRewriter, + HttpUrl, + ZimPath, +) + +from .utils import ContentForTests + + +@pytest.fixture +def simple_js_rewriter( + simple_url_rewriter_gen: Callable[[str], ArticleUrlRewriter], + no_js_notify: Callable[[ZimPath], None], +) -> JsRewriter: + return JsRewriter( + url_rewriter=simple_url_rewriter_gen("http://www.example.com"), + base_href=None, + notify_js_module=no_js_notify, + ) + + +@pytest.fixture( + params=[ + "a = this;", + "return this.location", + 'func(Function("return this"));', + "'a||this||that", + "(a,b,Q.contains(i[t], this))", + "a = this.location.href; exports.Foo = Foo; /* export className */", + ] +) +def rewrite_this_js_content(request: 
pytest.FixtureRequest): + content = request.param + yield ContentForTests( + input_=content, + expected=content.replace( + "this", "_____WB$wombat$check$this$function_____(this)" + ), + ) + + +def test_this_js_rewrite( + simple_js_rewriter: JsRewriter, rewrite_this_js_content: ContentForTests +): + assert ( + simple_js_rewriter.rewrite(rewrite_this_js_content.input_str) + == rewrite_this_js_content.expected_str + ) + + +@pytest.fixture( + params=[ + "aaa.this.window=red", + "aaa. this.window=red", + "aaa$this.window=red", + "a = this.color;", + "return this.color", + 'func(Function("return this.color"));', + "'a||this.color||that", + "(a,b,Q.contains(i[t], this.color))", + "a = this.color.href; exports.Foo = Foo; /* export className */", + ] +) +def no_rewrite_this_js_content(request: pytest.FixtureRequest): + content = request.param + yield ContentForTests(input_=content) + + +def test_this_no_js_rewrite( + simple_js_rewriter: JsRewriter, no_rewrite_this_js_content: ContentForTests +): + assert ( + simple_js_rewriter.rewrite(no_rewrite_this_js_content.input_str) + == no_rewrite_this_js_content.expected_str + ) + + +# This test probably has to be fixed but spec is blurry +# See https://github.com/openzim/warc2zim/issues/410 +def test_this_js_rewrite_newline(simple_js_rewriter: JsRewriter): + assert ( + simple_js_rewriter.rewrite("aaa\n this.window=red") + == "aaa\n ;_____WB$wombat$check$this$function_____(this).window=red" + ) + + +def test_js_rewrite_bytes_inline(simple_js_rewriter: JsRewriter): + assert ( + simple_js_rewriter.rewrite(b"a=123;\nb=456;", opts={"inline": True}) + == "a=123; b=456;" + ) + + +def test_js_rewrite_post_message(simple_js_rewriter: JsRewriter): + assert ( + simple_js_rewriter.rewrite(b"a.postMessage(") == "a.__WB_pmw(self).postMessage(" + ) + + +class WrappedTestContent(ContentForTests): + + def __init__( + self, + input_: str | bytes, + expected: str | bytes | None = None, + article_url: str = "https://kiwix.org", + ) -> None: + 
super().__init__(input_=input_, expected=expected, article_url=article_url) + self.expected = self.wrap_script(self.expected_str) + + @staticmethod + def wrap_script(text: str) -> str: + """ + A small wrapper to help generate the expected content. + + JsRewriter must add this local definition around all js code (when we access on + of the local varibles) + """ + return ( + "var _____WB$wombat$assign$function_____ = function(name) {return (self." + "_wb_wombat && self._wb_wombat.local_init && self._wb_wombat.local_init" + "(name)) || self[name]; };\n" + "if (!self.__WB_pmw) { self.__WB_pmw = function(obj) { this.__WB_source =" + " obj; return this; } }\n" + "{\n" + 'let window = _____WB$wombat$assign$function_____("window");\n' + 'let globalThis = _____WB$wombat$assign$function_____("globalThis");\n' + 'let self = _____WB$wombat$assign$function_____("self");\n' + 'let document = _____WB$wombat$assign$function_____("document");\n' + 'let location = _____WB$wombat$assign$function_____("location");\n' + 'let top = _____WB$wombat$assign$function_____("top");\n' + 'let parent = _____WB$wombat$assign$function_____("parent");\n' + 'let frames = _____WB$wombat$assign$function_____("frames");\n' + 'let opener = _____WB$wombat$assign$function_____("opener");\n' + "let arguments;\n" + "\n" + f"{text}" + "\n" + "}" + ) + + +@pytest.fixture( + params=[ + WrappedTestContent( + input_="location = http://example.com/", + expected="location = ((self.__WB_check_loc && " + "self.__WB_check_loc(location, argument" + "s)) || {}).href = http://example.com/", + ), + WrappedTestContent( + input_=" location = http://example.com/2", + expected=" location = ((self.__WB_check_loc && " + "self.__WB_check_loc(location, arguments)) || {}).href = " + "http://example.com/2", + ), + WrappedTestContent(input_="func(location = 0)", expected="func(location = 0)"), + WrappedTestContent( + input_=" location = http://example.com/2", + expected=" location = ((self.__WB_check_loc && " + 
"self.__WB_check_loc(location, arguments)) || {}).href = " + "http://example.com/2", + ), + WrappedTestContent(input_="window.eval(a)", expected="window.eval(a)"), + WrappedTestContent( + input_="x = window.eval; x(a);", expected="x = window.eval; x(a);" + ), + WrappedTestContent( + input_="this. location = 'http://example.com/'", + expected="this. location = 'http://example.com/'", + ), + WrappedTestContent( + input_="if (self.foo) { console.log('blah') }", + expected="if (self.foo) { console.log('blah') }", + ), + WrappedTestContent(input_="window.x = 5", expected="window.x = 5"), + ] +) +def rewrite_wrapped_content(request: pytest.FixtureRequest): + yield request.param + + +def test_wrapped_rewrite( + simple_js_rewriter: JsRewriter, rewrite_wrapped_content: WrappedTestContent +): + assert ( + simple_js_rewriter.rewrite(rewrite_wrapped_content.input_str) + == rewrite_wrapped_content.expected_str + ) + + +class ImportTestContent(ContentForTests): + + def __init__( + self, + input_: str | bytes, + expected: str | bytes | None = None, + article_url: str = "https://kiwix.org", + ) -> None: + super().__init__(input_=input_, expected=expected, article_url=article_url) + self.article_url = "https://exemple.com/some/path/" + self.expected = self.wrap_import(self.expected_str) + + @staticmethod + # We want to import js stored in zim file as `_zim_static/__wb_module_decl.js` from + # `https://exemple.com/some/path/` so path is + # `../../../_zim_static/__wb_module_decl.js` + def wrap_import(text: str) -> str: + """ + A small wrapper to help us generate the expected content for modules. 
+ + JsRewriter must add this import line at beginning of module codes (when code + contains `import` or `export`) + """ + return ( + "import { window, globalThis, self, document, location, top, parent, " + 'frames, opener } from "../../../_zim_static/__wb_module_decl.js";\n' + f"{text}" + ) + + +@pytest.fixture( + params=[ + # import rewrite + ImportTestContent( + input_="""import "foo"; + +a = this.location""", + expected="""import "foo"; + +a = _____WB$wombat$check$this$function_____(this).location""", + ), + # import/export module rewrite + ImportTestContent( + input_="""a = this.location + +export { a }; +""", + expected="""a = _____WB$wombat$check$this$function_____(this).location + +export { a }; +""", + ), + # rewrite ESM module import + ImportTestContent( + input_='import "https://example.com/file.js"', + expected='import "../../../example.com/file.js"', + ), + ImportTestContent( + input_=''' +import {A, B} + from + "https://example.com/file.js"''', + expected=''' +import {A, B} + from + "../../../example.com/file.js"''', + ), + ImportTestContent( + input_=""" +import * from "https://example.com/file.js" +import A from "http://example.com/path/file2.js"; + +import {C, D} from "./abc.js"; +import {X, Y} from "../parent.js"; +import {E, F, G} from "/path.js"; +import { Z } from "../../../path.js"; + +B = await import(somefile); +""", + expected=""" +import * from "../../../example.com/file.js" +import A from "../../../example.com/path/file2.js"; + +import {C, D} from "./abc.js"; +import {X, Y} from "../parent.js"; +import {E, F, G} from "../../path.js"; +import { Z } from "../../path.js"; + +B = await ____wb_rewrite_import__(import.meta.url, somefile); +""", + ), + ImportTestContent( + input_='import"import.js";import{A, B, C} from"test.js";(function() => ' + '{ frames[0].href = "/abc"; })', + expected='import"import.js";import{A, B, C} from"test.js";(function() => ' + '{ frames[0].href = "/abc"; })', + ), + ImportTestContent( + input_="""a = location + 
+export{ a, $ as b}; +""", + expected="""a = location + +export{ a, $ as b}; +""", + ), + ] +) +def rewrite_import_content(request: pytest.FixtureRequest): + yield request.param + + +def test_import_rewrite( + no_js_notify: Callable[[ZimPath], None], rewrite_import_content: ImportTestContent +): + url_rewriter = ArticleUrlRewriter( + article_url=HttpUrl(rewrite_import_content.article_url) + ) + assert ( + JsRewriter( + url_rewriter=url_rewriter, base_href=None, notify_js_module=no_js_notify + ).rewrite(rewrite_import_content.input_str, opts={"isModule": True}) + == rewrite_import_content.expected_str + ) + + +@pytest.fixture( + params=[ + "return this.abc", + "return this object", + "a = 'some, this object'", + "{foo: bar, this: other}", + "this.$location = http://example.com/", + "this. $location = http://example.com/", + "this. _location = http://example.com/", + "this. alocation = http://example.com/", + "this.location = http://example.com/", + ",eval(a)", + "this.$eval(a)", + "x = $eval; x(a);", + "obj = { eval : 1 }", + "x = obj.eval", + "x = obj.eval(a)", + "x = obj._eval(a)", + "x = obj.$eval(a)", + "if (a.self.foo) { console.log('blah') }", + "a.window.x = 5", + " postMessage({'a': 'b'})", + "simport(5);", + "a.import(5);", + "$import(5);", + "async import(val) { ... 
}", + """function blah() { + const text = "text: import a from B.js"; +} +""", + """function blah() { + const text = ` +import a from "https://example.com/B.js" +`; +} + +""", + "let a = 7; var b = 5; const foo = 4;\n\n", + ] +) +def no_rewrite_js_content(request: pytest.FixtureRequest): + yield request.param + + +def test_no_rewrite(simple_js_rewriter: JsRewriter, no_rewrite_js_content: str): + assert simple_js_rewriter.rewrite(no_rewrite_js_content) == no_rewrite_js_content + + +@pytest.mark.parametrize( + "js_src,expected_js_module_path", + [ + ("./my-module-script.js", "kiwix.org/my_folder/my-module-script.js"), + ("../my-module-script.js", "kiwix.org/my-module-script.js"), + ("../../../my-module-script.js", "kiwix.org/my-module-script.js"), + ("/my-module-script.js", "kiwix.org/my-module-script.js"), + ("//myserver.com/my-module-script.js", "myserver.com/my-module-script.js"), + ( + "https://myserver.com/my-module-script.js", + "myserver.com/my-module-script.js", + ), + ], +) +def test_js_rewrite_nested_module_detected(js_src: str, expected_js_module_path: str): + + js_modules: list[ZimPath] = [] + + def custom_notify(zim_path: ZimPath): + js_modules.append(zim_path) + + url_rewriter = ArticleUrlRewriter( + article_url=HttpUrl("http://kiwix.org/my_folder/my_article.html") + ) + + JsRewriter( + url_rewriter=url_rewriter, base_href=None, notify_js_module=custom_notify + ).rewrite(f'import * from "{js_src}"', opts={"isModule": True}) + + assert len(js_modules) == 1 + assert js_modules[0].value == expected_js_module_path diff --git a/tests/rewriting/test_rx_replacer.py b/tests/rewriting/test_rx_replacer.py new file mode 100644 index 00000000..a2e24efe --- /dev/null +++ b/tests/rewriting/test_rx_replacer.py @@ -0,0 +1,107 @@ +import re +from collections.abc import Callable + +import pytest + +from zimscraperlib.rewriting.rx_replacer import ( + RxRewriter, + TransformationAction, + add_prefix, + add_suffix, + replace, + replace_all, + replace_prefix_from, +) + + 
+@pytest.fixture() +def compiled_rule() -> re.Pattern[str]: + return re.compile("") + + +@pytest.mark.parametrize( + "operation, operand1, expected_result", + [ + (add_suffix, "456", "pre456post"), + (add_prefix, "456", "pre456post"), + (replace_all, "456", "pre456post"), + ], +) +def test_actions_one( + compiled_rule: re.Pattern[str], + operation: Callable[[str], TransformationAction], + operand1: str, + expected_result: str, +): + def wrapped(operation: Callable[[str], TransformationAction], operand1: str): + def wraper(match: re.Match[str]): + return operation(operand1)(match, {}) + + return wraper + + assert ( + compiled_rule.sub(wrapped(operation, operand1), "prepost") + == expected_result + ) + + +@pytest.mark.parametrize( + "operation, operand1, operand2, expected_result", + [ + (replace_prefix_from, "456", "pl", "prepost"), + ], +) +def test_actions_two( + compiled_rule: re.Pattern[str], + operation: Callable[[str, str], TransformationAction], + operand1: str, + operand2: str, + expected_result: str, +): + def wrapped( + operation: Callable[[str, str], TransformationAction], + operand1: str, + operand2: str, + ): + def wraper(match: re.Match[str]): + return operation(operand1, operand2)(match, {}) + + return wraper + + assert ( + compiled_rule.sub(wrapped(operation, operand1, operand2), "prepost") + == expected_result + ) + + +@pytest.mark.parametrize( + "text, expected", + [ + ("prepost", "prepost"), + (b"prepost", "prepost"), + ("foo", "f456"), + ("bar", "bar"), + ("blu", "blu"), + ], +) +def test_rx_rewriter(text: str, expected: str): + rewriter = RxRewriter( + rules=[ + (re.compile("foo"), replace("oo", "456")), + (re.compile("bar"), replace("oo", "456")), + (re.compile(""), replace("pla", "123")), + ] + ) + assert rewriter.rewrite(text) == expected + + +def test_rx_rewriter_no_rules(): + rewriter = RxRewriter() + rewriter._compile_rules( + [ + (re.compile(""), replace("pla", "123")), + ] + ) + assert rewriter.rewrite("prepost") == "prepost" diff --git 
a/tests/rewriting/test_url_rewriting.py b/tests/rewriting/test_url_rewriting.py new file mode 100644 index 00000000..c14e9f84 --- /dev/null +++ b/tests/rewriting/test_url_rewriting.py @@ -0,0 +1,805 @@ +import pytest + +from zimscraperlib.rewriting.url_rewriting import ( + ArticleUrlRewriter, + HttpUrl, + ZimPath, +) + + +class TestNormalize: + + @pytest.mark.parametrize( + "url,zim_path", + [ + ("https://exemple.com", "exemple.com/"), + ("https://exemple.com/", "exemple.com/"), + ("http://example.com/resource", "example.com/resource"), + ("http://example.com/resource/", "example.com/resource/"), + ( + "http://example.com/resource/folder/sub.txt", + "example.com/resource/folder/sub.txt", + ), + ( + "http://example.com/resource/folder/sub", + "example.com/resource/folder/sub", + ), + ( + "http://example.com/resource/folder/sub?foo=bar", + "example.com/resource/folder/sub?foo=bar", + ), + ( + "http://example.com/resource/folder/sub?foo=bar#anchor1", + "example.com/resource/folder/sub?foo=bar", + ), + ("http://example.com/resource/#anchor1", "example.com/resource/"), + ("http://example.com/resource/?foo=bar", "example.com/resource/?foo=bar"), + ("http://example.com#anchor1", "example.com/"), + ("http://example.com?foo=bar#anchor1", "example.com/?foo=bar"), + ("http://example.com/?foo=bar", "example.com/?foo=bar"), + ("http://example.com/?foo=ba+r", "example.com/?foo=ba r"), + ( + "http://example.com/?foo=ba r", + "example.com/?foo=ba r", + ), # situation where the ` ` has not been properly escaped in document + ("http://example.com/?foo=ba%2Br", "example.com/?foo=ba+r"), + ("http://example.com/?foo=ba+%2B+r", "example.com/?foo=ba + r"), + ("http://example.com/#anchor1", "example.com/"), + ( + "http://example.com/some/path/http://example.com//some/path", + "example.com/some/path/http:/example.com/some/path", + ), + ( + "http://example.com/some/pa?th/http://example.com//some/path", + "example.com/some/pa?th/http:/example.com/some/path", + ), + ( + 
"http://example.com/so?me/pa?th/http://example.com//some/path", + "example.com/so?me/pa?th/http:/example.com/some/path", + ), + ("http://example.com/resource?", "example.com/resource"), + ("http://example.com/resource#", "example.com/resource"), + ("http://user@example.com/resource", "example.com/resource"), + ("http://user:password@example.com/resource", "example.com/resource"), + ("http://example.com:8080/resource", "example.com/resource"), + ( + "http://foobargooglevideo.com/videoplayback?id=1576&key=value", + "youtube.fuzzy.replayweb.page/videoplayback?id=1576", + ), # Fuzzy rule is applied in addition to path transformations + ("https://xn--exmple-cva.com", "exémple.com/"), + ("https://xn--exmple-cva.com/", "exémple.com/"), + ("https://xn--exmple-cva.com/resource", "exémple.com/resource"), + ("https://exémple.com/", "exémple.com/"), + ("https://exémple.com/resource", "exémple.com/resource"), + # host_ip is an invalid hostname according to spec + ("https://host_ip/", "host_ip/"), + ("https://host_ip/resource", "host_ip/resource"), + ("https://192.168.1.1/", "192.168.1.1/"), + ("https://192.168.1.1/resource", "192.168.1.1/resource"), + ("http://example.com/res%24urce", "example.com/res$urce"), + ( + "http://example.com/resource?foo=b%24r", + "example.com/resource?foo=b$r", + ), + ("http://example.com/resource@300x", "example.com/resource@300x"), + ("http://example.com:8080/resource", "example.com/resource"), + ("http://user@example.com:8080/resource", "example.com/resource"), + ("http://user:password@example.com:8080/resource", "example.com/resource"), + # the two URI below are an illustration of a potential collision (two + # differents URI leading to the same ZIM path) + ( + "http://tmp.kiwix.org/ci/test-website/images/urlencoding1_ico%CC%82ne-" + "de%CC%81buter-Solidarite%CC%81-Nume%CC%81rique_1%40300x.png", + "tmp.kiwix.org/ci/test-website/images/urlencoding1_icône-débuter-" + "Solidarité-Numérique_1@300x.png", + ), + ( + 
"https://tmp.kiwix.org/ci/test-website/images/urlencoding1_ico%CC%82ne-" + "de%CC%81buter-Solidarite%CC%81-Nume%CC%81rique_1@300x.png", + "tmp.kiwix.org/ci/test-website/images/urlencoding1_icône-débuter-" + "Solidarité-Numérique_1@300x.png", + ), + ], + ) + def test_normalize(self, url, zim_path): + assert ( + ArticleUrlRewriter.normalize(HttpUrl(url)).value == ZimPath(zim_path).value + ) + + def test_normalize_bad_arg( + self, + ): + with pytest.raises( + ValueError, match="Bad argument type passed, HttpUrl expected" + ): + ArticleUrlRewriter.normalize( + "https://www.acme.com" # pyright: ignore[reportArgumentType] + ) + + +class TestArticleUrlRewriter: + @pytest.mark.parametrize( + "original_content_url, expected_missing_zim_paths", + [ + ( + "foo.html", + set(), + ), + ( + "bar.html", + {ZimPath("kiwix.org/a/article/bar.html")}, + ), + ], + ) + def test_missing_zim_paths( + self, + original_content_url: str, + expected_missing_zim_paths: set[ZimPath], + ): + http_article_url = HttpUrl("https://kiwix.org/a/article/document.html") + missing_zim_paths = set() + rewriter = ArticleUrlRewriter( + article_url=http_article_url, + existing_zim_paths={ZimPath("kiwix.org/a/article/foo.html")}, + missing_zim_paths=missing_zim_paths, + ) + rewriter(original_content_url, base_href=None, rewrite_all_url=False) + assert missing_zim_paths == expected_missing_zim_paths + + @pytest.mark.parametrize( + "article_url, original_content_url, expected_rewriten_content_url, know_paths, " + "rewrite_all_url", + [ + ( + "https://kiwix.org/a/article/document.html", + "foo.html", + "foo.html", + ["kiwix.org/a/article/foo.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "foo.html#anchor1", + "foo.html#anchor1", + ["kiwix.org/a/article/foo.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "foo.html?foo=bar", + "foo.html%3Ffoo%3Dbar", + ["kiwix.org/a/article/foo.html?foo=bar"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + 
"foo.html?foo=b%24ar", + "foo.html%3Ffoo%3Db%24ar", + ["kiwix.org/a/article/foo.html?foo=b$ar"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "foo.html?foo=b%3Far", # a query string with an encoded ? char in value + "foo.html%3Ffoo%3Db%3Far", + ["kiwix.org/a/article/foo.html?foo=b?ar"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "fo%o.html", + "fo%25o.html", + ["kiwix.org/a/article/fo%o.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "foé.html", # URL not matching RFC 3986 (found in invalid HTML doc) + "fo%C3%A9.html", # character is encoded so that URL match RFC 3986 + ["kiwix.org/a/article/foé.html"], # but ZIM path is non-encoded + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "./foo.html", + "foo.html", + ["kiwix.org/a/article/foo.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "../foo.html", + "https://kiwix.org/a/foo.html", # Full URL since not in known URLs + ["kiwix.org/a/article/foo.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "../foo.html", + "../foo.html", # all URLs rewrite activated + ["kiwix.org/a/article/foo.html"], + True, + ), + ( + "https://kiwix.org/a/article/document.html", + "../foo.html", + "../foo.html", + ["kiwix.org/a/foo.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "../bar/foo.html", + "https://kiwix.org/a/bar/foo.html", # Full URL since not in known URLs + ["kiwix.org/a/article/foo.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "../bar/foo.html", + "../bar/foo.html", # all URLs rewrite activated + ["kiwix.org/a/article/foo.html"], + True, + ), + ( + "https://kiwix.org/a/article/document.html", + "../bar/foo.html", + "../bar/foo.html", + ["kiwix.org/a/bar/foo.html"], + False, + ), + ( # we cannot go upper than host, so '../' in excess are removed + "https://kiwix.org/a/article/document.html", + "../../../../../foo.html", + 
"../../foo.html", + ["kiwix.org/foo.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "foo?param=value", + "foo%3Fparam%3Dvalue", + ["kiwix.org/a/article/foo?param=value"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "foo?param=value%2F", + "foo%3Fparam%3Dvalue/", + ["kiwix.org/a/article/foo?param=value/"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "foo?param=value%2Fend", + "foo%3Fparam%3Dvalue/end", + ["kiwix.org/a/article/foo?param=value/end"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "foo/", + "foo/", + ["kiwix.org/a/article/foo/"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "/fo o.html", + "../../fo%20o.html", + ["kiwix.org/fo o.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "/fo+o.html", + "../../fo%2Bo.html", + ["kiwix.org/fo+o.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "/fo%2Bo.html", + "../../fo%2Bo.html", + ["kiwix.org/fo+o.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "/foo.html?param=val+ue", + "../../foo.html%3Fparam%3Dval%20ue", + ["kiwix.org/foo.html?param=val ue"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "/fo~o.html", + "../../fo~o.html", + ["kiwix.org/fo~o.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "/fo-o.html", + "../../fo-o.html", + ["kiwix.org/fo-o.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "/fo_o.html", + "../../fo_o.html", + ["kiwix.org/fo_o.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "/fo%7Eo.html", # must not be encoded / must be decoded (RFC 3986 #2.3) + "../../fo~o.html", + ["kiwix.org/fo~o.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "/fo%2Do.html", # must not be encoded / must be decoded (RFC 3986 #2.3) + "../../fo-o.html", + ["kiwix.org/fo-o.html"], + False, + ), + ( + 
"https://kiwix.org/a/article/document.html", + "/fo%5Fo.html", # must not be encoded / must be decoded (RFC 3986 #2.3) + "../../fo_o.html", + ["kiwix.org/fo_o.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "/foo%2Ehtml", # must not be encoded / must be decoded (RFC 3986 #2.3) + "../../foo.html", + ["kiwix.org/foo.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "#anchor1", + "#anchor1", + ["kiwix.org/a/article/document.html"], + False, + ), + ( + "https://kiwix.org/a/article/", + "#anchor1", + "#anchor1", + ["kiwix.org/a/article/"], + False, + ), + ( + "https://kiwix.org/a/article/", + "../article/", + "./", + ["kiwix.org/a/article/"], + False, + ), + ], + ) + def test_relative_url( + self, + article_url: str, + know_paths: list[str], + original_content_url: str, + expected_rewriten_content_url: str, + *, + rewrite_all_url: bool, + ): + http_article_url = HttpUrl(article_url) + rewriter = ArticleUrlRewriter( + article_url=http_article_url, + existing_zim_paths={ZimPath(path) for path in know_paths}, + ) + assert ( + rewriter( + original_content_url, base_href=None, rewrite_all_url=rewrite_all_url + ) + == expected_rewriten_content_url + ) + + @pytest.mark.parametrize( + "article_url, original_content_url, expected_rewriten_content_url, know_paths, " + "rewrite_all_url", + [ + ( + "https://kiwix.org/a/article/document.html", + "/foo.html", + "../../foo.html", + ["kiwix.org/foo.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "/bar.html", + "https://kiwix.org/bar.html", # Full URL since not in known URLs + ["kiwix.org/foo.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "/bar.html", + "../../bar.html", # all URLs rewrite activated + ["kiwix.org/foo.html"], + True, + ), + ], + ) + def test_absolute_path_url( + self, + article_url: str, + know_paths: list[str], + original_content_url: str, + expected_rewriten_content_url: str, + *, + rewrite_all_url: bool, + ): + 
http_article_url = HttpUrl(article_url) + rewriter = ArticleUrlRewriter( + article_url=http_article_url, + existing_zim_paths={ZimPath(path) for path in know_paths}, + ) + assert ( + rewriter( + original_content_url, base_href=None, rewrite_all_url=rewrite_all_url + ) + == expected_rewriten_content_url + ) + + @pytest.mark.parametrize( + "article_url, original_content_url, expected_rewriten_content_url, know_paths, " + "rewrite_all_url", + [ + ( + "https://kiwix.org/a/article/document.html", + "//kiwix.org/foo.html", + "../../foo.html", + ["kiwix.org/foo.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "//kiwix.org/bar.html", + "https://kiwix.org/bar.html", # Full URL since not in known URLs + ["kiwix.org/foo.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "//kiwix.org/bar.html", + "../../bar.html", # all URLs rewrite activated + ["kiwix.org/foo.html"], + True, + ), + ( + "https://kiwix.org/a/article/document.html", + "//acme.com/foo.html", + "../../../acme.com/foo.html", + ["acme.com/foo.html"], + False, + ), + ( + "http://kiwix.org/a/article/document.html", + "//acme.com/bar.html", + "http://acme.com/bar.html", # Full URL since not in known URLs + ["kiwix.org/foo.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "//acme.com/bar.html", + "../../../acme.com/bar.html", # all URLs rewrite activated + ["kiwix.org/foo.html"], + True, + ), + ( # puny-encoded host is transformed into url-encoded value + "https://kiwix.org/a/article/document.html", + "//xn--exmple-cva.com/a/article/document.html", + "../../../ex%C3%A9mple.com/a/article/document.html", + ["exémple.com/a/article/document.html"], + False, + ), + ( # host who should be puny-encoded is transformed into url-encoded value + "https://kiwix.org/a/article/document.html", + "//exémple.com/a/article/document.html", + "../../../ex%C3%A9mple.com/a/article/document.html", + ["exémple.com/a/article/document.html"], + False, + ), + ], + ) + def 
test_absolute_scheme_url( + self, + article_url: str, + know_paths: list[str], + original_content_url: str, + expected_rewriten_content_url: str, + *, + rewrite_all_url: bool, + ): + http_article_url = HttpUrl(article_url) + rewriter = ArticleUrlRewriter( + article_url=http_article_url, + existing_zim_paths={ZimPath(path) for path in know_paths}, + ) + assert ( + rewriter( + original_content_url, base_href=None, rewrite_all_url=rewrite_all_url + ) + == expected_rewriten_content_url + ) + + @pytest.mark.parametrize( + "article_url, original_content_url, expected_rewriten_content_url, know_paths, " + "rewrite_all_url", + [ + ( + "https://kiwix.org/a/article/document.html", + "https://foo.org/a/article/document.html", + "../../../foo.org/a/article/document.html", + ["foo.org/a/article/document.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "http://foo.org/a/article/document.html", + "../../../foo.org/a/article/document.html", + ["foo.org/a/article/document.html"], + False, + ), + ( + "http://kiwix.org/a/article/document.html", + "https://foo.org/a/article/document.html", + "../../../foo.org/a/article/document.html", + ["foo.org/a/article/document.html"], + False, + ), + ( + "http://kiwix.org/a/article/document.html", + "https://user:password@foo.org:8080/a/article/document.html", + "../../../foo.org/a/article/document.html", + ["foo.org/a/article/document.html"], + False, + ), + ( # Full URL since not in known URLs + "https://kiwix.org/a/article/document.html", + "https://foo.org/a/article/document.html", + "https://foo.org/a/article/document.html", + ["kiwix.org/a/article/foo/"], + False, + ), + ( # all URLs rewrite activated + "https://kiwix.org/a/article/document.html", + "https://foo.org/a/article/document.html", + "../../../foo.org/a/article/document.html", + ["kiwix.org/a/article/foo/"], + True, + ), + ( # puny-encoded host is transformed into url-encoded value + "https://kiwix.org/a/article/document.html", + 
"https://xn--exmple-cva.com/a/article/document.html", + "../../../ex%C3%A9mple.com/a/article/document.html", + ["exémple.com/a/article/document.html"], + False, + ), + ( # host who should be puny-encoded is transformed into url-encoded value + "https://kiwix.org/a/article/document.html", + "https://exémple.com/a/article/document.html", + "../../../ex%C3%A9mple.com/a/article/document.html", + ["exémple.com/a/article/document.html"], + False, + ), + ], + ) + def test_absolute_url( + self, + article_url: str, + know_paths: list[str], + original_content_url: str, + expected_rewriten_content_url: str, + *, + rewrite_all_url: bool, + ): + http_article_url = HttpUrl(article_url) + rewriter = ArticleUrlRewriter( + article_url=http_article_url, + existing_zim_paths={ZimPath(path) for path in know_paths}, + ) + assert ( + rewriter( + original_content_url, base_href=None, rewrite_all_url=rewrite_all_url + ) + == expected_rewriten_content_url + ) + + @pytest.mark.parametrize( + "original_content_url, rewrite_all_url", + [ + ("data:0548datacontent", False), + ("blob:exemple.com/url", False), + ("mailto:bob@acme.com", False), + ("tel:+33.1.12.12.23", False), + ("data:0548datacontent", True), + ("blob:exemple.com/url", True), + ("mailto:bob@acme.com", True), + ("tel:+33.1.12.12.23", True), + ], + ) + # other schemes are never rewritten, even when rewrite_all_url is true + def test_no_rewrite_other_schemes( + self, original_content_url: str, *, rewrite_all_url: bool + ): + article_url = HttpUrl("https://kiwix.org/a/article/document.html") + rewriter = ArticleUrlRewriter(article_url=article_url) + assert ( + rewriter( + original_content_url, base_href=None, rewrite_all_url=rewrite_all_url + ) + == original_content_url + ) + + @pytest.mark.parametrize( + "original_content_url, know_path, base_href, expected_rewriten_content_url", + [ + pytest.param( + "foo.html", + "kiwix.org/a/article/foo.html", + None, + "foo.html", + id="no_base", + ), + pytest.param( + "foo.html", + 
"kiwix.org/a/foo.html", + "../", + "../foo.html", + id="parent_base", + ), + pytest.param( + "foo.html", + "kiwix.org/a/bar/foo.html", + "../bar/", + "../bar/foo.html", + id="base_in_another_folder", + ), + pytest.param( + "foo.html", + "www.example.com/foo.html", + "https://www.example.com/", + "../../../www.example.com/foo.html", + id="base_on_absolute_url", + ), + ], + ) + def test_base_href( + self, + original_content_url: str, + know_path: str, + base_href: str, + expected_rewriten_content_url: str, + ): + rewriter = ArticleUrlRewriter( + article_url=HttpUrl("https://kiwix.org/a/article/document.html"), + existing_zim_paths={ZimPath(know_path)}, + ) + assert ( + rewriter(original_content_url, base_href=base_href, rewrite_all_url=False) + == expected_rewriten_content_url + ) + + +class TestHttpUrl: + + @pytest.mark.parametrize( + "http_url", + [("https://bob@acme.com"), ("http://bob@acme.com"), ("hTtPs://bob@acme.com")], + ) + def test_good_http_urls(self, http_url: str): + HttpUrl(http_url) + + @pytest.mark.parametrize( + "http_url", + [("mailto:bob@acme.com"), ("tel:+41.34.34"), ("mailto:https://bob@acme.com")], + ) + def test_bad_http_urls_scheme(self, http_url: str): + with pytest.raises(ValueError, match="Incorrect HttpUrl scheme in value"): + HttpUrl(http_url) + + def test_http_urls_eq(self): + assert HttpUrl("http://bob@acme.com") == HttpUrl("http://bob@acme.com") + + def test_http_urls_hash(self): + assert ( + HttpUrl("http://bob@acme.com").__hash__() + == HttpUrl("http://bob@acme.com").__hash__() + ) + + def test_http_urls_str(self): + assert str(HttpUrl("http://bob@acme.com")) == "HttpUrl(http://bob@acme.com)" + assert f"{HttpUrl("http://bob@acme.com")}" == "HttpUrl(http://bob@acme.com)" + + def test_bad_http_urls_no_host(self): + with pytest.raises(ValueError, match="Unsupported empty hostname in value"): + HttpUrl("https:///bob/index.html") + + def test_bad_http_urls_no_upper(self): + with pytest.raises( + ValueError, match="Unsupported upper-case 
chars in hostname" + ): + HttpUrl("https://aCmE.COM/index.html") + + +class TestZimPath: + + @pytest.mark.parametrize( + "path", + [ + ("content/index.html"), + ("index.html"), + ], + ) + def test_good_zim_path(self, path: str): + ZimPath(path) + + @pytest.mark.parametrize( + "path", + [ + ("https://bob@acme.com"), + ("http://bob@acme.com"), + ("mailto:bob@acme.com"), + ("tel:+41.34.34"), + ("mailto:https://bob@acme.com"), + ], + ) + def test_bad_zim_path_scheme(self, path: str): + with pytest.raises(ValueError, match="Unexpected scheme in value"): + ZimPath(path) + + @pytest.mark.parametrize( + "path", + [ + ("//acme.com/content/index.html"), + ], + ) + def test_bad_zim_path_hostname(self, path: str): + with pytest.raises(ValueError, match="Unexpected hostname in value"): + ZimPath(path) + + @pytest.mark.parametrize( + "path", + [ + ("//bob@/content/index.html"), + ], + ) + def test_bad_zim_path_user(self, path: str): + with pytest.raises(ValueError, match="Unexpected username in value"): + ZimPath(path) + + @pytest.mark.parametrize( + "path", + [ + ("//:pass@/content/index.html"), + ], + ) + def test_bad_zim_path_pass(self, path: str): + with pytest.raises(ValueError, match="Unexpected password in value"): + ZimPath(path) + + def test_zim_path_eq(self): + assert ZimPath("content/index.html") == ZimPath("content/index.html") + + def test_zim_path_hash(self): + assert ( + ZimPath("content/index.html").__hash__() + == ZimPath("content/index.html").__hash__() + ) + + def test_zim_path_str(self): + assert str(ZimPath("content/index.html")) == "ZimPath(content/index.html)" + assert f"{ZimPath("content/index.html")}" == "ZimPath(content/index.html)" diff --git a/tests/rewriting/utils.py b/tests/rewriting/utils.py new file mode 100644 index 00000000..ee320bd5 --- /dev/null +++ b/tests/rewriting/utils.py @@ -0,0 +1,35 @@ +class ContentForTests: + + def __init__( + self, + input_: str | bytes, + expected: str | bytes | None = None, + article_url: str = "kiwix.org", + ) -> 
None: + self.input_ = input_ + self.expected = expected if expected is not None else input_ + self.article_url = article_url + + @property + def input_str(self) -> str: + if isinstance(self.input_, str): + return self.input_ + raise ValueError("Input value is not a str.") + + @property + def input_bytes(self) -> bytes: + if isinstance(self.input_, bytes): + return self.input_ + raise ValueError("Input value is not a bytes.") + + @property + def expected_str(self) -> str: + if isinstance(self.expected, str): + return self.expected + raise ValueError("Expected value is not a str.") + + @property + def expected_bytes(self) -> bytes: + if isinstance(self.expected, bytes): + return self.expected + raise ValueError("Expected value is not a bytes.") diff --git a/tests/zim/test_zim_creator.py b/tests/zim/test_zim_creator.py index 2d641691..17425f02 100644 --- a/tests/zim/test_zim_creator.py +++ b/tests/zim/test_zim_creator.py @@ -425,9 +425,10 @@ def do_GET(self): fpath = tmp_path / "test.zim" try: - with tempfile.TemporaryDirectory() as tmp_dir, Creator( - fpath, "" - ).config_dev_metadata() as creator: + with ( + tempfile.TemporaryDirectory() as tmp_dir, + Creator(fpath, "").config_dev_metadata() as creator, + ): tmp_dir = pathlib.Path(tmp_dir) # noqa: PLW2901 creator.add_item( URLItem( diff --git a/yarn.lock b/yarn.lock new file mode 100644 index 00000000..22071e3f --- /dev/null +++ b/yarn.lock @@ -0,0 +1,248 @@ +# THIS IS AN AUTOGENERATED FILE. DO NOT EDIT THIS FILE DIRECTLY. 
+# yarn lockfile v1 + + +"@babel/code-frame@^7.0.0", "@babel/code-frame@^7.25.7": + version "7.25.7" + resolved "https://registry.yarnpkg.com/@babel/code-frame/-/code-frame-7.25.7.tgz#438f2c524071531d643c6f0188e1e28f130cebc7" + integrity sha512-0xZJFNE5XMpENsgfHYTw8FbX4kv53mFLn2i3XPoq69LyhYSCBJtitaHx9QnsVTrsogI4Z3+HtEfZ2/GFPOtf5g== + dependencies: + "@babel/highlight" "^7.25.7" + picocolors "^1.0.0" + +"@babel/generator@^7.25.7": + version "7.25.7" + resolved "https://registry.yarnpkg.com/@babel/generator/-/generator-7.25.7.tgz#de86acbeb975a3e11ee92dd52223e6b03b479c56" + integrity sha512-5Dqpl5fyV9pIAD62yK9P7fcA768uVPUyrQmqpqstHWgMma4feF1x/oFysBCVZLY5wJ2GkMUCdsNDnGZrPoR6rA== + dependencies: + "@babel/types" "^7.25.7" + "@jridgewell/gen-mapping" "^0.3.5" + "@jridgewell/trace-mapping" "^0.3.25" + jsesc "^3.0.2" + +"@babel/helper-string-parser@^7.25.7": + version "7.25.7" + resolved "https://registry.yarnpkg.com/@babel/helper-string-parser/-/helper-string-parser-7.25.7.tgz#d50e8d37b1176207b4fe9acedec386c565a44a54" + integrity sha512-CbkjYdsJNHFk8uqpEkpCvRs3YRp9tY6FmFY7wLMSYuGYkrdUi7r2lc4/wqsvlHoMznX3WJ9IP8giGPq68T/Y6g== + +"@babel/helper-validator-identifier@^7.25.7": + version "7.25.7" + resolved "https://registry.yarnpkg.com/@babel/helper-validator-identifier/-/helper-validator-identifier-7.25.7.tgz#77b7f60c40b15c97df735b38a66ba1d7c3e93da5" + integrity sha512-AM6TzwYqGChO45oiuPqwL2t20/HdMC1rTPAesnBCgPCSF1x3oN9MVUwQV2iyz4xqWrctwK5RNC8LV22kaQCNYg== + +"@babel/highlight@^7.25.7": + version "7.25.7" + resolved "https://registry.yarnpkg.com/@babel/highlight/-/highlight-7.25.7.tgz#20383b5f442aa606e7b5e3043b0b1aafe9f37de5" + integrity sha512-iYyACpW3iW8Fw+ZybQK+drQre+ns/tKpXbNESfrhNnPLIklLbXr7MYJ6gPEd0iETGLOK+SxMjVvKb/ffmk+FEw== + dependencies: + "@babel/helper-validator-identifier" "^7.25.7" + chalk "^2.4.2" + js-tokens "^4.0.0" + picocolors "^1.0.0" + +"@babel/parser@^7.25.7", "@babel/parser@^7.7.0": + version "7.25.8" + resolved 
"https://registry.yarnpkg.com/@babel/parser/-/parser-7.25.8.tgz#f6aaf38e80c36129460c1657c0762db584c9d5e2" + integrity sha512-HcttkxzdPucv3nNFmfOOMfFf64KgdJVqm1KaCm25dPGMLElo9nsLvXeJECQg8UzPuBGLyTSA0ZzqCtDSzKTEoQ== + dependencies: + "@babel/types" "^7.25.8" + +"@babel/template@^7.25.7": + version "7.25.7" + resolved "https://registry.yarnpkg.com/@babel/template/-/template-7.25.7.tgz#27f69ce382855d915b14ab0fe5fb4cbf88fa0769" + integrity sha512-wRwtAgI3bAS+JGU2upWNL9lSlDcRCqD05BZ1n3X2ONLH1WilFP6O1otQjeMK/1g0pvYcXC7b/qVUB1keofjtZA== + dependencies: + "@babel/code-frame" "^7.25.7" + "@babel/parser" "^7.25.7" + "@babel/types" "^7.25.7" + +"@babel/traverse@^7.7.0": + version "7.25.7" + resolved "https://registry.yarnpkg.com/@babel/traverse/-/traverse-7.25.7.tgz#83e367619be1cab8e4f2892ef30ba04c26a40fa8" + integrity sha512-jatJPT1Zjqvh/1FyJs6qAHL+Dzb7sTb+xr7Q+gM1b+1oBsMsQQ4FkVKb6dFlJvLlVssqkRzV05Jzervt9yhnzg== + dependencies: + "@babel/code-frame" "^7.25.7" + "@babel/generator" "^7.25.7" + "@babel/parser" "^7.25.7" + "@babel/template" "^7.25.7" + "@babel/types" "^7.25.7" + debug "^4.3.1" + globals "^11.1.0" + +"@babel/types@^7.25.7", "@babel/types@^7.25.8", "@babel/types@^7.7.0": + version "7.25.8" + resolved "https://registry.yarnpkg.com/@babel/types/-/types-7.25.8.tgz#5cf6037258e8a9bcad533f4979025140cb9993e1" + integrity sha512-JWtuCu8VQsMladxVz/P4HzHUGCAwpuqacmowgXFs5XjxIgKuNjnLokQzuVjlTvIzODaDmpjT3oxcC48vyk9EWg== + dependencies: + "@babel/helper-string-parser" "^7.25.7" + "@babel/helper-validator-identifier" "^7.25.7" + to-fast-properties "^2.0.0" + +"@jridgewell/gen-mapping@^0.3.5": + version "0.3.5" + resolved "https://registry.yarnpkg.com/@jridgewell/gen-mapping/-/gen-mapping-0.3.5.tgz#dcce6aff74bdf6dad1a95802b69b04a2fcb1fb36" + integrity sha512-IzL8ZoEDIBRWEzlCcRhOaCupYyN5gdIK+Q6fbFdPDg6HqX6jpkItn7DFIpW9LQzXG6Df9sA7+OKnq0qlz/GaQg== + dependencies: + "@jridgewell/set-array" "^1.2.1" + "@jridgewell/sourcemap-codec" "^1.4.10" + "@jridgewell/trace-mapping" "^0.3.24" + 
+"@jridgewell/resolve-uri@^3.1.0": + version "3.1.2" + resolved "https://registry.yarnpkg.com/@jridgewell/resolve-uri/-/resolve-uri-3.1.2.tgz#7a0ee601f60f99a20c7c7c5ff0c80388c1189bd6" + integrity sha512-bRISgCIjP20/tbWSPWMEi54QVPRZExkuD9lJL+UIxUKtwVJA8wW1Trb1jMs1RFXo1CBTNZ/5hpC9QvmKWdopKw== + +"@jridgewell/set-array@^1.2.1": + version "1.2.1" + resolved "https://registry.yarnpkg.com/@jridgewell/set-array/-/set-array-1.2.1.tgz#558fb6472ed16a4c850b889530e6b36438c49280" + integrity sha512-R8gLRTZeyp03ymzP/6Lil/28tGeGEzhx1q2k703KGWRAI1VdvPIXdG70VJc2pAMw3NA6JKL5hhFu1sJX0Mnn/A== + +"@jridgewell/sourcemap-codec@^1.4.10", "@jridgewell/sourcemap-codec@^1.4.14": + version "1.5.0" + resolved "https://registry.yarnpkg.com/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.0.tgz#3188bcb273a414b0d215fd22a58540b989b9409a" + integrity sha512-gv3ZRaISU3fjPAgNsriBRqGWQL6quFx04YMPW/zD8XMLsU32mhCCbfbO6KZFLjvYpCZ8zyDEgqsgf+PwPaM7GQ== + +"@jridgewell/trace-mapping@^0.3.24", "@jridgewell/trace-mapping@^0.3.25": + version "0.3.25" + resolved "https://registry.yarnpkg.com/@jridgewell/trace-mapping/-/trace-mapping-0.3.25.tgz#15f190e98895f3fc23276ee14bc76b675c2e50f0" + integrity sha512-vNk6aEwybGtawWmy/PzwnGDOjCkLWSD2wqvjGGAgOAwCGWySYXfYoxt00IJkTF+8Lb57DwOb3Aa0o9CApepiYQ== + dependencies: + "@jridgewell/resolve-uri" "^3.1.0" + "@jridgewell/sourcemap-codec" "^1.4.14" + +ansi-styles@^3.2.1: + version "3.2.1" + resolved "https://registry.yarnpkg.com/ansi-styles/-/ansi-styles-3.2.1.tgz#41fbb20243e50b12be0f04b8dedbf07520ce841d" + integrity sha512-VT0ZI6kZRdTh8YyJw3SMbYm/u+NqfsAxEpWO0Pf9sq8/e94WxxOpPKx9FR1FlyCtOVDNOQ+8ntlqFxiRc+r5qA== + dependencies: + color-convert "^1.9.0" + +babel-eslint@^10.1.0: + version "10.1.0" + resolved "https://registry.yarnpkg.com/babel-eslint/-/babel-eslint-10.1.0.tgz#6968e568a910b78fb3779cdd8b6ac2f479943232" + integrity sha512-ifWaTHQ0ce+448CYop8AdrQiBsGrnC+bMgfyKFdi6EsPLTAWG+QfyDeM6OH+FmWnKvEq5NnBMLvlBUPKQZoDSg== + dependencies: + "@babel/code-frame" "^7.0.0" + 
"@babel/parser" "^7.7.0" + "@babel/traverse" "^7.7.0" + "@babel/types" "^7.7.0" + eslint-visitor-keys "^1.0.0" + resolve "^1.12.0" + +chalk@^2.4.2: + version "2.4.2" + resolved "https://registry.yarnpkg.com/chalk/-/chalk-2.4.2.tgz#cd42541677a54333cf541a49108c1432b44c9424" + integrity sha512-Mti+f9lpJNcwF4tWV8/OrTTtF1gZi+f8FqlyAdouralcFWFQWF2+NgCHShjkCb+IFBLq9buZwE1xckQU4peSuQ== + dependencies: + ansi-styles "^3.2.1" + escape-string-regexp "^1.0.5" + supports-color "^5.3.0" + +color-convert@^1.9.0: + version "1.9.3" + resolved "https://registry.yarnpkg.com/color-convert/-/color-convert-1.9.3.tgz#bb71850690e1f136567de629d2d5471deda4c1e8" + integrity sha512-QfAUtd+vFdAtFQcC8CCyYt1fYWxSqAiK2cSD6zDB8N3cpsEBAvRxp9zOGg6G/SHHJYAT88/az/IuDGALsNVbGg== + dependencies: + color-name "1.1.3" + +color-name@1.1.3: + version "1.1.3" + resolved "https://registry.yarnpkg.com/color-name/-/color-name-1.1.3.tgz#a7d0558bd89c42f795dd42328f740831ca53bc25" + integrity sha512-72fSenhMw2HZMTVHeCA9KCmpEIbzWiQsjN+BHcBbS9vr1mtt+vJjPdksIBNUmKAW8TFUDPJK5SUU3QhE9NEXDw== + +debug@^4.3.1: + version "4.3.7" + resolved "https://registry.yarnpkg.com/debug/-/debug-4.3.7.tgz#87945b4151a011d76d95a198d7111c865c360a52" + integrity sha512-Er2nc/H7RrMXZBFCEim6TCmMk02Z8vLC2Rbi1KEBggpo0fS6l0S1nnapwmIi3yW/+GOJap1Krg4w0Hg80oCqgQ== + dependencies: + ms "^2.1.3" + +escape-string-regexp@^1.0.5: + version "1.0.5" + resolved "https://registry.yarnpkg.com/escape-string-regexp/-/escape-string-regexp-1.0.5.tgz#1b61c0562190a8dff6ae3bb2cf0200ca130b86d4" + integrity sha512-vbRorB5FUQWvla16U8R/qgaFIya2qGzwDrNmCZuYKrbdSUMG6I1ZCGQRefkRVhuOkIGVne7BQ35DSfo1qvJqFg== + +eslint-visitor-keys@^1.0.0: + version "1.3.0" + resolved "https://registry.yarnpkg.com/eslint-visitor-keys/-/eslint-visitor-keys-1.3.0.tgz#30ebd1ef7c2fdff01c3a4f151044af25fab0523e" + integrity sha512-6J72N8UNa462wa/KFODt/PJ3IU60SDpC3QXC1Hjc1BXXpfL2C9R5+AU7jhe0F6GREqVMh4Juu+NY7xn+6dipUQ== + +function-bind@^1.1.2: + version "1.1.2" + resolved 
"https://registry.yarnpkg.com/function-bind/-/function-bind-1.1.2.tgz#2c02d864d97f3ea6c8830c464cbd11ab6eab7a1c" + integrity sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA== + +globals@^11.1.0: + version "11.12.0" + resolved "https://registry.yarnpkg.com/globals/-/globals-11.12.0.tgz#ab8795338868a0babd8525758018c2a7eb95c42e" + integrity sha512-WOBp/EEGUiIsJSp7wcv/y6MO+lV9UoncWqxuFfm8eBwzWNgyfBd6Gz+IeKQ9jCmyhoH99g15M3T+QaVHFjizVA== + +has-flag@^3.0.0: + version "3.0.0" + resolved "https://registry.yarnpkg.com/has-flag/-/has-flag-3.0.0.tgz#b5d454dc2199ae225699f3467e5a07f3b955bafd" + integrity sha512-sKJf1+ceQBr4SMkvQnBDNDtf4TXpVhVGateu0t918bl30FnbE2m4vNLX+VWe/dpjlb+HugGYzW7uQXH98HPEYw== + +hasown@^2.0.2: + version "2.0.2" + resolved "https://registry.yarnpkg.com/hasown/-/hasown-2.0.2.tgz#003eaf91be7adc372e84ec59dc37252cedb80003" + integrity sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ== + dependencies: + function-bind "^1.1.2" + +is-core-module@^2.13.0: + version "2.15.1" + resolved "https://registry.yarnpkg.com/is-core-module/-/is-core-module-2.15.1.tgz#a7363a25bee942fefab0de13bf6aa372c82dcc37" + integrity sha512-z0vtXSwucUJtANQWldhbtbt7BnL0vxiFjIdDLAatwhDYty2bad6s+rijD6Ri4YuYJubLzIJLUidCh09e1djEVQ== + dependencies: + hasown "^2.0.2" + +js-tokens@^4.0.0: + version "4.0.0" + resolved "https://registry.yarnpkg.com/js-tokens/-/js-tokens-4.0.0.tgz#19203fb59991df98e3a287050d4647cdeaf32499" + integrity sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ== + +jsesc@^3.0.2: + version "3.0.2" + resolved "https://registry.yarnpkg.com/jsesc/-/jsesc-3.0.2.tgz#bb8b09a6597ba426425f2e4a07245c3d00b9343e" + integrity sha512-xKqzzWXDttJuOcawBt4KnKHHIf5oQ/Cxax+0PWFG+DFDgHNAdi+TXECADI+RYiFUMmx8792xsMbbgXj4CwnP4g== + +ms@^2.1.3: + version "2.1.3" + resolved "https://registry.yarnpkg.com/ms/-/ms-2.1.3.tgz#574c8138ce1d2b5861f0b44579dbadd60c6615b2" + 
integrity sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA== + +path-parse@^1.0.7: + version "1.0.7" + resolved "https://registry.yarnpkg.com/path-parse/-/path-parse-1.0.7.tgz#fbc114b60ca42b30d9daf5858e4bd68bbedb6735" + integrity sha512-LDJzPVEEEPR+y48z93A0Ed0yXb8pAByGWo/k5YYdYgpY2/2EsOsksJrq7lOHxryrVOn1ejG6oAp8ahvOIQD8sw== + +picocolors@^1.0.0: + version "1.1.1" + resolved "https://registry.yarnpkg.com/picocolors/-/picocolors-1.1.1.tgz#3d321af3eab939b083c8f929a1d12cda81c26b6b" + integrity sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA== + +resolve@^1.12.0: + version "1.22.8" + resolved "https://registry.yarnpkg.com/resolve/-/resolve-1.22.8.tgz#b6c87a9f2aa06dfab52e3d70ac8cde321fa5a48d" + integrity sha512-oKWePCxqpd6FlLvGV1VU0x7bkPmmCNolxzjMf4NczoDnQcIWrAF+cPtZn5i6n+RfD2d9i0tzpKnG6Yk168yIyw== + dependencies: + is-core-module "^2.13.0" + path-parse "^1.0.7" + supports-preserve-symlinks-flag "^1.0.0" + +supports-color@^5.3.0: + version "5.5.0" + resolved "https://registry.yarnpkg.com/supports-color/-/supports-color-5.5.0.tgz#e2e69a44ac8772f78a1ec0b35b689df6530efc8f" + integrity sha512-QjVjwdXIt408MIiAqCX4oUKsgU2EqAGzs2Ppkm4aQYbjm+ZEWEcW4SfFNTr4uMNZma0ey4f5lgLrkB0aX0QMow== + dependencies: + has-flag "^3.0.0" + +supports-preserve-symlinks-flag@^1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/supports-preserve-symlinks-flag/-/supports-preserve-symlinks-flag-1.0.0.tgz#6eda4bd344a3c94aea376d4cc31bc77311039e09" + integrity sha512-ot0WnXS9fgdkgIcePe6RHNk1WA8+muPa6cSjeR3V8K27q9BB1rTE3R1p7Hv0z1ZyAc8s6Vvv8DIyWf681MAt0w== + +to-fast-properties@^2.0.0: + version "2.0.0" + resolved "https://registry.yarnpkg.com/to-fast-properties/-/to-fast-properties-2.0.0.tgz#dc5e698cbd079265bc73e0377681a4e4e83f616e" + integrity sha512-/OaKK0xYrs3DmxRYqL/yDc+FxFUVYhDlXMhRmv3z915w2HF1tnN1omB354j8VUGO/hbRzyD6Y3sA7v7GS/ceog==