diff --git a/.github/workflows/Publish.yaml b/.github/workflows/Publish.yaml
index b7e1859b..507a6314 100644
--- a/.github/workflows/Publish.yaml
+++ b/.github/workflows/Publish.yaml
@@ -1,20 +1,101 @@
-name: Build and upload to PyPI
+name: Build and publish to PyPI
on:
release:
types: [published]
jobs:
- publish:
- runs-on: ubuntu-22.04
+ generate-rules:
+ runs-on: ubuntu-24.04
+
+ steps:
+ - name: Checkout repo
+ uses: actions/checkout@v4
+
+ - name: Set up Python
+ uses: actions/setup-python@v5
+ with:
+ python-version-file: pyproject.toml
+ architecture: x64
+
+ - name: Install dependencies (and project)
+ run: |
+ pip install -U pip
+ pip install -e .[scripts]
+
+ - name: Generate rules
+ run: |
+ python rules/generate_rules.py
+
+ - name: Save rules artifact
+ uses: actions/upload-artifact@v4
+ with:
+ path: |
+ src/zimscraperlib/rewriting/rules.py
+ tests/rewriting/test_fuzzy_rules.py
+ javascript/src/fuzzyRules.js
+ javascript/test/fuzzyRules.js
+ name: rules
+ retention-days: 1
+
+ build-js:
+ runs-on: ubuntu-24.04
+ needs: generate-rules
+
+ steps:
+ - name: Checkout repo
+ uses: actions/checkout@v4
+
+ - name: Restore rules artifact
+ uses: actions/download-artifact@v4
+ with:
+ name: rules
+
+ - name: Setup Node.JS
+ uses: actions/setup-node@v4
+ with:
+ node-version-file: 'javascript/package.json'
+
+ - name: Install JS dependencies
+ run: yarn install
+ working-directory: javascript
+
+ - name: Build production JS
+ run: yarn build-prod
+ working-directory: javascript
+
+ - name: Save wombat-setup artifact
+ uses: actions/upload-artifact@v4
+ with:
+ path: javascript/dist/wombatSetup.js
+ name: wombat-setup
+ retention-days: 1
+
+ publish-python:
+ runs-on: ubuntu-24.04
+ needs:
+ - generate-rules # to have proper Python rules files (src and tests)
+ - build-js # to have proper wombatSetup.js (needs to be included in sdist)
permissions:
- id-token: write # mandatory for PyPI trusted publishing
+ id-token: write # mandatory for PyPI trusted publishing
steps:
- - uses: actions/checkout@v3
+ - name: Checkout repo
+ uses: actions/checkout@v4
+
+ - name: Restore rules artifact
+ uses: actions/download-artifact@v4
+ with:
+ name: rules
+
+ - name: Restore wombat-setup artifact
+   uses: actions/download-artifact@v4
+   with:
+     name: wombat-setup
+     path: src/zimscraperlib/rewriting/statics # destination directory, not a file name
- name: Set up Python
- uses: actions/setup-python@v4
+ uses: actions/setup-python@v5
with:
python-version-file: pyproject.toml
architecture: x64
@@ -24,5 +105,44 @@ jobs:
pip install -U pip build
python -m build --sdist --wheel
- - name: Upload to PyPI
+ - name: Publish to PyPI
uses: pypa/gh-action-pypi-publish@release/v1.8
+# OPTIONAL PUBLICATION TO NPM, NOT NEEDED BY SCRAPERS IN THE END
+
+# publish-js:
+# runs-on: ubuntu-24.04
+# needs:
+# - generate-rules
+
+# steps:
+# - name: Checkout repo
+# uses: actions/checkout@v4
+
+# - name: Restore rules artifact
+# uses: actions/download-artifact@v4
+# with:
+# name: rules
+
+# - name: Setup Node.JS
+# uses: actions/setup-node@v4
+# with:
+# node-version-file: 'javascript/package.json'
+# registry-url: 'https://registry.npmjs.org' # Setup .npmrc file to publish to npm
+
+# - name: Install JS dependencies
+# run: yarn install
+# working-directory: javascript
+
+# - name: Build production JS
+# run: yarn build-prod
+# working-directory: javascript
+
+# - name: Build JS package
+# run: yarn pack
+# working-directory: javascript
+
+# - name: Publish to NPM
+# run: npm publish $(ls *.tgz) --provenance --access public
+# working-directory: javascript
+# env:
+# NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
diff --git a/.github/workflows/PublishDev.yaml b/.github/workflows/PublishDev.yaml
new file mode 100644
index 00000000..8d3ef01b
--- /dev/null
+++ b/.github/workflows/PublishDev.yaml
@@ -0,0 +1,47 @@
+name: Publish dev wombat-setup
+
+on:
+ push:
+ branches:
+ - main
+
+jobs:
+ publish-dev-wombat-setup:
+ runs-on: ubuntu-24.04
+
+ steps:
+ - name: Checkout repo
+ uses: actions/checkout@v4
+
+ - name: Set up Python
+ uses: actions/setup-python@v5
+ with:
+ python-version-file: pyproject.toml
+ architecture: x64
+
+ - name: Install dependencies (and project)
+ run: |
+ pip install -U pip
+ pip install -e .[scripts]
+
+ - name: Generate rules
+ run: |
+ python rules/generate_rules.py
+
+ - name: Setup Node.JS
+ uses: actions/setup-node@v4
+ with:
+ node-version-file: 'javascript/package.json'
+ registry-url: 'https://registry.npmjs.org'
+
+ - name: Install JS dependencies
+ run: yarn install
+ working-directory: javascript
+
+ - name: Build production JS
+ run: yarn build-prod
+ working-directory: javascript
+
+ - name: Upload wombatSetup.js to dev drive
+ run: |
+ curl -f -u "${{ secrets.DEV_DRIVE_WEBDAV_CREDENTIALS }}" -T javascript/dist/wombatSetup.js -sw '%{http_code}' "https://dev.kiwix.org/zimscraperlib/"
diff --git a/.github/workflows/QA.yaml b/.github/workflows/QA.yaml
index 48ccee5a..31064c2f 100644
--- a/.github/workflows/QA.yaml
+++ b/.github/workflows/QA.yaml
@@ -7,14 +7,54 @@ on:
- main
jobs:
- check-qa:
- runs-on: ubuntu-22.04
+ generate-rules:
+ runs-on: ubuntu-24.04
steps:
- - uses: actions/checkout@v3
+ - name: Checkout repo
+ uses: actions/checkout@v4
- name: Set up Python
- uses: actions/setup-python@v4
+ uses: actions/setup-python@v5
+ with:
+ python-version-file: pyproject.toml
+ architecture: x64
+
+ - name: Install dependencies (and project)
+ run: |
+ pip install -U pip
+ pip install -e .[scripts]
+
+ - name: Generate rules
+ run: |
+ python rules/generate_rules.py
+
+ - name: Save rules artifact
+ uses: actions/upload-artifact@v4
+ with:
+ path: |
+ src/zimscraperlib/rewriting/rules.py
+ tests/rewriting/test_fuzzy_rules.py
+ javascript/src/fuzzyRules.js
+ javascript/test/fuzzyRules.js
+ name: rules
+ retention-days: 1
+
+ check-python-qa:
+ runs-on: ubuntu-24.04
+ needs: generate-rules
+
+ steps:
+ - name: Checkout repo
+ uses: actions/checkout@v4
+
+ - name: Restore rules artifact
+ uses: actions/download-artifact@v4
+ with:
+ name: rules
+
+ - name: Set up Python
+ uses: actions/setup-python@v5
with:
python-version-file: pyproject.toml
architecture: x64
@@ -32,3 +72,33 @@ jobs:
- name: Check pyright
run: inv check-pyright
+
+ check-javascript-qa:
+ runs-on: ubuntu-24.04
+ needs: generate-rules
+
+ steps:
+ - name: Checkout repo
+ uses: actions/checkout@v4
+
+ - name: Restore rules artifact
+ uses: actions/download-artifact@v4
+ with:
+ name: rules
+
+ - name: Setup Node.JS
+ uses: actions/setup-node@v4
+ with:
+ node-version-file: 'javascript/package.json'
+
+ - name: Install JS dependencies
+ working-directory: javascript
+ run: yarn install
+
+ - name: Check prettier formatting
+ working-directory: javascript
+ run: yarn prettier-check
+
+ - name: Check eslint rules
+ working-directory: javascript
+ run: yarn eslint
diff --git a/.github/workflows/Tests.yaml b/.github/workflows/Tests.yaml
index 0fd2de44..66e647fc 100644
--- a/.github/workflows/Tests.yaml
+++ b/.github/workflows/Tests.yaml
@@ -7,23 +7,59 @@ on:
- main
jobs:
- run-tests:
- strategy:
- matrix:
- os: [ubuntu-22.04]
- python: ["3.8", "3.9", "3.10", "3.11", "3.12"]
- runs-on: ${{ matrix.os }}
+ generate-rules:
+ runs-on: ubuntu-24.04
steps:
- - uses: actions/checkout@v3
+ - name: Checkout repo
+ uses: actions/checkout@v4
+
+ - name: Set up Python
+ uses: actions/setup-python@v5
+ with:
+ python-version-file: pyproject.toml
+ architecture: x64
+
+ - name: Install dependencies (and project)
+ run: |
+ pip install -U pip
+ pip install -e .[scripts]
+
+ - name: Generate rules
+ run: |
+ python rules/generate_rules.py
+
+ - name: Save rules artifact
+ uses: actions/upload-artifact@v4
+ with:
+ path: |
+ src/zimscraperlib/rewriting/rules.py
+ tests/rewriting/test_fuzzy_rules.py
+ javascript/src/fuzzyRules.js
+ javascript/test/fuzzyRules.js
+ name: rules
+ retention-days: 1
+
+ run-python-tests:
+ runs-on: ubuntu-24.04
+ needs: generate-rules
+
+ steps:
+ - name: Checkout repo
+ uses: actions/checkout@v4
+
+ - name: Restore rules artifact
+ uses: actions/download-artifact@v4
+ with:
+ name: rules
- name: install ffmpeg and gifsicle
run: sudo apt update && sudo apt install ffmpeg gifsicle
- - name: Set up Python ${{ matrix.python }}
- uses: actions/setup-python@v4
+ - name: Set up Python 3.12
+ uses: actions/setup-python@v5
with:
- python-version: ${{ matrix.python }}
+ python-version-file: pyproject.toml
architecture: x64
- name: Install dependencies (and project)
@@ -35,24 +71,50 @@ jobs:
run: inv coverage --args "--runslow --runinstalled -vvv"
- name: Upload coverage report to codecov
- if: matrix.python == '3.12'
- uses: codecov/codecov-action@v3
+ uses: codecov/codecov-action@v4
with:
fail_ci_if_error: true
token: ${{ secrets.CODECOV_TOKEN }}
- build_python:
- runs-on: ubuntu-22.04
- steps:
- - uses: actions/checkout@v3
-
- - name: Set up Python
- uses: actions/setup-python@v4
- with:
- python-version-file: pyproject.toml
- architecture: x64
-
- name: Ensure we can build Python targets
run: |
pip install -U pip build
python3 -m build --sdist --wheel
+
+ run-js-tests:
+ runs-on: ubuntu-24.04
+ needs: generate-rules
+
+ steps:
+ - name: Checkout repo
+ uses: actions/checkout@v4
+
+ - name: Restore rules artifact
+ uses: actions/download-artifact@v4
+ with:
+ name: rules
+
+ - name: Setup Node.JS
+ uses: actions/setup-node@v4
+ with:
+ node-version-file: 'javascript/package.json'
+
+ - name: Install JS dependencies
+ run: yarn install
+ working-directory: javascript
+
+ - name: Run JS tests
+ working-directory: javascript
+ run: yarn test
+
+ - name: Ensure we can build development JS
+ run: yarn build-dev
+ working-directory: javascript
+
+ - name: Ensure we can build production JS
+ run: yarn build-prod
+ working-directory: javascript
+
+ - name: Ensure we can build JS package
+ run: yarn pack
+ working-directory: javascript
diff --git a/.gitignore b/.gitignore
index 288bff6b..15586154 100644
--- a/.gitignore
+++ b/.gitignore
@@ -252,3 +252,16 @@ $RECYCLE.BIN/
# ignore all vscode, this is not standard configuration in this place
.vscode
src/libzim-stubs
+javascript/node_modules
+
+# rule files are generated by rules/generate_rules.py
+src/zimscraperlib/rewriting/rules.py
+tests/rewriting/test_fuzzy_rules.py
+javascript/src/fuzzyRules.js
+javascript/test/fuzzyRules.js
+
+# wombatSetup.js is generated with rollup
+src/zimscraperlib/rewriting/statics/wombatSetup.js
+
+# wombat.js is installed from online source
+src/zimscraperlib/rewriting/statics/wombat.js
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index e527d87f..8302a4ae 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -2,27 +2,34 @@
# See https://pre-commit.com/hooks.html for more hooks
exclude: ^tests/files # these are raw test files, no need to mess with them
repos:
-- repo: https://github.com/pre-commit/pre-commit-hooks
- rev: v4.5.0
- hooks:
- - id: trailing-whitespace
- - id: end-of-file-fixer
-- repo: https://github.com/psf/black
- rev: "24.4.2"
- hooks:
- - id: black
-- repo: https://github.com/astral-sh/ruff-pre-commit
- rev: v0.4.9
- hooks:
- - id: ruff
-- repo: https://github.com/RobertCraigie/pyright-python
- rev: v1.1.368
- hooks:
- - id: pyright
- name: pyright (system)
- description: 'pyright static type checker'
- entry: pyright
- language: system
- 'types_or': [python, pyi]
- require_serial: true
- minimum_pre_commit_version: '2.9.2'
+ - repo: https://github.com/pre-commit/pre-commit-hooks
+ rev: v5.0.0
+ hooks:
+ - id: trailing-whitespace
+ - id: end-of-file-fixer
+ - repo: https://github.com/psf/black
+ rev: '24.10.0'
+ hooks:
+ - id: black
+ - repo: https://github.com/astral-sh/ruff-pre-commit
+ rev: v0.7.0
+ hooks:
+ - id: ruff
+ - repo: https://github.com/RobertCraigie/pyright-python
+ rev: v1.1.385
+ hooks:
+ - id: pyright
+ name: pyright (system)
+ description: 'pyright static type checker'
+ entry: pyright
+ language: system
+ 'types_or': [python, pyi]
+ require_serial: true
+ minimum_pre_commit_version: '2.9.2'
+ - repo: https://github.com/pre-commit/mirrors-prettier
+ rev: v3.1.0
+ hooks:
+ - id: prettier
+ args:
+ - --config
+ - javascript/.prettierrc.json
diff --git a/README.md b/README.md
index 835832d2..dcf4f1be 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,4 @@
-zimscraperlib
-=============
+# zimscraperlib
[](https://github.com/openzim/python-scraperlib/actions?query=branch%3Amain)
[](https://www.codefactor.io/repository/github/openzim/python-scraperlib)
@@ -12,24 +11,26 @@ Collection of python code to re-use across python-based scrapers
# Usage
-* This library is meant to be installed via PyPI ([`zimscraperlib`](https://pypi.org/project/zimscraperlib/)).
-* Make sure to reference it using a version code as the API is subject to frequent changes.
-* API should remain the same only within the same *minor* version.
+- This library is meant to be installed via PyPI ([`zimscraperlib`](https://pypi.org/project/zimscraperlib/)).
+- Make sure to reference it using a version code as the API is subject to frequent changes.
+- API should remain the same only within the same _minor_ version.
Example usage:
-``` pip
+```pip
zimscraperlib>=1.1,<1.2
```
+See [functional architecture](docs/functional_architecture.md), [software architecture](docs/software_architecture.md) and [technical architecture](docs/technical_architecture.md) for more details on scraperlib (not all aspects are covered yet, this is a WIP).
+
# Dependencies
-* libmagic
-* wget
-* libzim (auto-installed, not available on Windows)
-* Pillow
-* FFmpeg
-* gifsicle (>=1.92)
+- libmagic
+- wget
+- libzim (auto-installed, not available on Windows)
+- Pillow
+- FFmpeg
+- gifsicle (>=1.92)
## macOS
@@ -47,6 +48,7 @@ sudo apt install libmagic1 wget ffmpeg \
```
## Alpine
+
```
apk add ffmpeg gifsicle libmagic wget libjpeg
```
@@ -69,15 +71,15 @@ invoke coverage
Non-exhaustive list of scrapers using it (check status when updating API):
-* [openzim/freecodecamp](https://github.com/openzim/freecodecamp)
-* [openzim/gutenberg](https://github.com/openzim/gutenberg)
-* [openzim/ifixit](https://github.com/openzim/ifixit)
-* [openzim/kolibri](https://github.com/openzim/kolibri)
-* [openzim/nautilus](https://github.com/openzim/nautilus)
-* [openzim/nautilus](https://github.com/openzim/nautilus)
-* [openzim/openedx](https://github.com/openzim/openedx)
-* [openzim/sotoki](https://github.com/openzim/sotoki)
-* [openzim/ted](https://github.com/openzim/ted)
-* [openzim/warc2zim](https://github.com/openzim/warc2zim)
-* [openzim/wikihow](https://github.com/openzim/wikihow)
-* [openzim/youtube](https://github.com/openzim/youtube)
+- [openzim/freecodecamp](https://github.com/openzim/freecodecamp)
+- [openzim/gutenberg](https://github.com/openzim/gutenberg)
+- [openzim/ifixit](https://github.com/openzim/ifixit)
+- [openzim/kolibri](https://github.com/openzim/kolibri)
+- [openzim/nautilus](https://github.com/openzim/nautilus)
+- [openzim/nautilus](https://github.com/openzim/nautilus)
+- [openzim/openedx](https://github.com/openzim/openedx)
+- [openzim/sotoki](https://github.com/openzim/sotoki)
+- [openzim/ted](https://github.com/openzim/ted)
+- [openzim/warc2zim](https://github.com/openzim/warc2zim)
+- [openzim/wikihow](https://github.com/openzim/wikihow)
+- [openzim/youtube](https://github.com/openzim/youtube)
diff --git a/docs/functional_architecture.md b/docs/functional_architecture.md
new file mode 100644
index 00000000..c6f99564
--- /dev/null
+++ b/docs/functional_architecture.md
@@ -0,0 +1,92 @@
+# Functional Architecture
+
+## Enrich libzim functions
+
+zimscraperlib has primitives to enrich libzim functions with some operations which are known to be shared across scrapers. See `zim` module.
+
+## Handle videos
+
+zimscraperlib has primitives to manipulate videos with some operations which are known to be shared across scrapers. See `video` module.
+
+## Handle pictures
+
+zimscraperlib has primitives to manipulate pictures with some operations which are known to be shared across scrapers. See `image` module.
+
+## Store and rewrite mostly unmodified HTML, CSS and JS from online website
+
+zimscraperlib also contains primitives to rewrite HTML, CSS and JS fetched online, so that they operate properly within a ZIM without heavy modifications. While originally developed for warc2zim, some of these primitives are now also used for the mindtouch scraper and others might follow, so they are shared in zimscraperlib. See `rewriting` module.
+
+### ZIM storage
+
+While storing web resources in a ZIM is mostly straightforward (we just transfer the raw bytes, after some modification for URL rewriting if needed), the decision of the path where the resource will be stored is very important.
+
+This is purely conventional, even if ZIM specification has to be respected for proper operation in readers.
+
+This function is responsible to compute the ZIM path where a given web resource is going to be stored.
+
+While the URL is the only driver of this computation for now, zimscraperlib might have to consider other contextual data in the future. E.g. the resource to serve might be dynamic, depending not only on URL query parameters but also header(s) value(s).
+
+### Fuzzy rules
+
+Unfortunately, it is not always possible / desirable to store the resource with a simple transformation.
+
+A typical situation is that some query parameters are dynamically computed by some Javascript code to include user tracking identifier, current datetime information, ...
+
+When running again the same javascript code inside the ZIM, the URL will hence be slightly different because context has changed, but the same content needs to be retrieved.
+
+zimscraperlib hence relies on fuzzy rules to transform/simplify some URLs when computing the ZIM path.
+
+### URL Rewriting
+
+zimscraperlib transforms (rewrites) URLs found in documents (HTML, CSS, JS, ...) so that they are usable inside the ZIM.
+
+#### General case
+
+One simple example is that we might have following code in an HTML document to load an image with an absolute URL:
+
+```
+<img src="https://en.wikipedia.org/wiki/File:Kiwix_logo_v3.svg" />
+```
+
+The URL `https://en.wikipedia.org/wiki/File:Kiwix_logo_v3.svg` has to be transformed to a URL that is usable inside the ZIM.
+
+For proper reader operation, openZIM prohibits using absolute URLs, so this has to be a relative URL. This relative URL is hence dependent on the location of the resource currently being rewritten.
+
+The table below gives some examples of what the rewritten URL is going to be, depending on the URL of the rewritten document.
+
+| HTML document URL | image URL rewritten for usage inside the ZIM |
+| ------------------------------------- | ---------------------------------------------------- |
+| `https://en.wikipedia.org/wiki/Kiwix` | `./File:Kiwix_logo_v3.svg` |
+| `https://en.wikipedia.org/wiki` | `./wiki/File:Kiwix_logo_v3.svg` |
+| `https://en.wikipedia.org/waka/Kiwix` | `../wiki/File:Kiwix_logo_v3.svg` |
+| `https://fr.wikipedia.org/wiki/Kiwix` | `../../en.wikipedia.org/wiki/File:Kiwix_logo_v3.svg` |
+
+As can be seen on the last line (but this is true for all URLs), this rewriting has to take into account the convention saying at which ZIM path a given web resource will be stored.
+
+#### Dynamic case
+
+The explanation above more or less assumed that the transformations can be done statically, i.e zimscraperlib can open every known document, find existing URLs and replace them with their counterpart inside the ZIM.
+
+While this is possible for HTML and CSS documents typically, it is not possible when the URL is dynamically computed. This is typically the case for JS documents, where in the general case the URL is not statically stored inside the JS code but computed on-the-fly by aggregating various strings and values.
+
+Rewriting these computations is not deemed feasible due to the huge variety of situations which might be encountered.
+
+A specific function is hence needed to rewrite URL **live in client browser**, intercept any function triggering a web request, transform the URL according to conventions (where we expect the resource to be located in the general case) and fuzzy rules.
+
+_Spoiler: this is where we will rely on wombat.js from webrecorder team, since this dynamic interception is quite complex and already done quite neatly by them_
+
+#### Fuzzy rules
+
+The same fuzzy rules that have been used to compute the ZIM path from a resource URL have to be applied again when rewriting URLs.
+
+While this is expected to serve mostly for the dynamic case, we still apply them on both sides (statically and dynamically) for consistency.
+
+### Documents rewritten statically
+
+For now zimscraperlib rewrites HTML, CSS and JS documents. For CSS and JS, this mainly consists in replacing URLs. For HTML, we also have more specific rewriting necessary (e.g. to handle base href or redirects with meta).
+
+No domain specific (DS) rules are applied like it is done in wabac.JS because these rules are already applied in Browsertrix Crawler. For the same reason, JSON is not rewritten anymore (URLs do not need to be rewritten in JSON because these URLs will be used by JS, intercepted by wombat and dynamically rewritten).
+
+JSONP callbacks are supposed to be rewritten but this has not been heavily tested.
+
+Other types of documents are supposed to be either not feasible / not worth it (e.g. URLs inside PDF documents), meaningless (e.g. images, fonts) or planned for later due to limited usage in the wild (e.g. XML).
diff --git a/docs/software_architecture.md b/docs/software_architecture.md
new file mode 100644
index 00000000..03dd49a2
--- /dev/null
+++ b/docs/software_architecture.md
@@ -0,0 +1,27 @@
+# Software architecture
+
+Currently only HTML, CSS and JS rewriting is described in this document.
+
+## HTML rewriting
+
+HTML rewriting is purely static (i.e. before resources are written to the ZIM). HTML code is parsed with the [HTML parser from Python standard library](https://docs.python.org/3/library/html.parser.html).
+
+A small header script is inserted in HTML code to initialize wombat.js which will wrap all JS APIs to dynamically rewrite URLs coming from JS.
+
+This header script is generated using [Jinja2](https://pypi.org/project/Jinja2/) template since it needs to populate some JS context variables needed by wombat.js operations (original scheme, original url, ...).
+
+## CSS rewriting
+
+CSS rewriting is purely static (i.e. before resources are written to the ZIM). CSS code is parsed with the [tinycss2 Python library](https://pypi.org/project/tinycss2/).
+
+## JS rewriting
+
+### Static
+
+Static JS rewriting is simply a matter of pure textual manipulation with regular expressions. No parsing is done at all.
+
+### Dynamic
+
+Dynamic JS rewriting is done with [wombat JS library](https://github.com/webrecorder/wombat). The same fuzzy rules that are used for static rewriting are injected into wombat configuration. Code to rewrite URLs is an adapted version of the code used to compute ZIM paths.
+
+For wombat setup, including the URL rewriting part, we need to pass wombat configuration info. This code is developed in the `javascript` folder. For URL parsing, it relies on the [uri-js library](https://www.npmjs.com/package/uri-js). This javascript code is bundled into a single `wombatSetup.js` file with [rollup bundler](https://rollupjs.org), the same bundler used by webrecorder team to bundle wombat.
diff --git a/docs/technical_architecture.md b/docs/technical_architecture.md
new file mode 100644
index 00000000..d3b9a394
--- /dev/null
+++ b/docs/technical_architecture.md
@@ -0,0 +1,56 @@
+# Technical architecture
+
+Currently only HTML, CSS and JS rewriting is described in this document.
+
+## Fuzzy rules
+
+Fuzzy rules are stored in `rules/rules.yaml`. This configuration file is then used by `rules/generate_rules.py` to generate Python and JS code.
+
+Should you update these fuzzy rules, you hence have to:
+
+- regenerate Python and JS files by running `python rules/generate_rules.py`
+- bundle again Javascript `wombatSetup.js` (see below).
+
+## Wombat configuration
+
+Wombat configuration contains some static configuration and the dynamic URL rewriting, including fuzzy rules.
+
+It is bundled by rollup with `cd javascript && yarn build-prod` and the result is pushed to proper scraper location for inclusion at build time.
+
+Tests are available and run with `cd javascript && yarn test`.
+
+## Transformation of URL into ZIM path
+
+Transforming a URL into a ZIM path has to respect the ZIM specification: path must not be url-encoded (i.e. it must be decoded) and it must be stored as UTF-8.
+
+A WARC record stores the item's URL inside a header named "WARC-Target-URI". The value inside this header is encoded, or more exactly it is "exactly what the browser sent at the HTTP level" (see https://github.com/webrecorder/browsertrix-crawler/issues/492 for more details).
+
+It has been decided (by convention) that we will drop the scheme, the port, the username and password from the URL. Headers are also not considered in this computation.
+
+Computation of the ZIM path is hence mostly straightforward:
+
+- decode the hostname which is puny-encoded
+- decode the path and query parameter which might be url-encoded
+
+## URL rewriting
+
+In addition to the computation of the relative path from the current document URL to the URL to rewrite, URL rewriting also consists in computing the proper ZIM path (with same operation as above) and properly encoding it so that the resulting URL respects [RFC 3986](https://datatracker.ietf.org/doc/html/rfc3986). Some important stuff has to be noted in this encoding.
+
+- since the original hostname is now part of the path, it will now be url-encoded
+- since the `?` and following query parameters are also part of the path (we do not want readers to drop them like kiwix-serve would do), they are also url-encoded
+
+Below is an example case of the rewrite operation on an image URL found in an HTML document.
+
+- Document original URL: `https://kiwix.org/a/article/document.html`
+- Document ZIM path: `kiwix.org/a/article/document.html`
+- Image original URL: `//xn--exmple-cva.com/a/resource/image.png?foo=bar`
+- Image rewritten URL: `../../../ex%C3%A9mple.com/a/resource/image.png%3Ffoo%3Dbar`
+- Image ZIM Path: `exémple.com/a/resource/image.png?foo=bar`
+
+## JS Rewriting
+
+JS Rewriting is a bit special because the rules to apply differ depending on whether we are using "classic" Javascript or "module" Javascript.
+
+Detection of Javascript modules starts at the HTML level where we have a `
+
+
+{% endautoescape %}
+
+
diff --git a/src/zimscraperlib/rewriting/url_rewriting.py b/src/zimscraperlib/rewriting/url_rewriting.py
new file mode 100644
index 00000000..fbf0147d
--- /dev/null
+++ b/src/zimscraperlib/rewriting/url_rewriting.py
@@ -0,0 +1,424 @@
+""" URL rewriting tools
+
+This module is about url and entry path rewriting.
+
+The global scheme is the following:
+
+Entries are stored in the ZIM file using their fully decoded path:
+- The full path is the full url without the scheme, username, password, port, fragment
+ (ie : "/(? None:
+ HttpUrl.check_validity(value)
+ self._value = value
+
+ def __eq__(self, __value: object) -> bool:
+ return isinstance(__value, HttpUrl) and __value.value == self.value
+
+ def __hash__(self) -> int:
+ return self.value.__hash__()
+
+ def __str__(self) -> str:
+ return f"HttpUrl({self.value})"
+
+ def __repr__(self) -> str:  # "HttpUrl(<value>) - <default object repr>"
+     return f"{self.__str__()} - {super().__repr__()}"  # pragma: no cover
+
+ @property
+ def value(self) -> str:
+ return self._value
+
+ @classmethod
+ def check_validity(cls, value: str) -> None:
+ parts = urlsplit(value)
+
+ if parts.scheme.lower() not in ["http", "https"]:
+ raise ValueError(
+ f"Incorrect HttpUrl scheme in value: {value} {parts.scheme}"
+ )
+
+ if not parts.hostname:
+ raise ValueError(f"Unsupported empty hostname in value: {value}")
+
+ if parts.hostname.lower() not in value:
+ raise ValueError(f"Unsupported upper-case chars in hostname : {value}")
+
+
+class ZimPath:
+    """A utility class representing a ZIM path, useful to pass this data around
+
+    Includes a basic validation: path must not start with scheme, hostname,...
+    """
+
+    def __init__(self, value: str) -> None:
+        ZimPath.check_validity(value)  # raises ValueError on URL-like values
+        self._value = value
+
+    def __eq__(self, __value: object) -> bool:
+        return isinstance(__value, ZimPath) and __value.value == self.value
+
+    def __hash__(self) -> int:
+        return self.value.__hash__()  # consistent with __eq__: hash the raw string
+
+    def __str__(self) -> str:
+        return f"ZimPath({self.value})"
+
+    def __repr__(self) -> str:  # "ZimPath(<value>) - <default object repr>"
+        return f"{self.__str__()} - {super().__repr__()}"  # pragma: no cover
+
+    @property
+    def value(self) -> str:
+        return self._value
+
+    @classmethod
+    def check_validity(cls, value: str) -> None:
+        """Raise ValueError if value has a scheme, hostname, username or password"""
+        parts = urlsplit(value)
+        if parts.scheme:
+            raise ValueError(f"Unexpected scheme in value: {value} {parts.scheme}")
+
+        if parts.hostname:
+            raise ValueError(f"Unexpected hostname in value: {value} {parts.hostname}")
+
+        if parts.username:
+            raise ValueError(f"Unexpected username in value: {value} {parts.username}")
+
+        if parts.password:
+            raise ValueError(f"Unexpected password in value: {value} {parts.password}")
+
+
+class ArticleUrlRewriter:
+ """
+ Rewrite urls in article.
+
+ This is typically used to rewrite urls found in an HTML document, but can be used
+ beyong that usage.
+ """
+
+ additional_rules: ClassVar[list[AdditionalRule]] = COMPILED_FUZZY_RULES
+
+ def __init__(
+ self,
+ *,
+ article_url: HttpUrl,
+ article_path: ZimPath | None = None,
+ existing_zim_paths: set[ZimPath] | None = None,
+ missing_zim_paths: set[ZimPath] | None = None,
+ ):
+ """
+ Initialise the rewriter
+
+ Args:
+ article_url: URL where the original document was located, used to resolve
+ relative URLS which will be passed
+ existing_zim_paths: list of ZIM paths which are known to exist, useful if one
+ wants to rewrite the URL to a local one only if item exists in the ZIM
+ missing_zim_paths: list of ZIM paths which are known to already be missing
+ from the existing_zim_paths ; usefull only in complement with this variable ;
+ new missing entries will be added as URLs are normalized in this function
+
+ Results:
+ items_to_download: populated with the list of rewritten URLs, so that one
+ might use it to download items after rewriting the document
+ """
+ self.article_path = article_path or ArticleUrlRewriter.normalize(article_url)
+ self.article_url = article_url
+ self.existing_zim_paths = existing_zim_paths
+ self.missing_zim_paths = missing_zim_paths
+ self.items_to_download: dict[ZimPath, HttpUrl] = {}
+
+ def get_item_path(self, item_url: str, base_href: str | None) -> ZimPath:
+     """Resolve item_url against the article (and its base href) as a ZimPath"""
+
+     # base_href is resolved first so it can itself be relative to the article
+     document_base = urljoin(self.article_url.value, base_href)
+     absolute = HttpUrl(urljoin(document_base, item_url))
+     return ArticleUrlRewriter.normalize(absolute)
+
+ def __call__(
+ self,
+ item_url: str,
+ base_href: str | None,
+ *,
+ rewrite_all_url: bool = True,
+ ) -> str:
+ """Rewrite a url contained in a article.
+
+ The url is "fully" rewrited to point to a normalized entry path
+ """
+
+ try:
+ item_url = item_url.strip()
+
+ # Make case of standalone fragments more straightforward
+ if item_url.startswith("#"):
+ return item_url
+
+ item_scheme = urlsplit(item_url).scheme
+ if item_scheme and item_scheme not in ("http", "https"):
+ return item_url
+
+ item_absolute_url = urljoin(
+ urljoin(self.article_url.value, base_href), item_url
+ )
+
+ item_fragment = urlsplit(item_absolute_url).fragment
+
+ item_path = ArticleUrlRewriter.normalize(HttpUrl(item_absolute_url))
+
+ if rewrite_all_url or (
+ self.existing_zim_paths and item_path in self.existing_zim_paths
+ ):
+ if item_path not in self.items_to_download:
+ self.items_to_download[item_path] = HttpUrl(item_absolute_url)
+ return self.get_document_uri(item_path, item_fragment)
+ else:
+ if (
+ self.missing_zim_paths is not None
+ and item_path not in self.missing_zim_paths
+ ):
+ logger.debug(f"WARNING {item_path} ({item_url}) not in archive.")
+ # maintain a collection of missing Zim Path to not fill the logs
+ # with duplicate messages
+ self.missing_zim_paths.add(item_path)
+ # The url doesn't point to a known entry
+ return item_absolute_url
+
+ except Exception as exc: # pragma: no cover
+ item_scheme = (
+ item_scheme # pyright: ignore[reportPossiblyUnboundVariable]
+ if "item_scheme" in locals()
+ else ""
+ )
+ item_absolute_url = (
+ item_absolute_url # pyright: ignore[reportPossiblyUnboundVariable]
+ if "item_absolute_url" in locals()
+ else ""
+ )
+ item_fragment = (
+ item_fragment # pyright: ignore[reportPossiblyUnboundVariable]
+ if "item_fragment" in locals()
+ else ""
+ )
+ item_path = (
+ item_path # pyright: ignore[reportPossiblyUnboundVariable]
+ if "item_path" in locals()
+ else ""
+ )
+ logger.debug(
+ f"Invalid URL value found in {self.article_url.value}, kept as-is. "
+ f"(item_url: {item_url}, "
+ f"item_scheme: {item_scheme}, "
+ f"item_absolute_url: {item_absolute_url}, "
+ f"item_fragment: {item_fragment}, "
+ f"item_path: {item_path}, "
+ f"rewrite_all_url: {rewrite_all_url}",
+ exc_info=exc,
+ )
+ return item_url
+
+ def get_document_uri(self, item_path: ZimPath, item_fragment: str) -> str:
+     """Given a ZIM item path and its fragment, get the URI to use in document
+
+     This function transforms the path of a ZIM item we want to address from current
+     document (HTML / JS / ...) and returns the corresponding URI to use.
+
+     It computes the relative path based on current document location and escapes
+     everything which needs to be to transform the ZIM path into a valid RFC 3986 URI
+
+     It also appends a potential trailing item fragment at the end of the resulting
+     URI.
+
+     """
+     # item_path is both path + querystring, both will be url-encoded in the document
+     # so that readers consider them as a whole and properly pass them to libzim.
+     # The ZIM path is already decoded, so it is used verbatim instead of being
+     # re-parsed as a URL: urlsplit would treat a literal `#` (decoded from %23) as
+     # a fragment delimiter and silently drop the end of the entry path.
+     item_url = item_path.value
+
+     relative_path = str(
+         PurePosixPath(item_url).relative_to(
+             (
+                 PurePosixPath(self.article_path.value)
+                 if self.article_path.value.endswith("/")
+                 else PurePosixPath(self.article_path.value).parent
+             ),
+             walk_up=True,
+         )
+     )
+     # relative_to removes a potential last '/' in the path, we add it back
+     if item_path.value.endswith("/"):
+         relative_path += "/"
+
+     return (
+         f"{quote(relative_path, safe='/')}"
+         f"{'#' + item_fragment if item_fragment else ''}"
+     )
+
+ @classmethod
+ def apply_additional_rules(cls, uri: HttpUrl | str) -> str:
+     """Rewrite a URL or relative path with the first additional rule matching it
+
+     Rules are tried in order; the first one whose pattern matches the value has
+     its replacement template expanded, and that result is returned.
+     If no rule matches, the input value is returned as-is.
+     """
+     raw = uri if not isinstance(uri, HttpUrl) else uri.value
+     for candidate in cls.additional_rules:
+         found = candidate.match.match(raw)
+         if found is not None:
+             return found.expand(candidate.replace)
+     return raw
+
+ @classmethod
+ def normalize(cls, url: HttpUrl) -> ZimPath:
+     """Transform a HTTP URL into a ZIM path to use as an entry's key.
+
+     According to RFC 3986, a URL allows only a very limited set of characters, so we
+     assume by default that the url is encoded to match this specification.
+
+     The transformation rewrites the hostname, the path and the querystring.
+
+     The transformation drops the URL scheme, username, password, port and fragment:
+     - we suppose there is no conflict of URL scheme or port: there is no two
+     resources with same hostname, path and querystring but different URL scheme or
+     port leading to different content
+     - we consider username/password are purely authentication mechanisms which
+     have no impact on the content served
+     - we know that the fragment is never passed to the server, it stays in the
+     User-Agent, so if we encounter a fragment while normalizing a URL found in a
+     document, it won't make its way to the ZIM anyway and will stay client-side
+
+     The transformation consists mainly in decoding the three components so that ZIM
+     path is not encoded at all, as required by the ZIM specification.
+
+     Decoding is done differently for the hostname (decoded with puny encoding) and
+     the path and querystring (both decoded with url decoding).
+
+     The final transformation is the application of fuzzy rules (sourced from wabac)
+     to transform some URLs into replay URLs and drop some useless stuff.
+
+     Returned value is a ZIM path, without any puny/url encoding applied, ready to be
+     passed to python-libzim for UTF-8 encoding.
+     """
+
+     if not isinstance(url, HttpUrl):
+         raise ValueError("Bad argument type passed, HttpUrl expected")
+
+     url_parts = urlsplit(url.value)
+
+     if not url_parts.hostname:
+         # cannot happen because of the HttpUrl checks, but important to please the
+         # type checker
+         raise Exception("Hostname is missing")  # pragma: no cover
+
+     # decode the hostname if it is puny-encoded; every label is inspected because
+     # the encoded label is frequently not the first one (www.xn--exmple-cva.com
+     # must be stored as www.exémple.com, like xn--exmple-cva.com is)
+     hostname = url_parts.hostname
+     if any(label.startswith("xn--") for label in hostname.split(".")):
+         hostname = idna.decode(hostname)
+
+     path = url_parts.path
+
+     if path:
+         # unquote the path so that it is stored unencoded in the ZIM as required by
+         # ZIM specification
+         path = unquote(path)
+     else:
+         # if path is empty, we need a "/" to remove ambiguities, e.g.
+         # https://example.com and https://example.com/ must all lead to the same ZIM
+         # entry to match RFC 3986 section 6.2.3:
+         # https://www.rfc-editor.org/rfc/rfc3986#section-6.2.3
+         path = "/"
+
+     query = url_parts.query
+
+     # if query is missing, we do not add it at all, not even a trailing ? without
+     # anything after it
+     if url_parts.query:
+         # `+` in query parameter must be decoded as space first to remove
+         # ambiguities between a space (encoded as `+` in url query parameter) and a
+         # real plus sign (encoded as %2B but soon decoded in ZIM path)
+         query = query.replace("+", " ")
+         # unquote the query so that it is stored unencoded in the ZIM as required by
+         # ZIM specification
+         query = "?" + unquote(query)
+     else:
+         query = ""
+
+     fuzzified_url = ArticleUrlRewriter.apply_additional_rules(
+         f"{hostname}{ArticleUrlRewriter._remove_subsequent_slashes(path)}{ArticleUrlRewriter._remove_subsequent_slashes(query)}"
+     )
+
+     return ZimPath(fuzzified_url)
+
+ @classmethod
+ def _remove_subsequent_slashes(cls, value: str) -> str:
+     """Collapse every run of successive slashes `/` in a string into a single one
+
+     E.g `val//ue` or `val///ue` or `val////ue` (and so on) are transformed into
+     `val/ue`
+     """
+     return re.sub(r"//+", "/", value)
diff --git a/src/zimscraperlib/zim/_libkiwix.py b/src/zimscraperlib/zim/_libkiwix.py
index c20357c8..02cae889 100644
--- a/src/zimscraperlib/zim/_libkiwix.py
+++ b/src/zimscraperlib/zim/_libkiwix.py
@@ -16,10 +16,9 @@
import io
from collections import namedtuple
-from typing import Dict
MimetypeAndCounter = namedtuple("MimetypeAndCounter", ["mimetype", "value"])
-CounterMap = Dict[
+CounterMap = dict[
type(MimetypeAndCounter.mimetype), type(MimetypeAndCounter.value) # pyright: ignore
]
diff --git a/src/zimscraperlib/zim/creator.py b/src/zimscraperlib/zim/creator.py
index 0dab5029..a4558b8d 100644
--- a/src/zimscraperlib/zim/creator.py
+++ b/src/zimscraperlib/zim/creator.py
@@ -264,7 +264,7 @@ def convert_and_check_metadata(
Also checks that final type is appropriate for libzim (str or bytes)
"""
- if name == "Date" and isinstance(value, (datetime.date, datetime.datetime)):
+ if name == "Date" and isinstance(value, datetime.date | datetime.datetime):
value = value.strftime("%Y-%m-%d")
if (
name == "Tags"
diff --git a/src/zimscraperlib/zim/items.py b/src/zimscraperlib/zim/items.py
index a7625b07..e1f9e9b2 100644
--- a/src/zimscraperlib/zim/items.py
+++ b/src/zimscraperlib/zim/items.py
@@ -129,7 +129,7 @@ def get_contentprovider(self) -> libzim.writer.ContentProvider:
# content was set manually
content = getattr(self, "content", None)
if content is not None:
- if not isinstance(content, (str, bytes)):
+ if not isinstance(content, str | bytes):
raise AttributeError(f"Unexpected type for content: {type(content)}")
return StringProvider(content=content, ref=self)
@@ -155,7 +155,7 @@ def _get_auto_index(self):
# content was set manually
content = getattr(self, "content", None)
if content is not None:
- if not isinstance(content, (str, bytes)):
+ if not isinstance(content, str | bytes):
raise RuntimeError(
f"Unexpected type for content: {type(content)}"
) # pragma: no cover
diff --git a/src/zimscraperlib/zim/metadata.py b/src/zimscraperlib/zim/metadata.py
index 3db12c7b..411e42f6 100644
--- a/src/zimscraperlib/zim/metadata.py
+++ b/src/zimscraperlib/zim/metadata.py
@@ -60,7 +60,7 @@ def validate_title(name: str, value: str):
def validate_date(name: str, value: datetime.datetime | datetime.date | str):
"""ensures Date metadata can be casted to an ISO 8601 string"""
if name == "Date":
- if not isinstance(value, (datetime.datetime, datetime.date, str)):
+ if not isinstance(value, datetime.datetime | datetime.date | str):
raise ValueError(f"Invalid type for {name}: {type(value)}")
elif isinstance(value, str):
match = re.match(r"(?P\d{4})-(?P\d{2})-(?P\d{2})", value)
diff --git a/src/zimscraperlib/zim/providers.py b/src/zimscraperlib/zim/providers.py
index 2c384ddb..a4748cbb 100644
--- a/src/zimscraperlib/zim/providers.py
+++ b/src/zimscraperlib/zim/providers.py
@@ -13,7 +13,7 @@
import io
import pathlib
-from typing import Generator
+from collections.abc import Generator
import libzim.writer # pyright: ignore
import requests
diff --git a/tests/rewriting/__init__.py b/tests/rewriting/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/rewriting/conftest.py b/tests/rewriting/conftest.py
new file mode 100644
index 00000000..390dd471
--- /dev/null
+++ b/tests/rewriting/conftest.py
@@ -0,0 +1,100 @@
+from collections.abc import Callable, Generator
+
+import pytest
+
+from zimscraperlib.rewriting.css import CssRewriter
+from zimscraperlib.rewriting.js import JsRewriter
+from zimscraperlib.rewriting.url_rewriting import (
+ ArticleUrlRewriter,
+ HttpUrl,
+ ZimPath,
+)
+
+
+@pytest.fixture(scope="module")
+def no_js_notify():
+ """Fixture to not care about notification of detection of a JS file"""
+
+ def no_js_notify_handler(_: str):
+ pass
+
+ yield no_js_notify_handler
+
+
+class SimpleUrlRewriter(ArticleUrlRewriter):
+ """Basic URL rewriter mocking most calls"""
+
+ def __init__(self, article_url: HttpUrl, suffix: str = ""):
+ self.article_url = article_url
+ self.suffix = suffix
+
+ def __call__(
+ self,
+ item_url: str,
+ base_href: str | None, # noqa: ARG002
+ *,
+ rewrite_all_url: bool = True, # noqa: ARG002
+ ) -> str:
+ return item_url + self.suffix
+
+ def get_item_path(
+ self, item_url: str, base_href: str | None # noqa: ARG002
+ ) -> ZimPath:
+ return ZimPath("")
+
+ def get_document_uri(
+ self, item_path: ZimPath, item_fragment: str # noqa: ARG002
+ ) -> str:
+ return ""
+
+
+@pytest.fixture(scope="module")
+def simple_url_rewriter_gen() -> (
+ Generator[Callable[[str], ArticleUrlRewriter], None, None]
+):
+ """Fixture to create a basic url rewriter returning URLs as-is"""
+
+ def get_simple_url_rewriter(url: str, suffix: str = "") -> ArticleUrlRewriter:
+ return SimpleUrlRewriter(HttpUrl(url), suffix=suffix)
+
+ yield get_simple_url_rewriter
+
+
+@pytest.fixture(scope="module")
+def js_rewriter_gen() -> Generator[
+    Callable[[ArticleUrlRewriter, str | None, Callable[[ZimPath], None]], JsRewriter],
+    None,
+    None,
+]:
+    """Fixture providing a factory which builds a JsRewriter from its three inputs"""
+
+    def get_js_rewriter(
+        url_rewriter: ArticleUrlRewriter,
+        base_href: str | None,
+        notify_js_module: Callable[[ZimPath], None],
+    ) -> JsRewriter:
+        return JsRewriter(
+            url_rewriter=url_rewriter,
+            base_href=base_href,
+            notify_js_module=notify_js_module,
+        )
+
+    yield get_js_rewriter
+
+
+@pytest.fixture(scope="module")
+def css_rewriter_gen() -> (
+    Generator[Callable[[ArticleUrlRewriter, str | None], CssRewriter], None, None]
+):
+    """Fixture providing a factory which builds a CssRewriter from its two inputs"""
+
+    def get_css_rewriter(
+        url_rewriter: ArticleUrlRewriter,
+        base_href: str | None,
+    ) -> CssRewriter:
+        return CssRewriter(
+            url_rewriter=url_rewriter,
+            base_href=base_href,
+        )
+
+    yield get_css_rewriter
diff --git a/tests/rewriting/test_css_rewriting.py b/tests/rewriting/test_css_rewriting.py
new file mode 100644
index 00000000..a43ce849
--- /dev/null
+++ b/tests/rewriting/test_css_rewriting.py
@@ -0,0 +1,218 @@
+from textwrap import dedent
+
+import pytest
+
+from zimscraperlib.rewriting.css import CssRewriter
+from zimscraperlib.rewriting.url_rewriting import ArticleUrlRewriter, HttpUrl
+
+from .utils import ContentForTests
+
+
+@pytest.fixture(
+ params=[
+ ContentForTests(input_=b"p { color: red; }"),
+ ContentForTests(input_=b"p {\n color: red;\n}"),
+ ContentForTests(input_=b"p { background: blue; }"),
+ ContentForTests(input_=b"p { background: rgb(15, 0, 52); }"),
+ ContentForTests(
+ input_=b"/* See bug issue at http://exemple.com/issue/link */ "
+ b"p { color: blue; }"
+ ),
+ ContentForTests(
+ input_=b"p { width= } div { background: url(http://exemple.com/img.png)}",
+ expected=b"p { width= } div { background: url(../exemple.com/img.png)}",
+ ),
+ ContentForTests(
+ input_=b"p { width= } div { background: url('http://exemple.com/img.png')}",
+ expected=b'p { width= } div { background: url("../exemple.com/img.png")}',
+ ),
+ ContentForTests(
+ input_=b'p { width= } div { background: url("http://exemple.com/img.png")}',
+ expected=b'p { width= } div { background: url("../exemple.com/img.png")}',
+ ),
+ ]
+)
+def no_rewrite_content(request: pytest.FixtureRequest):
+ yield request.param
+
+
+def test_no_rewrite(no_rewrite_content: ContentForTests):
+ assert (
+ CssRewriter(
+ ArticleUrlRewriter(
+ article_url=HttpUrl(f"http://{no_rewrite_content.article_url}")
+ ),
+ base_href=None,
+ ).rewrite(no_rewrite_content.input_bytes)
+ == no_rewrite_content.expected_bytes.decode()
+ )
+
+
+def test_no_rewrite_str():
+ test_css = "p {\n color: red;\n}"
+ assert (
+ CssRewriter(
+ ArticleUrlRewriter(article_url=HttpUrl("http://kiwix.org")),
+ base_href=None,
+ ).rewrite(test_css)
+ == test_css
+ )
+
+
+@pytest.fixture(
+ params=[
+ ContentForTests(input_='"border:'),
+ ContentForTests(input_="border: solid 1px #c0c0c0; width= 100%"),
+ # Despite being invalid, tinycss parse it as "width" property without value.
+ ContentForTests(input_="width:", expected="width:;"),
+ ContentForTests(
+ input_="border-bottom-width: 1px;border-bottom-color: #c0c0c0;w"
+ ),
+ ContentForTests(
+ input_='background: url("http://exemple.com/foo.png"); width=',
+ expected='background: url("../exemple.com/foo.png"); width=',
+ ),
+ ]
+)
+def invalid_content_inline_with_fallback(request: pytest.FixtureRequest):
+ yield request.param
+
+
+def test_invalid_css_inline_with_fallback(
+ invalid_content_inline_with_fallback: ContentForTests,
+):
+ assert (
+ CssRewriter(
+ ArticleUrlRewriter(
+ article_url=HttpUrl(
+ f"http://{invalid_content_inline_with_fallback.article_url}"
+ )
+ ),
+ base_href=None,
+ ).rewrite_inline(invalid_content_inline_with_fallback.input_str)
+ == invalid_content_inline_with_fallback.expected_str
+ )
+
+
+@pytest.fixture(
+ params=[
+ ContentForTests(input_='"border:', expected=""),
+ ContentForTests(
+ input_="border: solid 1px #c0c0c0; width= 100%",
+ expected="border: solid 1px #c0c0c0; ",
+ ),
+ # Despite being invalid, tinycss parse it as "width" property without value.
+ ContentForTests(input_="width:", expected="width:;"),
+ ContentForTests(
+ input_="border-bottom-width: 1px;border-bottom-color: #c0c0c0;w",
+ expected="border-bottom-width: 1px;border-bottom-color: #c0c0c0;",
+ ),
+ ContentForTests(
+ input_='background: url("http://exemple.com/foo.png"); width=',
+ expected='background: url("../exemple.com/foo.png"); ',
+ ),
+ ]
+)
+def invalid_content_inline_no_fallback(request: pytest.FixtureRequest):
+ yield request.param
+
+
+def test_invalid_css_inline_no_fallback(
+ invalid_content_inline_no_fallback: ContentForTests,
+):
+ assert (
+ CssRewriter(
+ ArticleUrlRewriter(
+ article_url=HttpUrl(
+ f"http://{invalid_content_inline_no_fallback.article_url}"
+ )
+ ),
+ base_href=None,
+ remove_errors=True,
+ ).rewrite_inline(invalid_content_inline_no_fallback.input_str)
+ == invalid_content_inline_no_fallback.expected_str
+ )
+
+
+@pytest.fixture(
+ params=[
+ # Tinycss parse `"border:}` as a string with an unexpected eof in string.
+ # At serialization, tiny try to recover and close the opened rule
+ ContentForTests(input_=b'p {"border:}', expected=b'p {"border:}}'),
+ ContentForTests(input_=b'"p {border:}'),
+ ContentForTests(input_=b"p { border: solid 1px #c0c0c0; width= 100% }"),
+ ContentForTests(input_=b"p { width: }"),
+ ContentForTests(
+ input_=b"p { border-bottom-width: 1px;border-bottom-color: #c0c0c0;w }"
+ ),
+ ContentForTests(
+ input_=b'p { background: url("http://exemple.com/foo.png"); width= }',
+ expected=b'p { background: url("../exemple.com/foo.png"); width= }',
+ ),
+ ]
+)
+def invalid_content(request: pytest.FixtureRequest):
+ yield request.param
+
+
+def test_invalid_cssl(invalid_content: ContentForTests):
+ assert (
+ CssRewriter(
+ ArticleUrlRewriter(
+ article_url=HttpUrl(f"http://{invalid_content.article_url}")
+ ),
+ base_href=None,
+ ).rewrite(invalid_content.input_bytes)
+ == invalid_content.expected_bytes.decode()
+ )
+
+
+def test_rewrite():
+ content = b"""
+/* A comment with a link : http://foo.com */
+@import url(//fonts.googleapis.com/icon?family=Material+Icons);
+
+p, input {
+ color: rbg(1, 2, 3);
+ background: url('http://kiwix.org/super/img');
+ background-image:url('http://exemple.com/no_space_before_url');
+}
+
+@font-face {
+ src: url(https://f.gst.com/s/qa/v31/6xKtdSZaE8KbpRA_hJFQNcOM.woff2) format('woff2');
+}
+
+@media only screen and (max-width: 40em) {
+ p, input {
+ background-image:url(data:image/png;base64,FooContent);
+ }
+}"""
+
+ expected = """
+ /* A comment with a link : http://foo.com */
+ @import url(../fonts.googleapis.com/icon%3Ffamily%3DMaterial%20Icons);
+
+ p, input {
+ color: rbg(1, 2, 3);
+ background: url("super/img");
+ background-image:url("../exemple.com/no_space_before_url");
+ }
+
+ @font-face {
+ src: url(../f.gst.com/s/qa/v31/6xKtdSZaE8KbpRA_hJFQNcOM.woff2) format("woff2");
+ }
+
+ @media only screen and (max-width: 40em) {
+ p, input {
+ background-image:url(data:image/png;base64,FooContent);
+ }
+ }"""
+ expected = dedent(expected)
+
+ assert (
+ CssRewriter(
+ ArticleUrlRewriter(article_url=HttpUrl("http://kiwix.org/article")),
+ base_href=None,
+ ).rewrite(content)
+ == expected
+ )
diff --git a/tests/rewriting/test_html_rewriting.py b/tests/rewriting/test_html_rewriting.py
new file mode 100644
index 00000000..bd59b497
--- /dev/null
+++ b/tests/rewriting/test_html_rewriting.py
@@ -0,0 +1,1555 @@
+from collections.abc import Callable
+from textwrap import dedent
+
+import pytest
+
+from zimscraperlib.rewriting.css import CssRewriter
+from zimscraperlib.rewriting.html import (
+ AttrNameAndValue,
+ AttrsList,
+ HtmlRewriter,
+ HTMLRewritingRules,
+ extract_base_href,
+ format_attr,
+ get_attr_value_from,
+ rewrite_meta_http_equiv_redirect,
+)
+from zimscraperlib.rewriting.js import JsRewriter
+from zimscraperlib.rewriting.url_rewriting import (
+ ArticleUrlRewriter,
+ HttpUrl,
+ ZimPath,
+)
+
+from .utils import ContentForTests
+
+
+@pytest.fixture(
+ params=[
+ ContentForTests(input_="A simple string without url"),
+ ContentForTests(
+ input_=""
+ "
This is a sentence with a http://exemple.com/path link
"
+ ""
+ ),
+ ContentForTests(
+ input_='A link not to rewrite'
+ ),
+ ContentForTests(
+ input_='