Download script #22

Merged · 28 commits · Sep 26, 2023
26 changes: 26 additions & 0 deletions README.md
@@ -10,6 +10,32 @@ OpenStreetMap commonly stores these as [`wikipedia*=`](https://wiki.openstreetma
[`article_processing_config.json`](article_processing_config.json) should be updated when adding a new language.
It defines article sections that are not important for users and should be removed from the extracted HTML.

## Downloading Dumps

[Enterprise HTML dumps, updated twice a month, are publicly accessible](https://dumps.wikimedia.org/other/enterprise_html/).

For the wikiparser you'll want the ["NS0"](https://en.wikipedia.org/wiki/Wikipedia:Namespace) "ENTERPRISE-HTML" `.json.tar.gz` files.

They are gzipped tar files containing a single file of newline-delimited JSON matching the [Wikimedia Enterprise API schema](https://enterprise.wikimedia.com/docs/data-dictionary/).
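
For example, you can stream a single record out of an archive without unpacking the whole thing (the filename and the `name` field here are illustrative; see the schema linked above for the available fields):
```
tar -xzOf enwiki-NS0-20230701-ENTERPRISE-HTML.json.tar.gz | head -n 1 | jq '.name'
```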

The included [`download.sh`](./download.sh) script handles downloading the latest set of dumps in specific languages.
It maintains a directory with the following layout:
```
<DUMP_DIR>/
├── latest -> 20230701/
├── 20230701/
│ ├── dewiki-NS0-20230701-ENTERPRISE-HTML.json.tar.gz
│ ├── enwiki-NS0-20230701-ENTERPRISE-HTML.json.tar.gz
│ ├── eswiki-NS0-20230701-ENTERPRISE-HTML.json.tar.gz
│ ...
├── 20230620/
│ ├── dewiki-NS0-20230620-ENTERPRISE-HTML.json.tar.gz
│ ├── enwiki-NS0-20230620-ENTERPRISE-HTML.json.tar.gz
│ ├── eswiki-NS0-20230620-ENTERPRISE-HTML.json.tar.gz
│ ...
...
```
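
For example, a first run might look like the following (the directory name and language list are illustrative; `LANGUAGES` is optional and defaults to the languages in [`article_processing_config.json`](article_processing_config.json)):
```
mkdir -p dumps
LANGUAGES="en de es" ./download.sh dumps/
```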

## Usage

To use with the map generator, see the [`run.sh` script](run.sh) and its own help documentation.
101 changes: 101 additions & 0 deletions download.sh
@@ -0,0 +1,101 @@
#! /usr/bin/env bash
USAGE="Usage: ./download.sh <DUMP_DIR>

Download the latest Wikipedia Enterprise HTML dumps.

Arguments:
<DUMP_DIR> An existing directory to store dumps in. Dumps will be grouped
into subdirectories by date, and a link 'latest' will point to
Member

Will the wikiparser generator properly find/load newer versions from the `latest` dir without specifying explicit file names?

Collaborator Author

For the run.sh script, you'll provide a glob of the latest directory:

./run.sh descriptions/ planet.osm.pbf $DUMP_DIR/latest/*

It doesn't have any special handling for the $DUMP_DIR layout.

the latest complete dump subdirectory, if it exists.

Environment Variables:
LANGUAGES A whitespace-separated list of wikipedia language codes to
download dumps of.
Defaults to the languages in 'article_processing_config.json'.
See <https://meta.wikimedia.org/wiki/List_of_Wikipedias>.

Exit codes:
0 The latest dumps are already present or were downloaded successfully.
1 Argument error.
16 Some of the languages were not available to download. The latest dump may
be in progress, or some of the specified languages may not exist.
_ Subprocess error.
"

set -euo pipefail
# set -x

if [ -z "${1:-}" ]; then
echo -n "$USAGE" >&2
exit 1
fi

# The parent directory to store groups of dumps in.
DUMP_DIR=$(readlink -f "$1")
shift

if [ ! -d "$DUMP_DIR" ]; then
echo "DUMP_DIR '$DUMP_DIR' does not exist" >&2
exit 1
fi

# Ensure we're running in the directory of this script.
SCRIPT_PATH=$(dirname "$0")
cd "$SCRIPT_PATH"
SCRIPT_PATH=$(pwd)

# Only load library after changing to script directory.
source lib.sh

if [ -z "${LANGUAGES:-}" ]; then
# Load languages from config.
LANGUAGES=$(jq -r '(.sections_to_remove | keys | .[])' article_processing_config.json)
fi
# shellcheck disable=SC2086 # LANGUAGES is intentionally expanded.
log "Selected languages:" $LANGUAGES
Member

nit: Can an array be used here without a warning?

Collaborator Author (@newsch, Aug 17, 2023)

To convert it to an array with the same semantics it would need to suppress another warning:

# shellcheck disable=SC2206 # Intentionally split on whitespace.
LANGUAGES=( $LANGUAGES )


log "Fetching run index"
# The date of the latest dump, YYYYMMDD.
LATEST_DUMP=$(wget 'https://dumps.wikimedia.org/other/enterprise_html/runs/' --no-verbose -O - \
| grep -Po '(?<=href=")[^"]*' | grep -P '\d{8}' | sort -r | head -n1)
LATEST_DUMP="${LATEST_DUMP%/}"

log "Checking latest dump $LATEST_DUMP"

URLS=
MISSING_DUMPS=0
for lang in $LANGUAGES; do
url="https://dumps.wikimedia.org/other/enterprise_html/runs/${LATEST_DUMP}/${lang}wiki-NS0-${LATEST_DUMP}-ENTERPRISE-HTML.json.tar.gz"
if ! wget --no-verbose --method=HEAD "$url"; then
MISSING_DUMPS=$(( MISSING_DUMPS + 1 ))
log "Dump for '$lang' does not exist at '$url'"
continue
fi
URLS="$URLS $url"
done

if [ -z "$URLS" ]; then
log "No dumps available"
Member

"Latest dumps are already downloaded"?

Collaborator Author

If URLS is empty, then none of the specified languages could be found for the latest dump.

If a newer dump isn't available, it will still check the sizes of the last downloaded dump, and exit with 0.

Member

Good! The goal is to make a cron script that will update files automatically when they are published (and delete old files).

Another question: should previously generated HTML and other temporary files be deleted before relaunching the wikiparser? Does it make sense to cover it in the run script?

Collaborator Author

They shouldn't need to be.

The temporary files are regenerated each time.
The generated HTML will be overwritten if it is referenced in the new planet file.

If an article isn't extracted from the dump due to #24 or something else, then having the old copy still available might be useful.

But if the HTML simplification is changed, and older articles are no longer referenced in OSM, then they will remain on disk unchanged.

exit 16
fi

# The subdir to store the latest dump in.
DOWNLOAD_DIR="$DUMP_DIR/$LATEST_DUMP"
if [ ! -e "$DOWNLOAD_DIR" ]; then
mkdir "$DOWNLOAD_DIR"
fi

log "Downloading available dumps"
# shellcheck disable=SC2086 # URLS should be expanded on spaces.
wget --directory-prefix "$DOWNLOAD_DIR" --continue $URLS

if [ $MISSING_DUMPS -gt 0 ]; then
log "$MISSING_DUMPS dumps not available yet"
exit 16
fi

log "Linking 'latest' to '$LATEST_DUMP'"
LATEST_LINK="$DUMP_DIR/latest"
ln -sf "$LATEST_DUMP" "$LATEST_LINK"

# TODO: Remove old dumps?
Collaborator Author

Do you want the script to handle this?

If it will be running on a cron job, then it might be good to keep 2 copies around.
Otherwise the script could delete the last dump while wikiparser is still using it?

Member

  1. Aren't files that were open before their deletion on Linux still accessible?
  2. Dumps are produced regularly, right? We can set a specific schedule.
  3. Script may have an option to automatically delete older dumps.

Collaborator Author

  1. Aren't files that were open before their deletion on Linux still accessible?

You're right, as long as run.sh is started before download.sh deletes them, it will be able to access the files.

  2. Dumps are produced regularly, right? We can set a specific schedule.

Yes, they're started on the 1st and the 20th of each month, and it looks like they're finished within 3 days.

  3. Script may have an option to automatically delete older dumps.

👍
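
Given that schedule, a cron entry for this script might look something like the sketch below (the timing and paths are illustrative, not part of this PR):
```
# Run a few days after each dump starts on the 1st and 20th, once it should be complete.
0 3 4,23 * * /path/to/wikiparser/download.sh /data/wikipedia-dumps
```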

Collaborator Author

I've added a new option:

-D      Delete all old dump subdirectories if the latest is downloaded
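
For reference, a minimal sketch of what such pruning could look like, assuming the `<DUMP_DIR>` layout from the README (this is an illustration, not the exact code added in that commit):
```
# Remove dated dump subdirectories other than the one 'latest' points to.
LATEST=$(readlink "$DUMP_DIR/latest")
for dir in "$DUMP_DIR"/*/; do
    name=$(basename "$dir")
    if [ "$name" != "$LATEST" ] && [ "$name" != "latest" ]; then
        rm -r "${dir:?}"
    fi
done
```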