From 6ef98657f8c4a539ba261b4beb856fe8b8b34e94 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Sat, 4 Oct 2025 11:01:01 +0200 Subject: [PATCH 1/5] add end_id parameter to diff processing functions --- src/osmium/replication/server.py | 89 +++++++++++++++++++++----------- test/test_replication.py | 29 ++++++++++- 2 files changed, 88 insertions(+), 30 deletions(-) diff --git a/src/osmium/replication/server.py b/src/osmium/replication/server.py index 64917d5..4f7d89c 100644 --- a/src/osmium/replication/server.py +++ b/src/osmium/replication/server.py @@ -6,7 +6,7 @@ # For a full list of authors see the git log. """ Helper functions to communicate with replication servers. """ -from typing import NamedTuple, Optional, Any, Iterator, cast, Mapping, Tuple +from typing import NamedTuple, Optional, Any, Iterator, cast, Mapping, Tuple, Dict import urllib.request as urlrequest from urllib.error import URLError import datetime as dt @@ -67,7 +67,7 @@ def __init__(self, url: str, diff_type: str = 'osc.gz') -> None: self.baseurl = url self.diff_type = diff_type - self.extra_request_params: dict[str, Any] = dict(timeout=60, stream=True) + self.extra_request_params: Dict[str, Any] = dict(timeout=60, stream=True) self.session: Optional[requests.Session] = None self.retry = Retry(total=3, backoff_factor=0.5, allowed_methods={'GET'}, status_forcelist=[408, 429, 500, 502, 503, 504]) @@ -125,12 +125,19 @@ def _get_url_with_session() -> Iterator[requests.Response]: return _get_url_with_session() - def collect_diffs(self, start_id: int, max_size: int = 1024) -> Optional[DownloadResult]: + def collect_diffs(self, start_id: int, max_size: Optional[int] = None, + end_id: Optional[int] = None) -> Optional[DownloadResult]: """ Create a MergeInputReader and download diffs starting with sequence - id `start_id` into it. `max_size` - restricts the number of diffs that are downloaded. 
The download - stops as soon as either a diff cannot be downloaded or the - unpacked data in memory exceeds `max_size` kB. + id `start_id` into it. `end_id` optionally gives the highest + sequence number to download. `max_size` restricts the number of + diffs that are downloaded by size. If neither `end_id` nor + `max_size` are given, then download default to stop after 1MB. + + The download stops as soon as + 1. a diff cannot be downloaded or + 2. the end_id (inclusive) is reached or + 3. the unpacked data in memory exceeds `max_size` kB or, + when no `end_id` and `max_size` are given, 1024kB. If some data was downloaded, returns a namedtuple with three fields: `id` contains the sequence id of the last downloaded diff, `reader` @@ -140,19 +147,25 @@ def collect_diffs(self, start_id: int, max_size: int = 1024) -> Optional[Downloa Returns None if there was an error during download or no new data was available. """ - left_size = max_size * 1024 - current_id = start_id - # must not read data newer than the published sequence id # or we might end up reading partial data newest = self.get_state_info() - if newest is None or current_id > newest.sequence: + if newest is None or start_id > newest.sequence: return None + current_id = start_id + left_size: Optional[int] = None + if max_size is not None: + left_size = max_size * 1024 + elif end_id is None: + left_size = 1024 * 1024 + rd = MergeInputReader() - while left_size > 0 and current_id <= newest.sequence: + while (left_size is None or left_size > 0) \ + and (end_id is None or current_id <= end_id) \ + and current_id <= newest.sequence: try: diffdata = self.get_diff_block(current_id) except: # noqa: E722 @@ -163,21 +176,32 @@ def collect_diffs(self, start_id: int, max_size: int = 1024) -> Optional[Downloa return None break - left_size -= rd.add_buffer(diffdata, self.diff_type) - LOG.debug("Downloaded change %d. 
(%d kB available in download buffer)", - current_id, left_size / 1024) + diff_size = rd.add_buffer(diffdata, self.diff_type) + if left_size is None: + LOG.debug("Downloaded change %d.", current_id) + else: + left_size -= diff_size + LOG.debug("Downloaded change %d. (%d kB available in download buffer)", + current_id, left_size / 1024) current_id += 1 return DownloadResult(current_id - 1, rd, newest.sequence) def apply_diffs(self, handler: BaseHandler, start_id: int, - max_size: int = 1024, idx: str = "", - simplify: bool = True) -> Optional[int]: + max_size: Optional[int] = None, + idx: str = "", simplify: bool = True, + end_id: Optional[int] = None) -> Optional[int]: """ Download diffs starting with sequence id `start_id`, merge them - together and then apply them to handler `handler`. `max_size` - restricts the number of diffs that are downloaded. The download - stops as soon as either a diff cannot be downloaded or the - unpacked data in memory exceeds `max_size` kB. + together and then apply them to handler `handler`. `end_id` + optionally gives the highest sequence id to download. `max_size` + allows to restrict the amount of diffs that are downloaded. + Downloaded diffs are temporarily saved in memory and this parameter + ensures that pyosmium doesn't run out of memory. `max_size` + is the maximum size in kB this internal buffer may have. + + If neither `end_id` nor `max_size` are given, the download is + restricted to a maximum size of 1MB. The download also + stops when the most recent diff has been processed. If `idx` is set, a location cache will be created and applied to the way nodes. You should be aware that diff files usually do not @@ -197,7 +221,7 @@ def apply_diffs(self, handler: BaseHandler, start_id: int, The function returns the sequence id of the last diff that was downloaded or None if the download failed completely. 
""" - diffs = self.collect_diffs(start_id, max_size) + diffs = self.collect_diffs(start_id, end_id=end_id, max_size=max_size) if diffs is None: return None @@ -206,19 +230,26 @@ def apply_diffs(self, handler: BaseHandler, start_id: int, return diffs.id - def apply_diffs_to_file(self, infile: str, outfile: str, - start_id: int, max_size: int = 1024, + def apply_diffs_to_file(self, infile: str, outfile: str, start_id: int, + max_size: Optional[int] = None, set_replication_header: bool = True, extra_headers: Optional[Mapping[str, str]] = None, - outformat: Optional[str] = None) -> Optional[Tuple[int, int]]: + outformat: Optional[str] = None, + end_id: Optional[int] = None) -> Optional[Tuple[int, int]]: """ Download diffs starting with sequence id `start_id`, merge them with the data from the OSM file named `infile` and write the result into a file with the name `outfile`. The output file must not yet exist. - `max_size` restricts the number of diffs that are downloaded. The - download stops as soon as either a diff cannot be downloaded or the - unpacked data in memory exceeds `max_size` kB. + `end_id` optionally gives the highest sequence id to download. + `max_size` allows to restrict the amount of diffs that are + downloaded. Downloaded diffs are saved in memory and this parameter + ensures that pyosmium doesn't run out of memory. `max_size` + is the maximum size in kB this internal buffer may have. + + If neither `end_id` nor `max_size` are given, the + download is restricted to a maximum size of 1MB. The download also + stops when the most recent diff has been processed. If `set_replication_header` is true then the URL of the replication server and the sequence id and timestamp of the last diff applied @@ -235,7 +266,7 @@ def apply_diffs_to_file(self, infile: str, outfile: str, newest available sequence id if new data has been written or None if no data was available or the download failed completely. 
""" - diffs = self.collect_diffs(start_id, max_size) + diffs = self.collect_diffs(start_id, end_id=end_id, max_size=max_size) if diffs is None: return None diff --git a/test/test_replication.py b/test/test_replication.py index 9fc903e..538a4cb 100644 --- a/test/test_replication.py +++ b/test/test_replication.py @@ -13,7 +13,7 @@ from werkzeug.wrappers import Response -from helpers import mkdate, CountingHandler +from helpers import mkdate, CountingHandler, IDCollector import osmium.replication.server as rserv import osmium.replication @@ -223,6 +223,33 @@ def test_apply_diffs_count(httpserver): assert h.counts == [1, 1, 1, 0] +@pytest.mark.parametrize('end_id,max_size, actual_end', [(107, None, 107), + (None, 512, 108), + (105, 512, 105), + (110, 512, 108), + (None, None, 115)]) +def test_apply_diffs_endid(httpserver, end_id, max_size, actual_end): + httpserver.expect_request('/state.txt').respond_with_data("""\ + sequenceNumber=140 + timestamp=2017-08-26T11\\:04\\:02Z + """) + for i in range(100, 141): + httpserver.expect_request(f'/000/000/{i}.opl')\ + .respond_with_data(f"r{i} M" + ",".join(f"n{i}@" for i in range(1, 3000))) + + with rserv.ReplicationServer(httpserver.url_for(''), "opl") as svr: + res = svr.collect_diffs(101, end_id=end_id, max_size=max_size) + + assert res is not None + assert res.id == actual_end + assert res.newest == 140 + + ids = IDCollector() + res.reader.apply(ids) + + assert ids.relations == list(range(101, actual_end + 1)) + + def test_apply_diffs_without_simplify(httpserver): httpserver.expect_ordered_request('/state.txt').respond_with_data("""\ sequenceNumber=100 From a2f160627470a37514c34844ed752add9b2334db Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Sat, 4 Oct 2025 12:06:57 +0200 Subject: [PATCH 2/5] add end date to pyosmium-get-changes --- src/osmium/tools/pyosmium_get_changes.py | 81 ++++++++++++++++++------ test/test_pyosmium_get_changes.py | 30 +++++++++ 2 files changed, 91 insertions(+), 20 deletions(-) diff --git 
a/src/osmium/tools/pyosmium_get_changes.py b/src/osmium/tools/pyosmium_get_changes.py index d747c55..156a039 100644 --- a/src/osmium/tools/pyosmium_get_changes.py +++ b/src/osmium/tools/pyosmium_get_changes.py @@ -1,3 +1,9 @@ +# SPDX-License-Identifier: BSD-2-Clause +# +# This file is part of pyosmium. (https://osmcode.org/pyosmium/) +# +# Copyright (C) 2025 Sarah Hoffmann and others. +# For a full list of authors see the git log. """ Fetch diffs from an OSM planet server. @@ -23,12 +29,14 @@ However, it can read cookies from a Netscape-style cookie jar file, send these cookies to the server and will save received cookies to the jar file. """ +from typing import Optional, List import sys import logging from textwrap import dedent as msgfmt from argparse import ArgumentParser, RawDescriptionHelpFormatter, ArgumentTypeError import datetime as dt +from dataclasses import dataclass import http.cookiejar from osmium.replication import server as rserv @@ -40,25 +48,34 @@ log = logging.getLogger() -class ReplicationStart(object): +@dataclass +class ReplicationStart: """ Represents the point where changeset download should begin. 
""" + date: Optional[dt.datetime] = None + seq_id: Optional[int] = None + source: Optional[str] = None - def __init__(self, date=None, seq_id=None, src=None): - self.date = date - self.seq_id = seq_id - self.source = src - - def get_sequence(self, svr): + def get_sequence(self, svr: rserv.ReplicationServer) -> Optional[int]: if self.seq_id is not None: log.debug("Using given sequence ID %d" % self.seq_id) return self.seq_id + 1 + assert self.date is not None log.debug("Looking up sequence ID for timestamp %s" % self.date) return svr.timestamp_to_sequence(self.date) + def get_end_sequence(self, svr: rserv.ReplicationServer) -> Optional[int]: + if self.seq_id is not None: + log.debug("Using end sequence ID %d" % self.seq_id) + return self.seq_id + + assert self.date is not None + log.debug("Looking up end sequence ID for timestamp %s" % self.date) + return svr.timestamp_to_sequence(self.date) + @staticmethod - def from_id(idstr): + def from_id(idstr: str) -> 'ReplicationStart': try: seq_id = int(idstr) except ValueError: @@ -70,7 +87,7 @@ def from_id(idstr): return ReplicationStart(seq_id=seq_id) @staticmethod - def from_date(datestr): + def from_date(datestr: str) -> 'ReplicationStart': try: date = dt.datetime.strptime(datestr, "%Y-%m-%dT%H:%M:%SZ") date = date.replace(tzinfo=dt.timezone.utc) @@ -81,7 +98,7 @@ def from_date(datestr): return ReplicationStart(date=date) @staticmethod - def from_osm_file(fname, ignore_headers): + def from_osm_file(fname: str, ignore_headers: bool) -> 'ReplicationStart': if ignore_headers: ts = None seq = None @@ -102,10 +119,10 @@ def from_osm_file(fname, ignore_headers): if ts is None: raise ArgumentTypeError("OSM file does not seem to contain valid data.") - return ReplicationStart(seq_id=seq, date=ts, src=url) + return ReplicationStart(seq_id=seq, date=ts, source=url) -def write_end_sequence(fname, seqid): +def write_end_sequence(fname: str, seqid: int) -> None: """Either writes out the sequence file or prints the sequence id to 
stdout. """ if fname is None: @@ -115,7 +132,7 @@ def write_end_sequence(fname, seqid): fd.write(str(seqid)) -def get_arg_parser(from_main=False): +def get_arg_parser(from_main: bool = False) -> ArgumentParser: parser = ArgumentParser(prog='pyosmium-get-changes', description=__doc__, usage=None if from_main else 'pyosmium-get-changes [options]', @@ -134,8 +151,9 @@ def get_arg_parser(from_main=False): parser.add_argument('--cookie', dest='cookie', help='Netscape-style cookie jar file to read cookies from ' 'and where received cookies will be written to.') - parser.add_argument('-s', '--size', dest='outsize', type=int, default=100, - help='Maximum data to load in MB (default: 100MB).') + parser.add_argument('-s', '--size', dest='outsize', type=int, + help='Maximum data to load in MB ' + '(Defaults to 100MB when no end date/ID has been set).') group = parser.add_mutually_exclusive_group() group.add_argument('-I', '--start-id', dest='start', type=ReplicationStart.from_id, metavar='ID', @@ -145,6 +163,13 @@ def get_arg_parser(from_main=False): help='Date when to start updates') group.add_argument('-O', '--start-osm-data', dest='start_file', metavar='OSMFILE', help='start at the date of the newest OSM object in the file') + group = parser.add_mutually_exclusive_group() + group.add_argument('--end-id', dest='end', + type=ReplicationStart.from_id, metavar='ID', + help='Last sequence ID to download.') + group.add_argument('-E', '--end-date', dest='end', metavar='DATE', + type=ReplicationStart.from_date, + help='Do not download diffs later than the given date.') parser.add_argument('-f', '--sequence-file', dest='seq_file', help='Sequence file. If the file exists, then updates ' 'will start after the id given in the file. 
At the ' @@ -164,7 +189,7 @@ def get_arg_parser(from_main=False): return parser -def pyosmium_get_changes(args): +def pyosmium_get_changes(args: List[str]) -> int: logging.basicConfig(stream=sys.stderr, format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') @@ -223,14 +248,30 @@ def pyosmium_get_changes(args): write_end_sequence(options.seq_file, startseq - 1) return 0 - log.debug("Starting download at ID %d (max %d MB)" % (startseq, options.outsize)) + log.debug("Starting download at ID %d (max %f MB)" + % (startseq, options.outsize or float('inf'))) if options.outformat is not None: outhandler = SimpleWriter(options.outfile, filetype=options.outformat) else: outhandler = SimpleWriter(options.outfile) - endseq = svr.apply_diffs(outhandler, startseq, max_size=options.outsize*1024, - simplify=options.simplify) + if options.outsize is not None: + max_size = options.outsize * 1024 + elif options.end is None: + max_size = 100 * 1024 + else: + max_size = None + + if options.end is None: + end_id = None + else: + end_id = options.end.get_end_sequence(svr) + if end_id is None: + log.error("Cannot find the end date/ID on the server.") + return 1 + + endseq = svr.apply_diffs(outhandler, startseq, max_size=max_size, + end_id=end_id, simplify=options.simplify) outhandler.close() # save cookies @@ -247,7 +288,7 @@ def pyosmium_get_changes(args): return 0 -def main(): +def main() -> int: logging.basicConfig(stream=sys.stderr, format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') diff --git a/test/test_pyosmium_get_changes.py b/test/test_pyosmium_get_changes.py index 2998121..a8c5a77 100644 --- a/test/test_pyosmium_get_changes.py +++ b/test/test_pyosmium_get_changes.py @@ -9,6 +9,8 @@ from textwrap import dedent import uuid +import pytest + import osmium.replication.server import osmium from osmium.tools.pyosmium_get_changes import pyosmium_get_changes @@ -105,3 +107,31 @@ def test_get_simple_update(self, tmp_path, httpserver): 
assert ids.nodes == [12, 13] assert ids.ways == [2] assert ids.relations == [] + + @pytest.mark.parametrize('end_id,max_size,actual_end', [(107, None, 107), + (None, 1, 108), + (105, 1, 105), + (110, 1, 108)]) + def test_apply_diffs_endid(self, tmp_path, httpserver, end_id, max_size, actual_end): + outfile = tmp_path / f"{uuid.uuid4()}.opl" + + httpserver.expect_request('/state.txt').respond_with_data("""\ + sequenceNumber=140 + timestamp=2017-08-26T11\\:04\\:02Z + """) + for i in range(100, 141): + httpserver.expect_request(f'/000/000/{i}.opl')\ + .respond_with_data(f"r{i} M" + ",".join(f"n{i}@" for i in range(1, 6000))) + + params = [httpserver, '--diff-type', 'opl', '-I', '100', '-o', str(outfile)] + if end_id is not None: + params.extend(('--end-id', str(end_id))) + if max_size is not None: + params.extend(('-s', str(max_size))) + + assert 0 == self.main(*params) + + ids = IDCollector() + osmium.apply(str(outfile), ids) + + assert ids.relations == list(range(101, actual_end + 1)) From a4d2f5619e7e3700d20d14ff4286151a715a7734 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Sat, 4 Oct 2025 18:03:49 +0200 Subject: [PATCH 3/5] add parameter for end date/ID to pyosmium_up_to_date Also moves some code that is shared between the two tools into a separate file. 
--- pyproject.toml | 3 + src/osmium/tools/common.py | 92 ++++++++++++ src/osmium/tools/pyosmium_get_changes.py | 91 +----------- src/osmium/tools/pyosmium_up_to_date.py | 167 ++++++++++++---------- test/test_pyosmium_get_changes.py | 1 - test/test_pyosmium_up-to-date.py | 170 +++++++++++++++++++++++ 6 files changed, 369 insertions(+), 155 deletions(-) create mode 100644 src/osmium/tools/common.py create mode 100644 test/test_pyosmium_up-to-date.py diff --git a/pyproject.toml b/pyproject.toml index d7a54a1..0916563 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -101,3 +101,6 @@ include = ['/src/**/*.py', '/contrib/protozero/LICENSE', '/contrib/protozero/README.md', ] + +[tool.pytest.ini_options] +log_cli = false diff --git a/src/osmium/tools/common.py b/src/osmium/tools/common.py new file mode 100644 index 0000000..aba1346 --- /dev/null +++ b/src/osmium/tools/common.py @@ -0,0 +1,92 @@ +# SPDX-License-Identifier: BSD-2-Clause +# +# This file is part of pyosmium. (https://osmcode.org/pyosmium/) +# +# Copyright (C) 2025 Sarah Hoffmann and others. +# For a full list of authors see the git log. +from typing import Optional +import logging +from dataclasses import dataclass +import datetime as dt +from argparse import ArgumentTypeError + +from ..replication import newest_change_from_file +from ..replication.server import ReplicationServer +from ..replication.utils import get_replication_header + + +log = logging.getLogger() + + +@dataclass +class ReplicationStart: + """ Represents the point where changeset download should begin. 
+ """ + date: Optional[dt.datetime] = None + seq_id: Optional[int] = None + source: Optional[str] = None + + def get_sequence(self, svr: ReplicationServer) -> Optional[int]: + if self.seq_id is not None: + log.debug("Using given sequence ID %d" % self.seq_id) + return self.seq_id + 1 + + assert self.date is not None + log.debug("Looking up sequence ID for timestamp %s" % self.date) + return svr.timestamp_to_sequence(self.date) + + def get_end_sequence(self, svr: ReplicationServer) -> Optional[int]: + if self.seq_id is not None: + log.debug("Using end sequence ID %d" % self.seq_id) + return self.seq_id + + assert self.date is not None + log.debug("Looking up end sequence ID for timestamp %s" % self.date) + return svr.timestamp_to_sequence(self.date) + + @staticmethod + def from_id(idstr: str) -> 'ReplicationStart': + try: + seq_id = int(idstr) + except ValueError: + raise ArgumentTypeError("Sequence id '%s' is not a number" % idstr) + + if seq_id < -1: + raise ArgumentTypeError("Sequence id '%s' is negative" % idstr) + + return ReplicationStart(seq_id=seq_id) + + @staticmethod + def from_date(datestr: str) -> 'ReplicationStart': + try: + date = dt.datetime.strptime(datestr, "%Y-%m-%dT%H:%M:%SZ") + date = date.replace(tzinfo=dt.timezone.utc) + except ValueError: + raise ArgumentTypeError( + "Date needs to be in ISO8601 format (e.g. 2015-12-24T08:08:08Z).") + + return ReplicationStart(date=date) + + @staticmethod + def from_osm_file(fname: str, ignore_headers: bool) -> 'ReplicationStart': + if ignore_headers: + ts = None + seq = None + url = None + else: + try: + (url, seq, ts) = get_replication_header(fname) + except RuntimeError as e: + raise ArgumentTypeError(e) + + if ts is None and seq is None: + log.debug("OSM file has no replication headers. 
Looking for newest OSM object.") + try: + ts = newest_change_from_file(fname) + except RuntimeError as e: + raise ArgumentTypeError(e) + + if ts is None: + raise ArgumentTypeError("OSM file does not seem to contain valid data.") + + return ReplicationStart(seq_id=seq, date=ts, source=url) diff --git a/src/osmium/tools/pyosmium_get_changes.py b/src/osmium/tools/pyosmium_get_changes.py index 156a039..2e4f7dc 100644 --- a/src/osmium/tools/pyosmium_get_changes.py +++ b/src/osmium/tools/pyosmium_get_changes.py @@ -29,97 +29,20 @@ However, it can read cookies from a Netscape-style cookie jar file, send these cookies to the server and will save received cookies to the jar file. """ -from typing import Optional, List +from typing import List import sys import logging from textwrap import dedent as msgfmt - -from argparse import ArgumentParser, RawDescriptionHelpFormatter, ArgumentTypeError -import datetime as dt -from dataclasses import dataclass +from argparse import ArgumentParser, RawDescriptionHelpFormatter import http.cookiejar -from osmium.replication import server as rserv -from osmium.replication import newest_change_from_file -from osmium.replication.utils import get_replication_header -from osmium.version import pyosmium_release -from osmium import SimpleWriter - -log = logging.getLogger() +from ..replication import server as rserv +from ..version import pyosmium_release +from .. import SimpleWriter +from .common import ReplicationStart -@dataclass -class ReplicationStart: - """ Represents the point where changeset download should begin. 
- """ - date: Optional[dt.datetime] = None - seq_id: Optional[int] = None - source: Optional[str] = None - - def get_sequence(self, svr: rserv.ReplicationServer) -> Optional[int]: - if self.seq_id is not None: - log.debug("Using given sequence ID %d" % self.seq_id) - return self.seq_id + 1 - - assert self.date is not None - log.debug("Looking up sequence ID for timestamp %s" % self.date) - return svr.timestamp_to_sequence(self.date) - - def get_end_sequence(self, svr: rserv.ReplicationServer) -> Optional[int]: - if self.seq_id is not None: - log.debug("Using end sequence ID %d" % self.seq_id) - return self.seq_id - - assert self.date is not None - log.debug("Looking up end sequence ID for timestamp %s" % self.date) - return svr.timestamp_to_sequence(self.date) - - @staticmethod - def from_id(idstr: str) -> 'ReplicationStart': - try: - seq_id = int(idstr) - except ValueError: - raise ArgumentTypeError("Sequence id '%s' is not a number" % idstr) - - if seq_id < -1: - raise ArgumentTypeError("Sequence id '%s' is negative" % idstr) - - return ReplicationStart(seq_id=seq_id) - - @staticmethod - def from_date(datestr: str) -> 'ReplicationStart': - try: - date = dt.datetime.strptime(datestr, "%Y-%m-%dT%H:%M:%SZ") - date = date.replace(tzinfo=dt.timezone.utc) - except ValueError: - raise ArgumentTypeError( - "Date needs to be in ISO8601 format (e.g. 2015-12-24T08:08:08Z).") - - return ReplicationStart(date=date) - - @staticmethod - def from_osm_file(fname: str, ignore_headers: bool) -> 'ReplicationStart': - if ignore_headers: - ts = None - seq = None - url = None - else: - try: - (url, seq, ts) = get_replication_header(fname) - except RuntimeError as e: - raise ArgumentTypeError(e) - - if ts is None and seq is None: - log.debug("OSM file has no replication headers. 
Looking for newest OSM object.") - try: - ts = newest_change_from_file(fname) - except RuntimeError as e: - raise ArgumentTypeError(e) - - if ts is None: - raise ArgumentTypeError("OSM file does not seem to contain valid data.") - - return ReplicationStart(seq_id=seq, date=ts, source=url) +log = logging.getLogger() def write_end_sequence(fname: str, seqid: int) -> None: diff --git a/src/osmium/tools/pyosmium_up_to_date.py b/src/osmium/tools/pyosmium_up_to_date.py index 74aaf5b..53424b5 100644 --- a/src/osmium/tools/pyosmium_up_to_date.py +++ b/src/osmium/tools/pyosmium_up_to_date.py @@ -1,3 +1,9 @@ +# SPDX-License-Identifier: BSD-2-Clause +# +# This file is part of pyosmium. (https://osmcode.org/pyosmium/) +# +# Copyright (C) 2025 Sarah Hoffmann and others. +# For a full list of authors see the git log. """ Update an OSM file with changes from a OSM replication server. @@ -29,38 +35,42 @@ However, it can read cookies from a Netscape-style cookie jar file, send these cookies to the server and will save received cookies to the jar file. """ +from typing import Any, List import sys import traceback import logging import http.cookiejar -from argparse import ArgumentParser, RawDescriptionHelpFormatter +from argparse import ArgumentParser, RawDescriptionHelpFormatter, ArgumentTypeError import datetime as dt -from osmium.replication import server as rserv -from osmium.replication.utils import get_replication_header -from osmium.replication import newest_change_from_file -from osmium.version import pyosmium_release from textwrap import dedent as msgfmt from tempfile import mktemp import os.path +from ..replication import server as rserv +from ..version import pyosmium_release +from .common import ReplicationStart + log = logging.getLogger() -def update_from_osm_server(ts, options): - """Update the OSM file using the official OSM servers at - https://planet.osm.org/replication. This strategy will attempt - to start with daily updates before going down to minutelies. 
- TODO: only updates from hourlies currently implemented. +def update_from_osm_server(start: ReplicationStart, options: Any) -> int: + """ Update the OSM file using the official OSM servers at + https://planet.osm.org/replication. This strategy will attempt + to start with daily updates before going down to minutelies. + TODO: only updates from hourlies currently implemented. """ - return update_from_custom_server("https://planet.osm.org/replication/hour/", - None, ts, options) + start.source = "https://planet.osm.org/replication/hour/" + return update_from_custom_server(start, options) + +def update_from_custom_server(start: ReplicationStart, options: Any) -> int: + """ Update from a custom URL, simply using the diff sequence as is. + """ + assert start.source -def update_from_custom_server(url, seq, ts, options): - """Update from a custom URL, simply using the diff sequence as is.""" - with rserv.ReplicationServer(url, "osc.gz") as svr: - log.info("Using replication service at %s", url) + with rserv.ReplicationServer(start.source, options.server_diff_type) as svr: + log.info(f"Using replication service at {start.source}") svr.set_request_parameter('timeout', options.socket_timeout or None) @@ -73,39 +83,37 @@ def update_from_custom_server(url, seq, ts, options): if current is None: log.error("Cannot download state information. Is the replication URL correct?") return 3 - log.debug("Server is at sequence %d (%s).", current.sequence, current.timestamp) - - if seq is None: - log.info("Using timestamp %s as starting point." % ts) - startseq = svr.timestamp_to_sequence(ts) - if startseq is None: - log.error("No starting point found for time %s on server %s" - % (str(ts), url)) - return 3 - else: - if seq >= current.sequence: - log.info("File is already up to date.") - return 0 - - log.debug("Using given sequence ID %d" % seq) - startseq = seq + 1 - ts = svr.get_state_info(seq=startseq) - if ts is None: - log.error("Cannot download state information for ID %d. 
Is the URL correct?" % seq) + log.debug(f"Server is at sequence {current.sequence} ({current.timestamp}).") + + if start.seq_id is not None and start.seq_id >= current.sequence: + log.info("File is already up to date.") + return 0 + + startseq = start.get_sequence(svr) + if startseq is None: + log.error(f"No starting point found for time {start.date} on server {start.source}") + return 3 + + if start.date is None: + start_state = svr.get_state_info(seq=startseq) + if start_state is None: + log.error(f"Cannot download state information for ID {startseq}. " + 'Is the URL correct?') return 3 - ts = ts.timestamp + start.date = start_state.timestamp if not options.force_update: cmpdate = dt.datetime.now(dt.timezone.utc) - dt.timedelta(days=90) cmpdate = cmpdate.replace(tzinfo=dt.timezone.utc) - if ts < cmpdate: + if start.date < cmpdate: log.error( """The OSM file is more than 3 months old. You should download a more recent file instead of updating. If you really want to update the file, use --force-update-of-old-planet.""") return 3 - log.info("Starting download at ID %d (max %d MB)" % (startseq, options.outsize)) + log.info("Starting download at ID %d (max %f MB)" + % (startseq, options.outsize or float('inf'))) outfile = options.outfile infile = options.infile @@ -118,10 +126,25 @@ def update_from_custom_server(url, seq, ts, options): else: ofname = outfile + if options.outsize is not None: + max_size = options.outsize * 1024 + elif options.end is None: + max_size = 1024 * 1024 + else: + max_size = None + + if options.end is None: + end_id = None + else: + end_id = options.end.get_end_sequence(svr) + if end_id is None: + log.error("Cannot find the end date/ID on the server.") + return 1 + try: extra_headers = {'generator': 'pyosmium-up-to-date/' + pyosmium_release} outseqs = svr.apply_diffs_to_file(infile, ofname, startseq, - max_size=options.outsize*1024, + max_size=max_size, end_id=end_id, extra_headers=extra_headers, outformat=options.outformat) @@ -144,41 +167,35 
@@ def update_from_custom_server(url, seq, ts, options): if options.cookie: cookie_jar.save(options.cookie) - return 0 if outseqs[1] == outseqs[0] else 1 + return 0 if (end_id or outseqs[1]) == outseqs[0] else 1 -def compute_start_point(options): - if options.ignore_headers: - url, seq, ts = None, None, None - else: - url, seq, ts = get_replication_header(options.infile) +def compute_start_point(options: Any) -> ReplicationStart: + start = ReplicationStart.from_osm_file(options.infile, options.ignore_headers) if options.server_url is not None: - if url is not None and url != options.server_url: + if start.source is not None and start.source != options.server_url: log.error(msgfmt(f""" You asked to use server URL: {options.server_url} but the referenced OSM file points to replication server: - {url} + {start.source} If you really mean to overwrite the URL, use --ignore-osmosis-headers.""")) - exit(2) - url = options.server_url - - if seq is None and ts is None: - log.info("No replication information found, scanning for newest OSM object.") - ts = newest_change_from_file(options.infile) - - if ts is None: - log.error("OSM file does not seem to contain valid data.") - exit(2) + raise ArgumentTypeError("Source URL doesn't match replication headers.") + if start.source is None: + start.source = options.server_url + start.seq_id = None + if start.date is None: + raise ArgumentTypeError("Cannot determine start date for file.") - if ts is not None: - ts -= dt.timedelta(minutes=options.wind_back) + if start.seq_id is None: + assert start.date is not None + start.date -= dt.timedelta(minutes=options.wind_back) - return url, seq, ts + return start -def get_arg_parser(from_main=False): +def get_arg_parser(from_main: bool = False) -> ArgumentParser: parser = ArgumentParser(prog='pyosmium-up-to-date', description=__doc__, @@ -197,8 +214,18 @@ def get_arg_parser(from_main=False): help='Base URL of the replication server. 
Default: ' 'https://planet.osm.org/replication/hour/ ' '(hourly diffs from osm.org)') - parser.add_argument('-s', '--size', dest='outsize', metavar='SIZE', type=int, default=1024, - help='Maximum size of change to apply at once in MB. Default: 1GB') + parser.add_argument('--diff-type', action='store', dest='server_diff_type', default='osc.gz', + help='File format used by the replication server (default: osc.gz)') + parser.add_argument('-s', '--size', dest='outsize', metavar='SIZE', type=int, + help='Maximum size of change to apply at once in MB. ' + 'Defaults to 1GB when no end ID or date was given.') + group = parser.add_mutually_exclusive_group() + group.add_argument('--end-id', dest='end', + type=ReplicationStart.from_id, metavar='ID', + help='Last sequence ID to download.') + group.add_argument('-E', '--end-date', dest='end', metavar='DATE', + type=ReplicationStart.from_date, + help='Do not download diffs later than the given date.') parser.add_argument('--tmpdir', dest='tmpdir', help='Directory to use for temporary files. 
' 'Usually the directory of input file is used.') @@ -225,28 +252,28 @@ def get_arg_parser(from_main=False): return parser -def pyosmium_up_to_date(args): - options = get_arg_parser(from_main=True).parse_args() +def pyosmium_up_to_date(args: List[str]) -> int: + options = get_arg_parser(from_main=True).parse_args(args) log.setLevel(max(3 - options.loglevel, 0) * 10) try: - url, seq, ts = compute_start_point(options) + start = compute_start_point(options) except RuntimeError as e: log.error(str(e)) return 2 try: - if url is None: - return update_from_osm_server(ts, options) + if start.source is None: + return update_from_osm_server(start, options) - return update_from_custom_server(url, seq, ts, options) + return update_from_custom_server(start, options) except Exception: traceback.print_exc() return 254 -def main(): +def main() -> int: logging.basicConfig(stream=sys.stderr, format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') diff --git a/test/test_pyosmium_get_changes.py b/test/test_pyosmium_get_changes.py index a8c5a77..ca2044b 100644 --- a/test/test_pyosmium_get_changes.py +++ b/test/test_pyosmium_get_changes.py @@ -11,7 +11,6 @@ import pytest -import osmium.replication.server import osmium from osmium.tools.pyosmium_get_changes import pyosmium_get_changes diff --git a/test/test_pyosmium_up-to-date.py b/test/test_pyosmium_up-to-date.py new file mode 100644 index 0000000..4c7706a --- /dev/null +++ b/test/test_pyosmium_up-to-date.py @@ -0,0 +1,170 @@ +# SPDX-License-Identifier: BSD-2-Clause +# +# This file is part of pyosmium. (https://osmcode.org/pyosmium/) +# +# Copyright (C) 2025 Sarah Hoffmann and others. +# For a full list of authors see the git log. +""" Tests for the pyosmium-up-to-date script. 
+""" +import uuid +import datetime as dt + +import pytest +import osmium +from osmium.tools.pyosmium_up_to_date import pyosmium_up_to_date +import osmium.replication.utils as rutil + +from helpers import IDCollector + +# Choosing a future date here, so we don't run into pyosmium's check for old +# data. If you get caught by this: congratulations, you are maintaining a +# 50-year old test. +REPLICATION_BASE_TIME = dt.datetime(year=2070, month=5, day=6, hour=20, tzinfo=dt.timezone.utc) +REPLICATION_BASE_SEQ = 100 +REPLICATION_CURRENT = 140 + + +@pytest.fixture +def replication_server(httpserver): + def _state(seq): + seqtime = REPLICATION_BASE_TIME + dt.timedelta(hours=seq - REPLICATION_CURRENT) + timestamp = seqtime.strftime('%Y-%m-%dT%H\\:%M\\:%SZ') + return f"sequenceNumber={seq}\ntimestamp={timestamp}\n" + + httpserver.no_handler_status_code = 404 + httpserver.expect_request('/state.txt').respond_with_data(_state(REPLICATION_CURRENT)) + for i in range(REPLICATION_BASE_SEQ, REPLICATION_CURRENT + 1): + httpserver.expect_request(f'/000/000/{i}.opl')\ + .respond_with_data(f"r{i} M" + ",".join(f"n{i}@" for i in range(1, 6000))) + httpserver.expect_request(f'/000/000/{i}.state.txt').respond_with_data(_state(i)) + + return httpserver.url_for('') + + +@pytest.fixture +def runner(replication_server): + def _run(*args): + return pyosmium_up_to_date( + ['--server', replication_server, '--diff-type', 'opl'] + list(map(str, args))) + + return _run + + +def test_no_output_file(runner): + with pytest.raises(SystemExit): + runner() + + +def test_simple_update_no_windback(runner, test_data): + outfile = test_data("n1 v1 t2070-05-06T19:30:00Z") + + assert 0 == runner('--wind-back', 0, outfile) + + ids = IDCollector() + osmium.apply(outfile, ids) + + assert ids.nodes == [1] + assert ids.relations == list(range(139, REPLICATION_CURRENT + 1)) + + +def test_simple_update_override(runner, test_data): + outfile = test_data("n1 v1 t2070-05-06T19:30:00Z") + + assert 0 == runner(outfile) + 
+ ids = IDCollector() + osmium.apply(outfile, ids) + + assert ids.nodes == [1] + assert ids.relations == list(range(138, REPLICATION_CURRENT + 1)) + + +def test_simple_update_new_file(runner, replication_server, test_data, tmp_path): + outfile = test_data("n1 v1 t2070-05-06T19:30:00Z") + newfile = tmp_path / f"{uuid.uuid4()}.pbf" + + assert 0 == runner('-o', str(newfile), outfile) + + ids = IDCollector() + osmium.apply(outfile, ids) + + assert ids.nodes == [1] + assert ids.relations == [] + + ids = IDCollector() + osmium.apply(newfile, ids) + assert ids.nodes == [1] + assert ids.relations == list(range(138, REPLICATION_CURRENT + 1)) + + header = rutil.get_replication_header(newfile) + + assert header.url == replication_server + assert header.sequence == REPLICATION_CURRENT + assert header.timestamp == REPLICATION_BASE_TIME + + +def test_update_sequences(runner, test_data, tmp_path): + outfile = test_data("n1 v1 t2070-05-05T10:30:00Z") + newfile = tmp_path / f"{uuid.uuid4()}.pbf" + + assert 0 == runner('--end-id', '110', '-o', str(newfile), outfile) + + ids = IDCollector() + osmium.apply(newfile, ids) + assert ids.nodes == [1] + assert ids.relations == list(range(105, 111)) + + header = rutil.get_replication_header(newfile) + + assert header.sequence == 110 + + # Note: this test only catches holes, no duplicate application. 
+ assert 0 == runner(newfile) + + ids = IDCollector() + osmium.apply(newfile, ids) + assert ids.nodes == [1] + assert ids.relations == list(range(105, REPLICATION_CURRENT + 1)) + + header = rutil.get_replication_header(newfile) + + assert header.sequence == REPLICATION_CURRENT + + +@pytest.mark.parametrize('end_id,max_size,actual_end', [(107, None, 107), + (None, 1, 108), + (105, 1, 105), + (110, 1, 108)]) +def test_update_with_endid(test_data, runner, end_id, max_size, actual_end): + outfile = test_data("n1 v1 t2070-05-05T06:30:00Z") + + params = [outfile] + if end_id is not None: + params.extend(('--end-id', end_id)) + if max_size is not None: + params.extend(('-s', max_size)) + + assert (0 if end_id == actual_end else 1) == runner(*params) + + ids = IDCollector() + osmium.apply(outfile, ids) + + assert ids.relations == list(range(101, actual_end + 1)) + + +def test_update_with_enddate(test_data, runner, tmp_path): + outfile = test_data("n1 v1 t2070-05-05T06:30:00Z") + newfile = tmp_path / f"{uuid.uuid4()}.pbf" + + assert 0 == runner('-E', '2070-05-05T09:30:00Z', '-o', newfile, outfile) + + header = rutil.get_replication_header(newfile) + + assert header.sequence == 105 + assert header.timestamp == dt.datetime(year=2070, month=5, day=5, hour=9, + tzinfo=dt.timezone.utc) + + ids = IDCollector() + osmium.apply(newfile, ids) + + assert ids.relations == list(range(101, 106)) From 79d33d947463afe6f88448933b85c4a1bc2e0191 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Sun, 5 Oct 2025 08:38:33 +0200 Subject: [PATCH 4/5] build man pages against installed osmium, not against source --- docs/Makefile | 4 ++-- docs/man/pyosmium-get-changes.1 | 12 ++++++++++-- docs/man/pyosmium-up-to-date.1 | 16 ++++++++++++++-- 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/docs/Makefile b/docs/Makefile index 56d8dd0..8ecb11d 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -2,7 +2,7 @@ ARGPARSE_BASEARGS=--author 'Sarah Hoffmann' --author-email 'lonvia@denofr.de' -- 
man: mkdir -p man - argparse-manpage --pyfile ../src/osmium/tools/pyosmium_get_changes.py --function get_arg_parser ${ARGPARSE_BASEARGS} --output man/pyosmium-get-changes.1 - argparse-manpage --pyfile ../src/osmium/tools/pyosmium_up_to_date.py --function get_arg_parser ${ARGPARSE_BASEARGS} --output man/pyosmium-up-to-date.1 + argparse-manpage --module osmium.tools.pyosmium_get_changes --function get_arg_parser ${ARGPARSE_BASEARGS} --output man/pyosmium-get-changes.1 + argparse-manpage --module osmium.tools.pyosmium_up_to_date --function get_arg_parser ${ARGPARSE_BASEARGS} --output man/pyosmium-up-to-date.1 .PHONY: man diff --git a/docs/man/pyosmium-get-changes.1 b/docs/man/pyosmium-get-changes.1 index 7df3fa7..4245e3e 100644 --- a/docs/man/pyosmium-get-changes.1 +++ b/docs/man/pyosmium-get-changes.1 @@ -1,4 +1,4 @@ -.TH PYOSMIUM\-GET\-CHANGES "1" "2025\-09\-09" "pyosmium" "Generated Python Manual" +.TH PYOSMIUM\-GET\-CHANGES "1" "2025\-10\-05" "pyosmium" "Generated Python Manual" .SH NAME pyosmium\-get\-changes .SH SYNOPSIS @@ -56,7 +56,7 @@ Netscape\-style cookie jar file to read cookies from and where received cookies .TP \fB\-s\fR \fI\,OUTSIZE\/\fR, \fB\-\-size\fR \fI\,OUTSIZE\/\fR -Maximum data to load in MB (default: 100MB). +Maximum data to load in MB (Defaults to 100MB when no end date/ID has been set). .TP \fB\-I\fR \fI\,ID\/\fR, \fB\-\-start\-id\fR \fI\,ID\/\fR @@ -70,6 +70,14 @@ Date when to start updates \fB\-O\fR \fI\,OSMFILE\/\fR, \fB\-\-start\-osm\-data\fR \fI\,OSMFILE\/\fR start at the date of the newest OSM object in the file +.TP +\fB\-\-end\-id\fR \fI\,ID\/\fR +Last sequence ID to download. + +.TP +\fB\-E\fR \fI\,DATE\/\fR, \fB\-\-end\-date\fR \fI\,DATE\/\fR +Do not download diffs later than the given date. + .TP \fB\-f\fR \fI\,SEQ_FILE\/\fR, \fB\-\-sequence\-file\fR \fI\,SEQ_FILE\/\fR Sequence file. If the file exists, then updates will start after the id given in the file. 
At the end of the process, the last sequence ID contained in the diff is written. diff --git a/docs/man/pyosmium-up-to-date.1 b/docs/man/pyosmium-up-to-date.1 index 00650ec..e5a9bda 100644 --- a/docs/man/pyosmium-up-to-date.1 +++ b/docs/man/pyosmium-up-to-date.1 @@ -1,4 +1,4 @@ -.TH PYOSMIUM\-UP\-TO\-DATE "1" "2025\-09\-09" "pyosmium" "Generated Python Manual" +.TH PYOSMIUM\-UP\-TO\-DATE "1" "2025\-10\-05" "pyosmium" "Generated Python Manual" .SH NAME pyosmium\-up\-to\-date .SH SYNOPSIS @@ -56,9 +56,21 @@ Format the data should be saved in. Usually determined from file name. \fB\-\-server\fR \fI\,SERVER_URL\/\fR Base URL of the replication server. Default: https://planet.osm.org/replication/hour/ (hourly diffs from osm.org) +.TP +\fB\-\-diff\-type\fR \fI\,SERVER_DIFF_TYPE\/\fR +File format used by the replication server (default: osc.gz) + .TP \fB\-s\fR \fI\,SIZE\/\fR, \fB\-\-size\fR \fI\,SIZE\/\fR -Maximum size of change to apply at once in MB. Default: 1GB +Maximum size of change to apply at once in MB. Defaults to 1GB when no end ID or date was given. + +.TP +\fB\-\-end\-id\fR \fI\,ID\/\fR +Last sequence ID to download. + +.TP +\fB\-E\fR \fI\,DATE\/\fR, \fB\-\-end\-date\fR \fI\,DATE\/\fR +Do not download diffs later than the given date. .TP \fB\-\-tmpdir\fR \fI\,TMPDIR\/\fR From e3c8211efc56806ff16be25dfdff590f3b09373e Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Sun, 5 Oct 2025 10:15:00 +0200 Subject: [PATCH 5/5] use replace() instead of rename() The latter cannot replace an existing file on Windows.
--- src/osmium/tools/pyosmium_up_to_date.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/osmium/tools/pyosmium_up_to_date.py b/src/osmium/tools/pyosmium_up_to_date.py index 53424b5..f3e7c15 100644 --- a/src/osmium/tools/pyosmium_up_to_date.py +++ b/src/osmium/tools/pyosmium_up_to_date.py @@ -153,7 +153,7 @@ def update_from_custom_server(start: ReplicationStart, options: Any) -> int: return 3 if outfile is None: - os.rename(ofname, infile) + os.replace(ofname, infile) finally: if outfile is None: try: