Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ ARGPARSE_BASEARGS=--author 'Sarah Hoffmann' --author-email 'lonvia@denofr.de' --

man:
mkdir -p man
argparse-manpage --pyfile ../src/osmium/tools/pyosmium_get_changes.py --function get_arg_parser ${ARGPARSE_BASEARGS} --output man/pyosmium-get-changes.1
argparse-manpage --pyfile ../src/osmium/tools/pyosmium_up_to_date.py --function get_arg_parser ${ARGPARSE_BASEARGS} --output man/pyosmium-up-to-date.1
argparse-manpage --module osmium.tools.pyosmium_get_changes --function get_arg_parser ${ARGPARSE_BASEARGS} --output man/pyosmium-get-changes.1
argparse-manpage --module osmium.tools.pyosmium_up_to_date --function get_arg_parser ${ARGPARSE_BASEARGS} --output man/pyosmium-up-to-date.1

.PHONY: man
12 changes: 10 additions & 2 deletions docs/man/pyosmium-get-changes.1
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
.TH PYOSMIUM\-GET\-CHANGES "1" "2025\-09\-09" "pyosmium" "Generated Python Manual"
.TH PYOSMIUM\-GET\-CHANGES "1" "2025\-10\-05" "pyosmium" "Generated Python Manual"
.SH NAME
pyosmium\-get\-changes
.SH SYNOPSIS
Expand Down Expand Up @@ -56,7 +56,7 @@ Netscape\-style cookie jar file to read cookies from and where received cookies

.TP
\fB\-s\fR \fI\,OUTSIZE\/\fR, \fB\-\-size\fR \fI\,OUTSIZE\/\fR
Maximum data to load in MB (default: 100MB).
Maximum data to load in MB (Defaults to 100MB when no end date/ID has been set).

.TP
\fB\-I\fR \fI\,ID\/\fR, \fB\-\-start\-id\fR \fI\,ID\/\fR
Expand All @@ -70,6 +70,14 @@ Date when to start updates
\fB\-O\fR \fI\,OSMFILE\/\fR, \fB\-\-start\-osm\-data\fR \fI\,OSMFILE\/\fR
start at the date of the newest OSM object in the file

.TP
\fB\-\-end\-id\fR \fI\,ID\/\fR
Last sequence ID to download.

.TP
\fB\-E\fR \fI\,DATE\/\fR, \fB\-\-end\-date\fR \fI\,DATE\/\fR
Do not download diffs later than the given date.

.TP
\fB\-f\fR \fI\,SEQ_FILE\/\fR, \fB\-\-sequence\-file\fR \fI\,SEQ_FILE\/\fR
Sequence file. If the file exists, then updates will start after the id given in the file. At the end of the process, the last sequence ID contained in the diff is written.
Expand Down
16 changes: 14 additions & 2 deletions docs/man/pyosmium-up-to-date.1
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
.TH PYOSMIUM\-UP\-TO\-DATE "1" "2025\-09\-09" "pyosmium" "Generated Python Manual"
.TH PYOSMIUM\-UP\-TO\-DATE "1" "2025\-10\-05" "pyosmium" "Generated Python Manual"
.SH NAME
pyosmium\-up\-to\-date
.SH SYNOPSIS
Expand Down Expand Up @@ -56,9 +56,21 @@ Format the data should be saved in. Usually determined from file name.
\fB\-\-server\fR \fI\,SERVER_URL\/\fR
Base URL of the replication server. Default: https://planet.osm.org/replication/hour/ (hourly diffs from osm.org)

.TP
\fB\-\-diff\-type\fR \fI\,SERVER_DIFF_TYPE\/\fR
File format used by the replication server (default: osc.gz)

.TP
\fB\-s\fR \fI\,SIZE\/\fR, \fB\-\-size\fR \fI\,SIZE\/\fR
Maximum size of change to apply at once in MB. Default: 1GB
Maximum size of change to apply at once in MB. Defaults to 1GB when no end ID or date was given.

.TP
\fB\-\-end\-id\fR \fI\,ID\/\fR
Last sequence ID to download.

.TP
\fB\-E\fR \fI\,DATE\/\fR, \fB\-\-end\-date\fR \fI\,DATE\/\fR
Do not download diffs later than the given date.

.TP
\fB\-\-tmpdir\fR \fI\,TMPDIR\/\fR
Expand Down
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -101,3 +101,6 @@ include = ['/src/**/*.py',
'/contrib/protozero/LICENSE',
'/contrib/protozero/README.md',
]

[tool.pytest.ini_options]
log_cli = false
89 changes: 60 additions & 29 deletions src/osmium/replication/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# For a full list of authors see the git log.
""" Helper functions to communicate with replication servers.
"""
from typing import NamedTuple, Optional, Any, Iterator, cast, Mapping, Tuple
from typing import NamedTuple, Optional, Any, Iterator, cast, Mapping, Tuple, Dict
import urllib.request as urlrequest
from urllib.error import URLError
import datetime as dt
Expand Down Expand Up @@ -67,7 +67,7 @@ def __init__(self, url: str, diff_type: str = 'osc.gz') -> None:

self.baseurl = url
self.diff_type = diff_type
self.extra_request_params: dict[str, Any] = dict(timeout=60, stream=True)
self.extra_request_params: Dict[str, Any] = dict(timeout=60, stream=True)
self.session: Optional[requests.Session] = None
self.retry = Retry(total=3, backoff_factor=0.5, allowed_methods={'GET'},
status_forcelist=[408, 429, 500, 502, 503, 504])
Expand Down Expand Up @@ -125,12 +125,19 @@ def _get_url_with_session() -> Iterator[requests.Response]:

return _get_url_with_session()

def collect_diffs(self, start_id: int, max_size: int = 1024) -> Optional[DownloadResult]:
def collect_diffs(self, start_id: int, max_size: Optional[int] = None,
end_id: Optional[int] = None) -> Optional[DownloadResult]:
""" Create a MergeInputReader and download diffs starting with sequence
id `start_id` into it. `max_size`
restricts the number of diffs that are downloaded. The download
stops as soon as either a diff cannot be downloaded or the
unpacked data in memory exceeds `max_size` kB.
id `start_id` into it. `end_id` optionally gives the highest
sequence number to download. `max_size` restricts the number of
diffs that are downloaded by size. If neither `end_id` nor
`max_size` is given, then the download stops by default after 1MB.

The download stops as soon as
1. a diff cannot be downloaded or
2. the end_id (inclusive) is reached or
3. the unpacked data in memory exceeds `max_size` kB or,
when no `end_id` and `max_size` are given, 1024kB.

If some data was downloaded, returns a namedtuple with three fields:
`id` contains the sequence id of the last downloaded diff, `reader`
Expand All @@ -140,19 +147,25 @@ def collect_diffs(self, start_id: int, max_size: int = 1024) -> Optional[Downloa
Returns None if there was an error during download or no new
data was available.
"""
left_size = max_size * 1024
current_id = start_id

# must not read data newer than the published sequence id
# or we might end up reading partial data
newest = self.get_state_info()

if newest is None or current_id > newest.sequence:
if newest is None or start_id > newest.sequence:
return None

current_id = start_id
left_size: Optional[int] = None
if max_size is not None:
left_size = max_size * 1024
elif end_id is None:
left_size = 1024 * 1024

rd = MergeInputReader()

while left_size > 0 and current_id <= newest.sequence:
while (left_size is None or left_size > 0) \
and (end_id is None or current_id <= end_id) \
and current_id <= newest.sequence:
try:
diffdata = self.get_diff_block(current_id)
except: # noqa: E722
Expand All @@ -163,21 +176,32 @@ def collect_diffs(self, start_id: int, max_size: int = 1024) -> Optional[Downloa
return None
break

left_size -= rd.add_buffer(diffdata, self.diff_type)
LOG.debug("Downloaded change %d. (%d kB available in download buffer)",
current_id, left_size / 1024)
diff_size = rd.add_buffer(diffdata, self.diff_type)
if left_size is None:
LOG.debug("Downloaded change %d.", current_id)
else:
left_size -= diff_size
LOG.debug("Downloaded change %d. (%d kB available in download buffer)",
current_id, left_size / 1024)
current_id += 1

return DownloadResult(current_id - 1, rd, newest.sequence)

def apply_diffs(self, handler: BaseHandler, start_id: int,
max_size: int = 1024, idx: str = "",
simplify: bool = True) -> Optional[int]:
max_size: Optional[int] = None,
idx: str = "", simplify: bool = True,
end_id: Optional[int] = None) -> Optional[int]:
""" Download diffs starting with sequence id `start_id`, merge them
together and then apply them to handler `handler`. `max_size`
restricts the number of diffs that are downloaded. The download
stops as soon as either a diff cannot be downloaded or the
unpacked data in memory exceeds `max_size` kB.
together and then apply them to handler `handler`. `end_id`
optionally gives the highest sequence id to download. `max_size`
allows restricting the number of diffs that are downloaded.
Downloaded diffs are temporarily saved in memory and this parameter
ensures that pyosmium doesn't run out of memory. `max_size`
is the maximum size in kB this internal buffer may have.

If neither `end_id` nor `max_size` are given, the download is
restricted to a maximum size of 1MB. The download also
stops when the most recent diff has been processed.

If `idx` is set, a location cache will be created and applied to
the way nodes. You should be aware that diff files usually do not
Expand All @@ -197,7 +221,7 @@ def apply_diffs(self, handler: BaseHandler, start_id: int,
The function returns the sequence id of the last diff that was
downloaded or None if the download failed completely.
"""
diffs = self.collect_diffs(start_id, max_size)
diffs = self.collect_diffs(start_id, end_id=end_id, max_size=max_size)

if diffs is None:
return None
Expand All @@ -206,19 +230,26 @@ def apply_diffs(self, handler: BaseHandler, start_id: int,

return diffs.id

def apply_diffs_to_file(self, infile: str, outfile: str,
start_id: int, max_size: int = 1024,
def apply_diffs_to_file(self, infile: str, outfile: str, start_id: int,
max_size: Optional[int] = None,
set_replication_header: bool = True,
extra_headers: Optional[Mapping[str, str]] = None,
outformat: Optional[str] = None) -> Optional[Tuple[int, int]]:
outformat: Optional[str] = None,
end_id: Optional[int] = None) -> Optional[Tuple[int, int]]:
""" Download diffs starting with sequence id `start_id`, merge them
with the data from the OSM file named `infile` and write the result
into a file with the name `outfile`. The output file must not yet
exist.

`max_size` restricts the number of diffs that are downloaded. The
download stops as soon as either a diff cannot be downloaded or the
unpacked data in memory exceeds `max_size` kB.
`end_id` optionally gives the highest sequence id to download.
`max_size` allows restricting the number of diffs that are
downloaded. Downloaded diffs are saved in memory and this parameter
ensures that pyosmium doesn't run out of memory. `max_size`
is the maximum size in kB this internal buffer may have.

If neither `end_id` nor `max_size` are given, the
download is restricted to a maximum size of 1MB. The download also
stops when the most recent diff has been processed.

If `set_replication_header` is true then the URL of the replication
server and the sequence id and timestamp of the last diff applied
Expand All @@ -235,7 +266,7 @@ def apply_diffs_to_file(self, infile: str, outfile: str,
newest available sequence id if new data has been written or None
if no data was available or the download failed completely.
"""
diffs = self.collect_diffs(start_id, max_size)
diffs = self.collect_diffs(start_id, end_id=end_id, max_size=max_size)

if diffs is None:
return None
Expand Down
92 changes: 92 additions & 0 deletions src/osmium/tools/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# SPDX-License-Identifier: BSD-2-Clause
#
# This file is part of pyosmium. (https://osmcode.org/pyosmium/)
#
# Copyright (C) 2025 Sarah Hoffmann <lonvia@denofr.de> and others.
# For a full list of authors see the git log.
from typing import Optional
import logging
from dataclasses import dataclass
import datetime as dt
from argparse import ArgumentTypeError

from ..replication import newest_change_from_file
from ..replication.server import ReplicationServer
from ..replication.utils import get_replication_header


log = logging.getLogger()


@dataclass
class ReplicationStart:
    """ Represents the point where changeset download should begin.

        Exactly one of `date` or `seq_id` is expected to be set; `source`
        optionally records the replication URL the information came from.
    """
    # Timestamp from which to start downloading (mutually exclusive with seq_id).
    date: Optional[dt.datetime] = None
    # Sequence ID from which to start downloading (-1 means "from the beginning").
    seq_id: Optional[int] = None
    # URL of the replication server the start point was derived from, if known.
    source: Optional[str] = None

    def get_sequence(self, svr: ReplicationServer) -> Optional[int]:
        """ Return the first sequence ID to download from server `svr`.

            When a sequence ID was given explicitly, the download starts
            with the following ID. Otherwise the ID is looked up from the
            start date. May return None when the lookup fails.
        """
        if self.seq_id is not None:
            # Lazy %-style args: the message is only formatted when
            # debug logging is actually enabled.
            log.debug("Using given sequence ID %d", self.seq_id)
            return self.seq_id + 1

        assert self.date is not None
        log.debug("Looking up sequence ID for timestamp %s", self.date)
        return svr.timestamp_to_sequence(self.date)

    def get_end_sequence(self, svr: ReplicationServer) -> Optional[int]:
        """ Return the last sequence ID to download from server `svr`.

            Unlike get_sequence(), an explicitly given sequence ID is used
            as-is (inclusive end). Otherwise the ID is looked up from the
            date. May return None when the lookup fails.
        """
        if self.seq_id is not None:
            log.debug("Using end sequence ID %d", self.seq_id)
            return self.seq_id

        assert self.date is not None
        log.debug("Looking up end sequence ID for timestamp %s", self.date)
        return svr.timestamp_to_sequence(self.date)

    @staticmethod
    def from_id(idstr: str) -> 'ReplicationStart':
        """ Create a ReplicationStart from a sequence ID string.

            Raises argparse.ArgumentTypeError when the string is not a
            number or is smaller than -1 (-1 itself is accepted as a
            "start from the beginning" marker).
        """
        try:
            seq_id = int(idstr)
        except ValueError as e:
            raise ArgumentTypeError("Sequence id '%s' is not a number" % idstr) from e

        if seq_id < -1:
            raise ArgumentTypeError("Sequence id '%s' is negative" % idstr)

        return ReplicationStart(seq_id=seq_id)

    @staticmethod
    def from_date(datestr: str) -> 'ReplicationStart':
        """ Create a ReplicationStart from an ISO8601 date string
            (e.g. 2015-12-24T08:08:08Z). The date is interpreted as UTC.

            Raises argparse.ArgumentTypeError for unparsable dates.
        """
        try:
            date = dt.datetime.strptime(datestr, "%Y-%m-%dT%H:%M:%SZ")
            date = date.replace(tzinfo=dt.timezone.utc)
        except ValueError as e:
            raise ArgumentTypeError(
                "Date needs to be in ISO8601 format (e.g. 2015-12-24T08:08:08Z).") from e

        return ReplicationStart(date=date)

    @staticmethod
    def from_osm_file(fname: str, ignore_headers: bool) -> 'ReplicationStart':
        """ Create a ReplicationStart from the contents of an OSM file.

            The replication headers of the file are used when present
            (unless `ignore_headers` is set). Otherwise the timestamp of
            the newest OSM object in the file is used as the start date.

            Raises argparse.ArgumentTypeError when the file cannot be
            read or contains no usable data.
        """
        if ignore_headers:
            ts = None
            seq = None
            url = None
        else:
            try:
                (url, seq, ts) = get_replication_header(fname)
            except RuntimeError as e:
                raise ArgumentTypeError(e) from e

        if ts is None and seq is None:
            log.debug("OSM file has no replication headers. Looking for newest OSM object.")
            try:
                ts = newest_change_from_file(fname)
            except RuntimeError as e:
                raise ArgumentTypeError(e) from e

        if ts is None:
            raise ArgumentTypeError("OSM file does not seem to contain valid data.")

        return ReplicationStart(seq_id=seq, date=ts, source=url)
Loading