Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ ARGPARSE_BASEARGS=--author 'Sarah Hoffmann' --author-email 'lonvia@denofr.de' --

man:
mkdir -p man
argparse-manpage --pyfile ../src/osmium/tools/pyosmium_get_changes.py --function get_arg_parser ${ARGPARSE_BASEARGS} --output man/pyosmium-get-changes.1
argparse-manpage --pyfile ../src/osmium/tools/pyosmium_up_to_date.py --function get_arg_parser ${ARGPARSE_BASEARGS} --output man/pyosmium-up-to-date.1
argparse-manpage --module osmium.tools.pyosmium_get_changes --function get_arg_parser ${ARGPARSE_BASEARGS} --output man/pyosmium-get-changes.1
argparse-manpage --module osmium.tools.pyosmium_up_to_date --function get_arg_parser ${ARGPARSE_BASEARGS} --output man/pyosmium-up-to-date.1

.PHONY: man
12 changes: 10 additions & 2 deletions docs/man/pyosmium-get-changes.1
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
.TH PYOSMIUM\-GET\-CHANGES "1" "2025\-09\-09" "pyosmium" "Generated Python Manual"
.TH PYOSMIUM\-GET\-CHANGES "1" "2025\-10\-05" "pyosmium" "Generated Python Manual"
.SH NAME
pyosmium\-get\-changes
.SH SYNOPSIS
Expand Down Expand Up @@ -56,7 +56,7 @@ Netscape\-style cookie jar file to read cookies from and where received cookies

.TP
\fB\-s\fR \fI\,OUTSIZE\/\fR, \fB\-\-size\fR \fI\,OUTSIZE\/\fR
Maximum data to load in MB (default: 100MB).
Maximum data to load in MB (Defaults to 100MB when no end date/ID has been set).

.TP
\fB\-I\fR \fI\,ID\/\fR, \fB\-\-start\-id\fR \fI\,ID\/\fR
Expand All @@ -70,6 +70,14 @@ Date when to start updates
\fB\-O\fR \fI\,OSMFILE\/\fR, \fB\-\-start\-osm\-data\fR \fI\,OSMFILE\/\fR
start at the date of the newest OSM object in the file

.TP
\fB\-\-end\-id\fR \fI\,ID\/\fR
Last sequence ID to download.

.TP
\fB\-E\fR \fI\,DATE\/\fR, \fB\-\-end\-date\fR \fI\,DATE\/\fR
Do not download diffs later than the given date.

.TP
\fB\-f\fR \fI\,SEQ_FILE\/\fR, \fB\-\-sequence\-file\fR \fI\,SEQ_FILE\/\fR
Sequence file. If the file exists, then updates will start after the id given in the file. At the end of the process, the last sequence ID contained in the diff is written.
Expand Down
16 changes: 14 additions & 2 deletions docs/man/pyosmium-up-to-date.1
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
.TH PYOSMIUM\-UP\-TO\-DATE "1" "2025\-09\-09" "pyosmium" "Generated Python Manual"
.TH PYOSMIUM\-UP\-TO\-DATE "1" "2025\-10\-05" "pyosmium" "Generated Python Manual"
.SH NAME
pyosmium\-up\-to\-date
.SH SYNOPSIS
Expand Down Expand Up @@ -56,9 +56,21 @@ Format the data should be saved in. Usually determined from file name.
\fB\-\-server\fR \fI\,SERVER_URL\/\fR
Base URL of the replication server. Default: https://planet.osm.org/replication/hour/ (hourly diffs from osm.org)

.TP
\fB\-\-diff\-type\fR \fI\,SERVER_DIFF_TYPE\/\fR
File format used by the replication server (default: osc.gz)

.TP
\fB\-s\fR \fI\,SIZE\/\fR, \fB\-\-size\fR \fI\,SIZE\/\fR
Maximum size of change to apply at once in MB. Default: 1GB
Maximum size of change to apply at once in MB. Defaults to 1GB when no end ID or date was given.

.TP
\fB\-\-end\-id\fR \fI\,ID\/\fR
Last sequence ID to download.

.TP
\fB\-E\fR \fI\,DATE\/\fR, \fB\-\-end\-date\fR \fI\,DATE\/\fR
Do not download diffs later than the given date.

.TP
\fB\-\-tmpdir\fR \fI\,TMPDIR\/\fR
Expand Down
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -101,3 +101,6 @@ include = ['/src/**/*.py',
'/contrib/protozero/LICENSE',
'/contrib/protozero/README.md',
]

[tool.pytest.ini_options]
log_cli = false
89 changes: 60 additions & 29 deletions src/osmium/replication/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# For a full list of authors see the git log.
""" Helper functions to communicate with replication servers.
"""
from typing import NamedTuple, Optional, Any, Iterator, cast, Mapping, Tuple
from typing import NamedTuple, Optional, Any, Iterator, cast, Mapping, Tuple, Dict
import urllib.request as urlrequest
from urllib.error import URLError
import datetime as dt
Expand Down Expand Up @@ -67,7 +67,7 @@ def __init__(self, url: str, diff_type: str = 'osc.gz') -> None:

self.baseurl = url
self.diff_type = diff_type
self.extra_request_params: dict[str, Any] = dict(timeout=60, stream=True)
self.extra_request_params: Dict[str, Any] = dict(timeout=60, stream=True)
self.session: Optional[requests.Session] = None
self.retry = Retry(total=3, backoff_factor=0.5, allowed_methods={'GET'},
status_forcelist=[408, 429, 500, 502, 503, 504])
Expand Down Expand Up @@ -125,12 +125,19 @@ def _get_url_with_session() -> Iterator[requests.Response]:

return _get_url_with_session()

def collect_diffs(self, start_id: int, max_size: int = 1024) -> Optional[DownloadResult]:
def collect_diffs(self, start_id: int, max_size: Optional[int] = None,
end_id: Optional[int] = None) -> Optional[DownloadResult]:
""" Create a MergeInputReader and download diffs starting with sequence
id `start_id` into it. `max_size`
restricts the number of diffs that are downloaded. The download
stops as soon as either a diff cannot be downloaded or the
unpacked data in memory exceeds `max_size` kB.
id `start_id` into it. `end_id` optionally gives the highest
sequence number to download. `max_size` restricts the number of
diffs that are downloaded by size. If neither `end_id` nor
`max_size` is given, then the download stops by default after 1MB.

The download stops as soon as
1. a diff cannot be downloaded or
2. the end_id (inclusive) is reached or
3. the unpacked data in memory exceeds `max_size` kB or,
when no `end_id` and `max_size` are given, 1024kB.

If some data was downloaded, returns a namedtuple with three fields:
`id` contains the sequence id of the last downloaded diff, `reader`
Expand All @@ -140,19 +147,25 @@ def collect_diffs(self, start_id: int, max_size: int = 1024) -> Optional[Downloa
Returns None if there was an error during download or no new
data was available.
"""
left_size = max_size * 1024
current_id = start_id

# must not read data newer than the published sequence id
# or we might end up reading partial data
newest = self.get_state_info()

if newest is None or current_id > newest.sequence:
if newest is None or start_id > newest.sequence:
return None

current_id = start_id
left_size: Optional[int] = None
if max_size is not None:
left_size = max_size * 1024
elif end_id is None:
left_size = 1024 * 1024

rd = MergeInputReader()

while left_size > 0 and current_id <= newest.sequence:
while (left_size is None or left_size > 0) \
and (end_id is None or current_id <= end_id) \
and current_id <= newest.sequence:
try:
diffdata = self.get_diff_block(current_id)
except: # noqa: E722
Expand All @@ -163,21 +176,32 @@ def collect_diffs(self, start_id: int, max_size: int = 1024) -> Optional[Downloa
return None
break

left_size -= rd.add_buffer(diffdata, self.diff_type)
LOG.debug("Downloaded change %d. (%d kB available in download buffer)",
current_id, left_size / 1024)
diff_size = rd.add_buffer(diffdata, self.diff_type)
if left_size is None:
LOG.debug("Downloaded change %d.", current_id)
else:
left_size -= diff_size
LOG.debug("Downloaded change %d. (%d kB available in download buffer)",
current_id, left_size / 1024)
current_id += 1

return DownloadResult(current_id - 1, rd, newest.sequence)

def apply_diffs(self, handler: BaseHandler, start_id: int,
max_size: int = 1024, idx: str = "",
simplify: bool = True) -> Optional[int]:
max_size: Optional[int] = None,
idx: str = "", simplify: bool = True,
end_id: Optional[int] = None) -> Optional[int]:
""" Download diffs starting with sequence id `start_id`, merge them
together and then apply them to handler `handler`. `max_size`
restricts the number of diffs that are downloaded. The download
stops as soon as either a diff cannot be downloaded or the
unpacked data in memory exceeds `max_size` kB.
together and then apply them to handler `handler`. `end_id`
optionally gives the highest sequence id to download. `max_size`
allows restricting the number of diffs that are downloaded.
Downloaded diffs are temporarily saved in memory and this parameter
ensures that pyosmium doesn't run out of memory. `max_size`
is the maximum size in kB this internal buffer may have.

If neither `end_id` nor `max_size` are given, the download is
restricted to a maximum size of 1MB. The download also
stops when the most recent diff has been processed.

If `idx` is set, a location cache will be created and applied to
the way nodes. You should be aware that diff files usually do not
Expand All @@ -197,7 +221,7 @@ def apply_diffs(self, handler: BaseHandler, start_id: int,
The function returns the sequence id of the last diff that was
downloaded or None if the download failed completely.
"""
diffs = self.collect_diffs(start_id, max_size)
diffs = self.collect_diffs(start_id, end_id=end_id, max_size=max_size)

if diffs is None:
return None
Expand All @@ -206,19 +230,26 @@ def apply_diffs(self, handler: BaseHandler, start_id: int,

return diffs.id

def apply_diffs_to_file(self, infile: str, outfile: str,
start_id: int, max_size: int = 1024,
def apply_diffs_to_file(self, infile: str, outfile: str, start_id: int,
max_size: Optional[int] = None,
set_replication_header: bool = True,
extra_headers: Optional[Mapping[str, str]] = None,
outformat: Optional[str] = None) -> Optional[Tuple[int, int]]:
outformat: Optional[str] = None,
end_id: Optional[int] = None) -> Optional[Tuple[int, int]]:
""" Download diffs starting with sequence id `start_id`, merge them
with the data from the OSM file named `infile` and write the result
into a file with the name `outfile`. The output file must not yet
exist.

`max_size` restricts the number of diffs that are downloaded. The
download stops as soon as either a diff cannot be downloaded or the
unpacked data in memory exceeds `max_size` kB.
`end_id` optionally gives the highest sequence id to download.
`max_size` allows restricting the number of diffs that are
downloaded. Downloaded diffs are saved in memory and this parameter
ensures that pyosmium doesn't run out of memory. `max_size`
is the maximum size in kB this internal buffer may have.

If neither `end_id` nor `max_size` are given, the
download is restricted to a maximum size of 1MB. The download also
stops when the most recent diff has been processed.

If `set_replication_header` is true then the URL of the replication
server and the sequence id and timestamp of the last diff applied
Expand All @@ -235,7 +266,7 @@ def apply_diffs_to_file(self, infile: str, outfile: str,
newest available sequence id if new data has been written or None
if no data was available or the download failed completely.
"""
diffs = self.collect_diffs(start_id, max_size)
diffs = self.collect_diffs(start_id, end_id=end_id, max_size=max_size)

if diffs is None:
return None
Expand Down
92 changes: 92 additions & 0 deletions src/osmium/tools/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# SPDX-License-Identifier: BSD-2-Clause
#
# This file is part of pyosmium. (https://osmcode.org/pyosmium/)
#
# Copyright (C) 2025 Sarah Hoffmann <lonvia@denofr.de> and others.
# For a full list of authors see the git log.
from typing import Optional
import logging
from dataclasses import dataclass
import datetime as dt
from argparse import ArgumentTypeError

from ..replication import newest_change_from_file
from ..replication.server import ReplicationServer
from ..replication.utils import get_replication_header


log = logging.getLogger()


@dataclass
class ReplicationStart:
    """ Represents the point where changeset download should begin.

        Exactly one of `date` or `seq_id` is expected to be set; `source`
        optionally records the replication URL the information came from.
    """
    # Timestamp from which to start downloading (mutually exclusive with seq_id).
    date: Optional[dt.datetime] = None
    # Sequence ID from which to start downloading (-1 means "from the beginning").
    seq_id: Optional[int] = None
    # URL of the replication server the start point was derived from, if known.
    source: Optional[str] = None

    def get_sequence(self, svr: ReplicationServer) -> Optional[int]:
        """ Return the first sequence ID to download from server `svr`.

            When a sequence ID was given explicitly, the download starts
            with the following ID. Otherwise the ID is looked up from the
            start date. May return None when the lookup fails.
        """
        if self.seq_id is not None:
            # Lazy %-style args: the message is only formatted when
            # debug logging is actually enabled.
            log.debug("Using given sequence ID %d", self.seq_id)
            return self.seq_id + 1

        assert self.date is not None
        log.debug("Looking up sequence ID for timestamp %s", self.date)
        return svr.timestamp_to_sequence(self.date)

    def get_end_sequence(self, svr: ReplicationServer) -> Optional[int]:
        """ Return the last sequence ID to download from server `svr`.

            Unlike get_sequence(), an explicitly given sequence ID is used
            as-is (inclusive end). Otherwise the ID is looked up from the
            date. May return None when the lookup fails.
        """
        if self.seq_id is not None:
            log.debug("Using end sequence ID %d", self.seq_id)
            return self.seq_id

        assert self.date is not None
        log.debug("Looking up end sequence ID for timestamp %s", self.date)
        return svr.timestamp_to_sequence(self.date)

    @staticmethod
    def from_id(idstr: str) -> 'ReplicationStart':
        """ Create a ReplicationStart from a sequence ID string.

            Raises argparse.ArgumentTypeError when the string is not a
            number or is smaller than -1 (-1 itself is accepted as a
            "start from the beginning" marker).
        """
        try:
            seq_id = int(idstr)
        except ValueError as e:
            raise ArgumentTypeError("Sequence id '%s' is not a number" % idstr) from e

        if seq_id < -1:
            raise ArgumentTypeError("Sequence id '%s' is negative" % idstr)

        return ReplicationStart(seq_id=seq_id)

    @staticmethod
    def from_date(datestr: str) -> 'ReplicationStart':
        """ Create a ReplicationStart from an ISO8601 date string
            (e.g. 2015-12-24T08:08:08Z). The date is interpreted as UTC.

            Raises argparse.ArgumentTypeError for unparsable dates.
        """
        try:
            date = dt.datetime.strptime(datestr, "%Y-%m-%dT%H:%M:%SZ")
            date = date.replace(tzinfo=dt.timezone.utc)
        except ValueError as e:
            raise ArgumentTypeError(
                "Date needs to be in ISO8601 format (e.g. 2015-12-24T08:08:08Z).") from e

        return ReplicationStart(date=date)

    @staticmethod
    def from_osm_file(fname: str, ignore_headers: bool) -> 'ReplicationStart':
        """ Create a ReplicationStart from the contents of an OSM file.

            The replication headers of the file are used when present
            (unless `ignore_headers` is set). Otherwise the timestamp of
            the newest OSM object in the file is used as the start date.

            Raises argparse.ArgumentTypeError when the file cannot be
            read or contains no usable data.
        """
        if ignore_headers:
            ts = None
            seq = None
            url = None
        else:
            try:
                (url, seq, ts) = get_replication_header(fname)
            except RuntimeError as e:
                raise ArgumentTypeError(e) from e

        if ts is None and seq is None:
            log.debug("OSM file has no replication headers. Looking for newest OSM object.")
            try:
                ts = newest_change_from_file(fname)
            except RuntimeError as e:
                raise ArgumentTypeError(e) from e

        if ts is None:
            raise ArgumentTypeError("OSM file does not seem to contain valid data.")

        return ReplicationStart(seq_id=seq, date=ts, source=url)
Loading