/
pyosmium-get-changes
executable file
·209 lines (171 loc) · 8.07 KB
/
pyosmium-get-changes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#!/usr/bin/python
"""
Fetch diffs from an OSM planet server.
The starting point of the diff must be given either as a sequence ID or a date
or can be computed from an OSM file. If no output file is given, the program
will just print the initial sequence ID it would use (or save it in a file, if
requested) and exit. This can be used to bootstrap the update process.
The program tries to download until the latest change on the server is found
or the maximum requested diff size is reached. Note that diffs are kept in
memory during download.
On success, the program will print a single number to stdout, the sequence
number where to continue updates in the next run. This output can also be
written to (and later read from) a file.
"""
from argparse import ArgumentParser, RawDescriptionHelpFormatter, ArgumentTypeError
import datetime as dt
from sys import version_info as python_version
from osmium.replication import server as rserv
from osmium.replication import newest_change_from_file
from osmium.replication.utils import get_replication_header
from osmium import SimpleHandler, WriteHandler
import re
import sys
import logging
from textwrap import dedent as msgfmt
log = logging.getLogger()
class ReplicationStart(object):
    """ Describes where the download of changes should start.

        The start may be known as a sequence ID (`seq_id`), as a timestamp
        (`date`) or both. `source` holds the replication URL the information
        was read from, if there is one.
    """

    def __init__(self, date=None, seq_id=None, src=None):
        self.source = src
        self.seq_id = seq_id
        self.date = date

    def get_sequence(self, svr):
        """ Return the sequence ID following the start point.

            A stored sequence ID takes precedence. Without one, the server
            `svr` is asked to translate the stored timestamp. Returns None
            when the server cannot supply a sequence ID for the timestamp.
        """
        if self.seq_id is None:
            log.debug("Looking up sequence ID for timestamp %s" % self.date)
            found = svr.timestamp_to_sequence(self.date)
            if found is None:
                return None
            return found + 1

        log.debug("Using given sequence ID %d" % self.seq_id)
        return self.seq_id + 1

    @staticmethod
    def from_id(idstr):
        """ Create a start point from the string form of a sequence ID.

            Raises ArgumentTypeError when the string is not an integer or
            when the integer is negative.
        """
        try:
            number = int(idstr)
        except ValueError:
            raise ArgumentTypeError("Sequence id '%s' is not a number" % idstr)

        if number >= 0:
            return ReplicationStart(seq_id=number)

        raise ArgumentTypeError("Sequence id '%s' is negative" % idstr)

    @staticmethod
    def from_date(datestr):
        """ Create a start point from a date of the form
            YYYY-MM-DDThh:mm:ssZ.

            Raises ArgumentTypeError when the string has a different format.
        """
        try:
            parsed = dt.datetime.strptime(datestr, "%Y-%m-%dT%H:%M:%SZ")
        except ValueError:
            raise ArgumentTypeError("Date needs to be in ISO8601 format (e.g. 2015-12-24T08:08:08Z).")

        # Timezone objects are only attached when running under Python 3.
        if python_version >= (3, 0):
            parsed = parsed.replace(tzinfo=dt.timezone.utc)

        return ReplicationStart(date=parsed)

    @staticmethod
    def from_osm_file(fname, ignore_headers):
        """ Create a start point from the content of the OSM file `fname`.

            Unless `ignore_headers` is set, the replication header of the
            file is consulted first. When that yields neither a timestamp
            nor a sequence ID, the newest change in the file is used.
            RuntimeErrors from reading the file are re-raised as
            ArgumentTypeError.
        """
        url = seq = ts = None

        if not ignore_headers:
            try:
                (url, seq, ts) = get_replication_header(fname)
            except RuntimeError as e:
                raise ArgumentTypeError(e)

        if seq is None and ts is None:
            log.debug("OSM file has no replication headers. Looking for newest OSM object.")
            try:
                ts = newest_change_from_file(fname)
            except RuntimeError as e:
                raise ArgumentTypeError(e)

            if ts is None:
                raise ArgumentTypeError("OSM file does not seem to contain valid data.")

        return ReplicationStart(seq_id=seq, date=ts, src=url)
def write_end_sequence(fname, seqid):
    """Output the sequence id: into the file `fname` when a name is given,
       otherwise as a line on stdout.
    """
    if fname is not None:
        # Opening in 'w' mode replaces any previous content of the file.
        with open(fname, 'w') as fd:
            fd.write(str(seqid))
        return

    print(seqid)
def get_arg_parser(from_main=False):
    """Create the command line parser for this program.

    The module docstring is used as the description of the program.

    When `from_main` is true, no usage string is set and argparse composes
    its own. Otherwise the fixed usage line 'pyosmium-get-changes [options]'
    is used.

    Returns the configured ArgumentParser.
    """
    def h(s):
        # Collapse every run of two or more whitespace characters into one
        # space, so that help texts may be written as indented multi-line
        # strings. The pattern is a raw string because '\s' is not a valid
        # escape sequence in an ordinary string literal.
        return re.sub(r"\s\s+", " ", s)

    parser = ArgumentParser(description=__doc__,
                            usage=None if from_main else 'pyosmium-get-changes [options]',
                            formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument('-v', dest='loglevel', action='count', default=0,
                        help='Increase verbosity')
    parser.add_argument('-o', '--outfile', dest='outfile',
                        help=h("""Name of diff output file. If omitted, only the
                              sequence ID will be printed where updates would start."""))
    parser.add_argument('--server', action='store', dest='server_url',
                        help='Base URL of the replication server')
    parser.add_argument('-s', '--size', dest='outsize', type=int, default=100,
                        help='Maximum data to load in MB (default: 100MB).')

    # The ways of naming the start point exclude each other. -I and -D share
    # the destination 'start' and are converted while parsing; -O only
    # stores the file name in 'start_file'.
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-I', '--start-id', dest='start',
                       type=ReplicationStart.from_id, metavar='ID',
                       help='Sequence ID to start with')
    group.add_argument('-D', '--start-date', dest='start', metavar='DATE',
                       type=ReplicationStart.from_date,
                       help='Date when to start updates')
    group.add_argument('-O', '--start-osm-data', dest='start_file', metavar='OSMFILE',
                       help='start at the date of the newest OSM object in the file')

    parser.add_argument('-f', '--sequence-file', dest='seq_file',
                        help=h("""Sequence file. If the file exists, then updates
                                  will start after the id given in the file. At the
                                  end of the process, the last sequence ID contained
                                  in the diff is written."""))
    parser.add_argument('--ignore-osmosis-headers', dest='ignore_headers',
                        action='store_true',
                        help=h("""When determining the start from an OSM file,
                                  ignore potential replication information in the
                                  header and search for the newest OSM object."""))
    # 'store_false' means that 'simplify' is True unless -d is given.
    parser.add_argument('-d', '--no-deduplicate', action='store_false', dest='simplify',
                        help='Do not deduplicate and sort diffs.')

    return parser
if __name__ == '__main__':
    # Script entry point: work out the start of the download, then either
    # report the start sequence ID only (no output file given) or download
    # the diffs into the output file and report where the next run should
    # continue.
    logging.basicConfig(stream=sys.stderr,
                        format='%(levelname)s: %(message)s')

    options = get_arg_parser(from_main=True).parse_args()

    # Every -v lowers the threshold by one step: 30 (WARNING) without -v,
    # then 20 (INFO), 10 (DEBUG) and finally 0.
    log.setLevel(max(3 - options.loglevel, 0) * 10)

    if options.start_file is not None:
        options.start = ReplicationStart.from_osm_file(options.start_file,
                                                       options.ignore_headers)

    if options.start is None:
        if options.seq_file is None:
            log.error(msgfmt("""
                Don't know with which change to start. One of the parameters
                -I / -D / -O / -f
                needs to be given."""))
            sys.exit(1)

        # No start point was given explicitly, so continue after the ID
        # saved in the sequence file.
        with open(options.seq_file, 'r') as f:
            seq = f.readline()
            options.start = ReplicationStart.from_id(seq)

    # Refuse to continue when the server given on the command line differs
    # from the replication source recorded with the start point.
    if options.server_url is not None and options.start.source is not None:
        if options.server_url != options.start.source:
            log.error(msgfmt("""
                You asked to use server URL:
                %s
                but the referenced OSM file points to replication server:
                %s
                If you really mean to overwrite the URL, use --ignore-osmosis-headers."""
                % (options.server_url, options.start.source)))
            sys.exit(2)

    # Order of preference: command line, source of the start point, default.
    url = options.server_url \
          or options.start.source \
          or 'https://planet.osm.org/replication/minute/'
    log.info("Using replication server at %s" % url)

    svr = rserv.ReplicationServer(url)

    startseq = options.start.get_sequence(svr)
    if startseq is None:
        log.error("Cannot read state file from server. Is the URL correct?")
        sys.exit(1)

    if options.outfile is None:
        # Without an output file only the start sequence ID is reported.
        write_end_sequence(options.seq_file, startseq)
        sys.exit(0)

    log.debug("Starting download at ID %d (max %d MB)" % (startseq, options.outsize))
    outhandler = WriteHandler(options.outfile)

    try:
        # The size option is given in MB and multiplied by 1024 for max_size.
        endseq = svr.apply_diffs(outhandler, startseq, max_size=options.outsize*1024,
                                 simplify=options.simplify)
    finally:
        # Close the output handler even when the download raises.
        outhandler.close()

    if endseq is None:
        sys.exit(3)

    write_end_sequence(options.seq_file, endseq)