-
Notifications
You must be signed in to change notification settings - Fork 530
/
quality_control.py
624 lines (518 loc) · 23.8 KB
/
quality_control.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
# -*- coding: utf-8 -*-
"""
Quality control module for ObsPy.
Currently requires MiniSEED files as that is the dominant data format in
data centers. Please see the documentation of the
:class:`~obspy.signal.quality_control.MSEEDMetadata` class for usage
instructions.
:authors:
Luca Trani (trani@knmi.nl)
Lion Krischer (krischer@geophysik.uni-muenchen.de)
Mathijs Koymans (koymans@knmi.nl)
:copyright:
The ObsPy Development Team (devs@obspy.org)
:license:
GNU Lesser General Public License, Version 3
(http://www.gnu.org/copyleft/lesser.html)
"""
import io
import json
from operator import attrgetter
from pathlib import Path
from uuid import uuid4
import numpy as np
from obspy import Stream, UTCDateTime, read, __version__
from obspy.core.compatibility import collections_abc
from obspy.core.util.base import get_dependency_version
from obspy.io.mseed.util import get_flags
_PRODUCER = "ObsPy %s" % __version__
class DataQualityEncoder(json.JSONEncoder):
"""
Custom encoder capable of dealing with NumPy and ObsPy types.
"""
def default(self, obj):
if isinstance(obj, np.integer):
return int(obj)
elif isinstance(obj, np.floating):
return float(obj)
elif isinstance(obj, np.ndarray):
return obj.tolist()
elif isinstance(obj, UTCDateTime):
return str(obj)
else:
return super(DataQualityEncoder, self).default(obj)
class MSEEDMetadata(object):
"""
A container for MiniSEED specific metadata, including quality control
parameters.
Reads the MiniSEED files and extracts the data quality metrics. All
MiniSEED files must have a matching stream ID and quality.
:param files: One ore more MiniSEED files.
:type files: str or list[str]
:type id: str, optional
:param id: A unique identifier of the to be created QC object. It is
not verified, that it actually is unique. The user has to take care of
that. If no id is given, uuid.uuid4() will be used to
create one which assures uniqueness within one Python run.
If no fixed id is provided, the ID will be built from prefix
and a random uuid hash.
:type prefix: str, optional
:param prefix: An optional identifier that will be put in front of any
automatically created id. The prefix will only have an effect
if `id` is not specified (for a fixed ID string).
:param starttime: Only use records whose end time is larger then this
given time. Also specifies the new official start time of the
metadata object.
:type starttime: :class:`obspy.core.utcdatetime.UTCDateTime`
:param endtime: Only use records whose start time is smaller then this
given time. Also specifies the new official end time of the
metadata object
:type endtime: :class:`obspy.core.utcdatetime.UTCDateTime`
:param add_c_segments: Calculate metrics for each continuous segment.
:type add_c_segments: bool
:param add_flags: Include MiniSEED header statistics in result.
:type add_flags: bool
:param waveform_type: The type of waveform data, e.g. ``"seismic"``,
``"infrasound"``, ...
:type waveform_type: str
.. rubric:: Example
>>> from obspy.signal.quality_control import
... MSEEDMetadata #doctest: +SKIP
>>> mseedqc = MSEEDMetadata(['path/to/file',
... 'path/to/file2']) # doctest: +SKIP
The class requires a list of files for calculating metrics.
Add optional parameters ``starttime="YYYY-MM-DDThh:mm:ss`` and
``endtime="YYYY-MM-DDThh:mm:ss"`` or ``obspy.core.utcdatetime.UTCDateTime``
to limit metric calculation to this window. Continuous segments are
returned when ``add_c_segments=True`` and MiniSEED header flags information
is returned when ``add_flags=True``.
The calculated metrics are then available in the ``.meta`` dictionary.
>>> mseedqc.meta # doctest: +SKIP
This is intended to be serialized as JSON. Retrieve the JSON string (to
for example store it in a database or save to a file) with:
>>> mseedqc.get_json_meta() #doctest: +SKIP
"""
def __init__(self, files, id=None, prefix="smi:local/qc",
starttime=None, endtime=None,
add_c_segments=True, add_flags=False,
waveform_type="seismic"):
"""
Reads the MiniSEED files and extracts the data quality metrics.
"""
self.data = Stream()
self.all_files = files
self.files = []
# Allow anything UTCDateTime can parse.
if starttime is not None:
starttime = UTCDateTime(starttime)
if endtime is not None:
endtime = UTCDateTime(endtime)
self.window_start = starttime
self.window_end = endtime
# We are required to exclude samples at T1. Therefore, shift the
# time window to the left by 1μs and set nearest_sample to False.
# This will force ObsPy to fall back to the sample left of the endtime
if endtime is not None:
endtime_left = endtime - 1e-6
else:
endtime_left = None
# Will raise if not a MiniSEED files.
for file in files:
st = read(file, starttime=starttime, endtime=endtime_left,
format="mseed", nearest_sample=False)
# Empty stream or maybe there is no data in the stream for the
# requested time span.
if not st:
continue
self.files.append(file)
# Only extend traces with data (npts > 0)
for tr in st:
if tr.stats.npts != 0:
self.data.extend([tr])
if not self.data:
raise ValueError("No data within the temporal constraints.")
# Do some sanity checks. The class only works with data from a
# single location so we have to make sure that the existing data on
# this object and the newly added all have the same identifier.
ids = [tr.id + "." + str(tr.stats.mseed.dataquality) for tr in
self.data]
if len(set(ids)) != 1:
raise ValueError("All traces must have the same SEED id and "
"quality.")
# Get the last sample and add delta
final_trace = max(self.data, key=attrgetter('stats.endtime')).stats
self.endtime = endtime or final_trace.endtime + final_trace.delta
self.data.sort()
# Set the metric start and endtime specified by the user.
# If no start and endtime are given, we pick our own, and the window
# will start on the first sample and end on the last sample + Δt.
# This is conform to the definition of [T0, T1).
self.starttime = starttime or self.data[0].stats.starttime
self.total_time = self.endtime - self.starttime
if id is None:
id = prefix + "/" + str(uuid4())
# Fill with the meta information.
self.meta = {
"wfmetadata_id": id,
"producer": _PRODUCER,
"waveform_type": waveform_type,
"waveform_format": "miniSEED",
"version": "1.0.0"
}
# Get sample left of the user specified starttime
# This will allow us to determine start continuity in our window
self._get_gaps_and_overlaps()
# The calculation of all the metrics begins here
self._extract_mseed_stream_metadata()
self._compute_sample_metrics()
if add_flags:
self._extract_mseed_flags()
if add_c_segments:
self._compute_continuous_seg_sample_metrics()
def _get_gaps_and_overlaps(self):
"""
Function to get all gaps and overlaps in the user
specified (or forced) window.
"""
self.all_data = Stream()
body_gap = []
body_overlap = []
# Read all the files entirely and calculate gaps and overlaps
# for the entire segment. Later we will narrow it to our window if
# it has been specified
for file in self.all_files:
self.all_data.extend(read(file, format="mseed", headonly=True))
# Sort the data by so the start times are in order
self.all_data.sort()
# Coverage keeps track of the time used
coverage = None
for trace in self.all_data:
# Extend the endtime of a trace with delta
trace_end = trace.stats.endtime + trace.stats.delta
trace_start = trace.stats.starttime
# If a start boundary has been specified
if self.window_start is not None:
if trace_end <= self.window_start:
continue
# In case the start time of a trace comes before the window
# extend the length to the window start
cut_trace_start = max(trace_start, self.window_start)
else:
cut_trace_start = trace_start
# If a end boundary has been specified
if self.window_end is not None:
if trace_start > self.window_end:
continue
# In case the end time of a trace comes after the window
# reduce the length to the window end
cut_trace_end = min(trace_end, self.window_end)
else:
cut_trace_end = trace_end
# Calculate the trace time tolerance as 0.5 * delta
time_tolerance_max = trace_end + 0.5 * trace.stats.delta
time_tolerance_min = trace_end - 0.5 * trace.stats.delta
# Set the initial trace coverage and proceed to the next trace
if coverage is None:
coverage = {
'start': trace_start,
'end': trace_end,
'end_min': time_tolerance_min,
'end_max': time_tolerance_max
}
continue
# Check if the start time of a trace falls
# beyond covered end max (time tolerance)
# this must be interpreted as a gap
if trace_start > coverage['end_max']:
body_gap.append(cut_trace_start - coverage['end'])
# Check overlap in coverage
# the overlap ends at the end of the trace
# or add the end of the coverage
if trace_start <= coverage['end_min']:
min_end = min(cut_trace_end, coverage['end'])
body_overlap.append(min_end - cut_trace_start)
# Extend the coverage of the trace
# the start coverage remains unchanged
if trace_end > coverage['end']:
coverage['end'] = trace_end
coverage['end_min'] = time_tolerance_min
coverage['end_max'] = time_tolerance_max
# We have the coverage by the traces
# check if there is an end or start gap caused by the
# window forced by the user
self.meta['start_gap'] = None
self.meta['end_gap'] = None
if self.window_start is not None:
if coverage['start'] > self.window_start:
self.meta['start_gap'] = coverage['start'] - self.window_start
body_gap.append(self.meta['start_gap'])
if self.window_end is not None:
if coverage['end'] < self.window_end:
self.meta['end_gap'] = self.window_end - coverage['end']
body_gap.append(self.meta['end_gap'])
# Set the gap and overlap information
self.meta['num_gaps'] = len(body_gap)
self.meta['sum_gaps'] = sum(body_gap)
if len(body_gap) != 0:
self.meta['max_gap'] = max(body_gap)
else:
self.meta['max_gap'] = None
self.meta['num_overlaps'] = len(body_overlap)
self.meta['sum_overlaps'] = sum(body_overlap)
if len(body_overlap) != 0:
self.meta['max_overlap'] = max(body_overlap)
else:
self.meta['max_overlap'] = None
@property
def number_of_records(self):
"""
Number of records across files before slicing.
"""
return sum(tr.stats.mseed.number_of_records for tr in self.data)
@property
def number_of_samples(self):
"""
Number of samples across files.
"""
return sum(tr.stats.npts for tr in self.data)
def _extract_mseed_stream_stats(self):
"""
Small function to collects the mSEED stats
"""
stats = self.data[0].stats
self.meta['network'] = stats.network
self.meta['station'] = stats.station
self.meta['location'] = stats.location
self.meta['channel'] = stats.channel
self.meta['quality'] = stats.mseed.dataquality
def _extract_mseed_stream_metadata(self):
"""
Collect information from the MiniSEED headers.
"""
self._extract_mseed_stream_stats()
meta = self.meta
# Save first and last sample of the trace
# Look for the maximum endtime and minimum starttime in case
# traces are not in order.
meta['first_sample'] = min(tr.stats.starttime for tr in self.data)
meta['last_sample'] = max(tr.stats.endtime for tr in self.data)
# Add some other parameters to the metadata object
meta['seed_id'] = self.data[0].id
meta['files'] = self.files
meta['start_time'] = self.starttime
meta['end_time'] = self.endtime
meta['num_samples'] = self.number_of_samples
# The number records as given by Trace.stats is always
# the full number of records in a file, regardless of being
# sliced between a start & endtime
# If a start/endtime is specified, we cannot be sure of
# the # records. We will add this parameter later
# if add_flags is set to true after looping all records
if self.window_start is None and self.window_end is None:
meta['num_records'] = self.number_of_records
else:
meta['num_records'] = None
# The following are lists and may contain multiple unique entries.
meta['sample_rate'] = \
sorted(list(set([tr.stats.sampling_rate for tr in self.data])))
meta['record_length'] = \
sorted(list(set([tr.stats.mseed.record_length
for tr in self.data])))
meta['encoding'] = \
sorted(list(set([tr.stats.mseed.encoding for tr in self.data])))
def _extract_mseed_flags(self):
flags = get_flags(self.files, starttime=self.starttime,
endtime=self.endtime)
data_quality_flags_seconds = flags["data_quality_flags_percentages"]
activity_flags_seconds = flags["activity_flags_percentages"]
io_and_clock_flags_seconds = flags["io_and_clock_flags_percentages"]
timing_correction = flags["timing_correction"]
# Only calculate the timing quality statistics if each files has the
# timing quality set. This should usually be the case. Otherwise we
# would created tinted statistics. There is still a chance that some
# records in a file have timing qualities set and others not but
# that should be small.
if flags["timing_quality"]:
tq = flags["timing_quality"]
timing_quality_mean = tq["mean"]
timing_quality_min = tq["min"]
timing_quality_max = tq["max"]
timing_quality_median = tq["median"]
timing_quality_lower_quartile = tq["lower_quartile"]
timing_quality_upper_quartile = tq["upper_quartile"]
else:
timing_quality_mean = None
timing_quality_min = None
timing_quality_max = None
timing_quality_median = None
timing_quality_lower_quartile = None
timing_quality_upper_quartile = None
meta = self.meta
meta['num_records'] = flags['record_count']
# Set MiniSEED header counts
meta['miniseed_header_counts'] = {}
ref = meta['miniseed_header_counts']
ref['timing_correction'] = flags['timing_correction_count']
ref['activity_flags'] = flags['activity_flags_counts']
ref['io_and_clock_flags'] = flags['io_and_clock_flags_counts']
ref['data_quality_flags'] = flags['data_quality_flags_counts']
# Set MiniSEED header percentages
meta['miniseed_header_percentages'] = {}
ref = meta['miniseed_header_percentages']
ref['timing_correction'] = timing_correction
ref['timing_quality_mean'] = timing_quality_mean
ref['timing_quality_min'] = timing_quality_min
ref['timing_quality_max'] = timing_quality_max
ref['timing_quality_median'] = timing_quality_median
ref['timing_quality_lower_quartile'] = timing_quality_lower_quartile
ref['timing_quality_upper_quartile'] = timing_quality_upper_quartile
# According to schema @ maybe refactor this to less verbose flag
# names. Sets MiniSEED header flag percentages
ref['activity_flags'] = activity_flags_seconds
ref['data_quality_flags'] = data_quality_flags_seconds
ref['io_and_clock_flags'] = io_and_clock_flags_seconds
def _compute_sample_metrics(self):
"""
Computes metrics on samples contained in the specified time window
"""
# Make sure there is no integer division by chance.
npts = float(self.number_of_samples)
self.meta['sample_min'] = min([tr.data.min() for tr in self.data])
self.meta['sample_max'] = max([tr.data.max() for tr in self.data])
# Manually implement these as they have to work across a list of
# arrays.
self.meta['sample_mean'] = \
sum(tr.data.sum() for tr in self.data) / npts
full_samples = np.concatenate([tr.data for tr in self.data])
self.meta['sample_median'] = np.median(full_samples)
self.meta['sample_lower_quartile'] = np.percentile(full_samples, 25)
self.meta['sample_upper_quartile'] = np.percentile(full_samples, 75)
# Might overflow np.int64 so make Python obj. (.astype(object))
# allows conversion to long int when required (see tests)
self.meta['sample_rms'] = \
np.sqrt(sum((tr.data.astype(object) ** 2).sum()
for tr in self.data) / npts)
self.meta['sample_stdev'] = np.sqrt(sum(
((tr.data - self.meta["sample_mean"]) ** 2).sum()
for tr in self.data) / npts)
# Percentage based availability as a function of total gap length
# over the full trace duration
self.meta['percent_availability'] = 100 * (
(self.total_time - self.meta['sum_gaps']) /
self.total_time)
def _compute_continuous_seg_sample_metrics(self):
"""
Computes metrics on the samples within each continuous segment.
"""
if not self.data:
return
if self.meta['start_gap'] is None and self.window_start is not None:
first_segment_start = self.window_start
else:
first_segment_start = self.data[0].stats.starttime
# Collect data in arrays of continuous segments
# Manually set the first segment
c_seg = {
'start': first_segment_start,
'data': self.data[0].data
}
c_segs = []
for i in range(len(self.data)):
trace_end = self.data[i].stats.endtime + self.data[i].stats.delta
time_tolerance = 0.5 * self.data[i].stats.delta
c_seg['s_rate'] = self.data[i].stats.sampling_rate
c_seg['end'] = trace_end
# Final trace, make sure to append
if i == len(self.data) - 1:
c_segs.append(c_seg)
break
trace_offset = abs(self.data[i + 1].stats.starttime - trace_end)
# Check if trace endtime is equal to (trace + 1) starttime
# and if the sampling_rates match, if so, extend the data with
# data from trace + 1 and extend the endtime
# Otherwise the segment stops being continuous and we append it
# and we create a new data segment
if (trace_offset < time_tolerance and
self.data[i + 1].stats.sampling_rate == c_seg['s_rate']):
c_seg['data'] = np.concatenate((c_seg['data'],
self.data[i + 1].data))
c_seg['end'] = self.data[i + 1].stats.endtime + \
self.data[i + 1].stats.delta
else:
c_segs.append(c_seg)
c_seg = {
'data': self.data[i + 1].data,
'start': self.data[i + 1].stats.starttime}
# Set array of continuous segments from this data
self.meta['c_segments'] = [self._parse_c_stats(seg) for seg in c_segs]
def _parse_c_stats(self, tr):
"""
:param tr: custom dictionary with start, end, data, and sampling_rate
of a continuous trace
"""
seg = {}
# Set continuous segments start & end
# limit to specified window start/end if set
if self.window_start is not None:
seg['start_time'] = max(self.window_start, tr['start'])
else:
seg['start_time'] = tr['start']
if self.window_end is not None:
seg['end_time'] = min(self.window_end, tr['end'])
else:
seg['end_time'] = tr['end']
seg['sample_rate'] = tr['s_rate']
seg['sample_min'] = tr['data'].min()
seg['sample_max'] = tr['data'].max()
seg['sample_mean'] = tr['data'].mean()
seg['sample_median'] = np.median(tr['data'])
seg['sample_rms'] = np.sqrt((tr['data'].astype(object) ** 2).sum() /
len(tr['data']))
seg['sample_lower_quartile'] = np.percentile(tr['data'], 25)
seg['sample_upper_quartile'] = np.percentile(tr['data'], 75)
seg['sample_stdev'] = tr['data'].std()
seg['num_samples'] = len(tr['data'])
seg['segment_length'] = seg['end_time'] - seg['start_time']
return seg
def get_json_meta(self, validate=False):
"""
Serialize the meta dictionary to JSON.
:param validate: Validate the JSON string against the schema before
returning.
:type validate: bool
:return: JSON containing the MSEED metadata
"""
meta = json.dumps(self.meta, cls=DataQualityEncoder, indent=4)
if validate:
self.validate_qc_metrics(meta)
return meta
def validate_qc_metrics(self, qc_metrics):
"""
Validate the passed metrics against the JSON schema.
:param qc_metrics: The quality metrics to be validated.
:type qc_metrics: dict, str, or file-like object
"""
import jsonschema
# Judging from the changelog 1.0.0 appears to be the first version
# to have fully working support for references.
_v = get_dependency_version("jsonschema")
if _v < [1, 0, 0]: # pragma: no cover
msg = ("Validating the QC metrics requires jsonschema >= 1.0.0 "
"You have %s. Please update." %
get_dependency_version("jsonschema", raw_string=True))
raise ValueError(msg)
schema_path = Path(__file__).parent/"data"/"wf_metadata_schema.json"
with io.open(schema_path, "rt") as fh:
schema = json.load(fh)
# If passed as a dictionary, serialize and derialize to get the
# mapping from Python object to JSON type.
if isinstance(qc_metrics, collections_abc.Mapping):
qc_metrics = json.loads(self.get_json_meta(validate=False))
elif hasattr(qc_metrics, "read"):
qc_metrics = json.load(qc_metrics)
else:
qc_metrics = json.loads(qc_metrics)
jsonschema.validate(qc_metrics, schema)
if __name__ == '__main__':
import doctest
doctest.testmod(exclude_empty=True)