Merge 749a8aa into 2786cb1

obspy · Aug 4, 2016 · 8971429 · 8971429
2 parents 2786cb1 + 749a8aa
commit 8971429
Show file tree

Hide file tree

Showing 10 changed files with 272 additions and 68 deletions.
diff --git a/CHANGELOG.txt b/CHANGELOG.txt
@@ -1,3 +1,7 @@
+1.0.3:
+ - obspy.io.mseed:
+   * ObsPy can now also read (Mini)SEED files with noise records. (see #1495)
+
 1.0.2: (doi: 10.5281/zenodo.49636)
  - obspy.core:
    * Added workaround for numpy issue where many FFTs of various lengths fill

diff --git a/obspy/io/mseed/__init__.py b/obspy/io/mseed/__init__.py
@@ -146,6 +146,14 @@
 from future.builtins import *  # NOQA
 
 
+class InternalMSEEDReadingError(Exception):
+    pass
+
+
+class InternalMSEEDReadingWarning(UserWarning):
+    pass
+
+
 if __name__ == '__main__':
     import doctest
     doctest.testmod(exclude_empty=True)
diff --git a/obspy/io/mseed/core.py b/obspy/io/mseed/core.py
@@ -8,6 +8,7 @@
 from future.utils import native_str
 
 import ctypes as C
+import io
 import os
 import warnings
 from struct import pack
@@ -19,21 +20,13 @@
 from obspy.core.util import NATIVE_BYTEORDER
 from obspy.core.util.deprecation_helpers import (
     DynamicAttributeImportRerouteModule, ObsPyDeprecationWarning)
-from . import util
+from . import util, InternalMSEEDReadingError, InternalMSEEDReadingWarning
 from .headers import (DATATYPES, ENCODINGS, HPTERROR, HPTMODULUS, SAMPLETYPE,
                       SEED_CONTROL_HEADERS, UNSUPPORTED_ENCODINGS,
                       VALID_CONTROL_HEADERS, VALID_RECORD_LENGTHS, Selections,
                       SelectTime, Blkt100S, Blkt1001S, clibmseed)
 
 
-class InternalMSEEDReadingError(Exception):
-    pass
-
-
-class InternalMSEEDReadingWarning(UserWarning):
-    pass
-
-
 def _is_mseed(filename):
     """
     Checks whether a file is Mini-SEED/full SEED or not.
@@ -52,58 +45,104 @@ def _is_mseed(filename):
 
     Thus it cannot be used to validate a Mini-SEED or SEED file.
     """
-    with open(filename, 'rb') as fp:
-        header = fp.read(7)
-        # File has less than 7 characters
-        if len(header) != 7:
-            return False
-        # Sequence number must contains a single number or be empty
-        seqnr = header[0:6].replace(b'\x00', b' ').strip()
-        if not seqnr.isdigit() and seqnr != b'':
-            return False
-        # Check for any valid control header types.
-        if header[6:7] in [b'D', b'R', b'Q', b'M']:
-            return True
-        # Check if Full-SEED
-        if not header[6:7] == b'V':
-            return False
-        # Parse the whole file and check whether it has has a data record.
-        fp.seek(1, 1)
-        _i = 0
-        # search for blockettes 010 or 008
-        while True:
-            if fp.read(3) in [b'010', b'008']:
-                break
-            # the next for bytes are the record length
-            # as we are currently at position 7 (fp.read(3) fp.read(4))
-            # we need to subtract this first before we seek
-            # to the appropriate position
+    # Open filehandler or use an existing file like object.
+    if not hasattr(filename, 'read'):
+        file_size = os.path.getsize(filename)
+        with io.open(filename, 'rb') as fh:
+            return __is_mseed(fh, file_size=file_size)
+    else:
+        initial_pos = filename.tell()
+        try:
+            if hasattr(filename, "getbuffer"):
+                file_size = filename.getbuffer().nbytes
             try:
-                fp.seek(int(fp.read(4)) - 7, 1)
+                file_size = os.fstat(filename.fileno()).st_size
             except:
-                return False
-            _i += 1
-            # break after 3 cycles
-            if _i == 3:
-                return False
-        # Try to get a record length.
-        fp.seek(8, 1)
+                _p = filename.tell()
+                filename.seek(0, 2)
+                file_size = filename.tell()
+                filename.seek(_p, 0)
+            return __is_mseed(filename, file_size)
+        finally:
+            # Reset pointer.
+            filename.seek(initial_pos, 0)
+
+
+def __is_mseed(fp, file_size):  # NOQA
+    """
+    Internal version of _is_mseed working only with open file-like object.
+    """
+    header = fp.read(7)
+    # File has less than 7 characters
+    if len(header) != 7:
+        return False
+    # Sequence number must contains a single number or be empty
+    seqnr = header[0:6].replace(b'\x00', b' ').strip()
+    if not seqnr.isdigit() and seqnr != b'':
+        # This might be a completely empty sequence - in that case jump 128
+        # bytes and try again.
+        fp.seek(-7, 1)
         try:
-            record_length = pow(2, int(fp.read(2)))
+            _t = fp.read(128).decode().strip()
         except:
             return False
-        file_size = os.path.getsize(filename)
-        # Jump to the second record.
-        fp.seek(record_length + 6)
-        # Loop over all records and return True if one record is a data
-        # record
-        while fp.tell() < file_size:
-            flag = fp.read(1)
-            if flag in [b'D', b'R', b'Q', b'M']:
-                return True
-            fp.seek(record_length - 1, 1)
+        if not _t:
+            return __is_mseed(fp=fp, file_size=file_size)
+        return False
+    # Check for any valid control header types.
+    if header[6:7] in [b'D', b'R', b'Q', b'M']:
+        return True
+    elif header[6:7] == b" ":
+        # If empty, it might be a noise record. Check the rest of 128 bytes
+        # (min record size) and try again.
+        try:
+            _t = fp.read(128 - 7).decode().strip()
+        except:
+            return False
+        if not _t:
+            return __is_mseed(fp=fp, file_size=file_size)
+        return False
+    # Check if Full-SEED
+    elif header[6:7] != b'V':
+        return False
+    # Parse the whole file and check whether it has has a data record.
+    fp.seek(1, 1)
+    _i = 0
+    # search for blockettes 010 or 008
+    while True:
+        if fp.read(3) in [b'010', b'008']:
+            break
+        # the next for bytes are the record length
+        # as we are currently at position 7 (fp.read(3) fp.read(4))
+        # we need to subtract this first before we seek
+        # to the appropriate position
+        try:
+            fp.seek(int(fp.read(4)) - 7, 1)
+        except:
+            return False
+        _i += 1
+        # break after 3 cycles
+        if _i == 3:
+            return False
+
+    # Try to get a record length.
+    fp.seek(8, 1)
+    try:
+        record_length = pow(2, int(fp.read(2)))
+    except:
         return False
 
+    # Jump to the second record.
+    fp.seek(record_length + 6, 0)
+    # Loop over all records and return True if one record is a data
+    # record
+    while fp.tell() < file_size:
+        flag = fp.read(1)
+        if flag in [b'D', b'R', b'Q', b'M']:
+            return True
+        fp.seek(record_length - 1, 1)
+    return False
+
 
 def _read_mseed(mseed_object, starttime=None, endtime=None, headonly=False,
                 sourcename=None, reclen=None, details=False,

diff --git a/obspy/io/mseed/headers.py b/obspy/io/mseed/headers.py
@@ -42,7 +42,8 @@
 # Valid control headers in ASCII numbers.
 SEED_CONTROL_HEADERS = [ord('V'), ord('A'), ord('S'), ord('T')]
 MINI_SEED_CONTROL_HEADERS = [ord('D'), ord('R'), ord('Q'), ord('M')]
-VALID_CONTROL_HEADERS = SEED_CONTROL_HEADERS + MINI_SEED_CONTROL_HEADERS
+VALID_CONTROL_HEADERS = SEED_CONTROL_HEADERS + MINI_SEED_CONTROL_HEADERS + \
+    [ord(' ')]
 
 # expected data types for libmseed id: (numpy, ctypes)
 DATATYPES = {b"a": C.c_char, b"i": C.c_int32, b"f": C.c_float,

diff --git a/obspy/io/mseed/src/obspy-readbuffer.c b/obspy/io/mseed/src/obspy-readbuffer.c
@@ -17,6 +17,32 @@
 #include "libmseed/unpackdata.h"
 
 
+// Similar to MS_ISVALIDBLANK but also works for blocks consisting only of
+// spaces.
+#define OBSPY_ISVALIDBLANK(X) (                            \
+  (isdigit ((int) *(X))   || !*(X)   || *(X) == ' ') &&    \
+  (isdigit ((int) *(X+1)) || !*(X+1) || *(X+1) == ' ') &&  \
+  (isdigit ((int) *(X+2)) || !*(X+2) || *(X+2) == ' ') &&  \
+  (isdigit ((int) *(X+3)) || !*(X+3) || *(X+3) == ' ') &&  \
+  (isdigit ((int) *(X+4)) || !*(X+4) || *(X+4) == ' ') &&  \
+  (isdigit ((int) *(X+5)) || !*(X+5) || *(X+5) == ' ') &&  \
+  (*(X+6) ==' ') && (*(X+7) ==' ') && (*(X+8) ==' ') &&  \
+  (*(X+9) ==' ') && (*(X+10)==' ') && (*(X+11)==' ') &&  \
+  (*(X+12)==' ') && (*(X+13)==' ') && (*(X+14)==' ') &&  \
+  (*(X+15)==' ') && (*(X+16)==' ') && (*(X+17)==' ') &&  \
+  (*(X+18)==' ') && (*(X+19)==' ') && (*(X+20)==' ') &&  \
+  (*(X+21)==' ') && (*(X+22)==' ') && (*(X+23)==' ') &&  \
+  (*(X+24)==' ') && (*(X+25)==' ') && (*(X+26)==' ') &&  \
+  (*(X+27)==' ') && (*(X+28)==' ') && (*(X+29)==' ') &&  \
+  (*(X+30)==' ') && (*(X+31)==' ') && (*(X+32)==' ') &&  \
+  (*(X+33)==' ') && (*(X+34)==' ') && (*(X+35)==' ') &&  \
+  (*(X+36)==' ') && (*(X+37)==' ') && (*(X+38)==' ') &&  \
+  (*(X+39)==' ') && (*(X+40)==' ') && (*(X+41)==' ') &&  \
+  (*(X+42)==' ') && (*(X+43)==' ') && (*(X+44)==' ') &&  \
+  (*(X+45)==' ') && (*(X+46)==' ') && (*(X+47)==' ') )
+
+
+
 // Dummy wrapper around malloc.
 void * allocate_bytes(int count) {
     return malloc(count);
@@ -268,7 +294,7 @@ readMSEEDBuffer (char *mseed, int buflen, Selections *selections, flag
         // Otherwise assume the smallest possible record length and assure that enough
         // data is present.
         else {
-            if (offset + 256 > buflen) {
+            if (offset + MINRECLEN > buflen) {
                 ms_log(1, "readMSEEDBuffer(): Last record only has %i byte(s) which "
                           "is not enough to constitute a full SEED record. Corrupt data? "
                           "Record will be skipped.\n", buflen - offset);
@@ -277,6 +303,12 @@ readMSEEDBuffer (char *mseed, int buflen, Selections *selections, flag
             }
         }
 
+        // Skip empty or noise records.
+        if (OBSPY_ISVALIDBLANK(mseed + offset)) {
+            offset += MINRECLEN;
+            continue;
+        }
+
         // Pass (buflen - offset) because msr_parse() expects only a single record. This
         // way libmseed can take care to not overstep bounds.
         retcode = msr_parse ( (mseed+offset), buflen - offset, &msr, reclen, dataflag, verbose);

diff --git a/obspy/io/mseed/tests/data/single_record_plus_noise_record.mseed b/obspy/io/mseed/tests/data/single_record_plus_noise_record.mseed
diff --git a/obspy/io/mseed/tests/data/various_noise_records.mseed b/obspy/io/mseed/tests/data/various_noise_records.mseed
diff --git a/obspy/io/mseed/tests/test_mseed_reading_and_writing.py b/obspy/io/mseed/tests/test_mseed_reading_and_writing.py
@@ -17,9 +17,8 @@
 from obspy import Stream, Trace, UTCDateTime, read
 from obspy.core import AttribDict
 from obspy.core.util import CatchOutput, NamedTemporaryFile
-from obspy.io.mseed import util
-from obspy.io.mseed.core import _is_mseed, _read_mseed, _write_mseed, \
-    InternalMSEEDReadingWarning
+from obspy.io.mseed import util, InternalMSEEDReadingWarning
+from obspy.io.mseed.core import _is_mseed, _read_mseed, _write_mseed
 from obspy.io.mseed.headers import ENCODINGS, clibmseed
 from obspy.io.mseed.msstruct import _MSStruct
 
@@ -228,10 +227,17 @@ def test_is_mseed(self):
         # Mini-SEED file names.
         mseed_filenames = ['BW.BGLD.__.EHE.D.2008.001.first_10_records',
                            'gaps.mseed', 'qualityflags.mseed', 'test.mseed',
-                           'timingquality.mseed']
+                           'timingquality.mseed', 'blockette008.mseed',
+                           'fullseed.mseed', 'various_noise_records.mseed']
+
         # Non Mini-SEED file names.
         non_mseed_filenames = ['test_mseed_reading_and_writing.py',
-                               '__init__.py']
+                               '__init__.py',
+                               os.path.join('data', 'not.mseed'),
+                               os.path.join('data', 'not2.mseed'),
+                               os.path.join('data', 'not3.mseed'),
+                               os.path.join('data', 'not4.mseed')]
+
         # Loop over Mini-SEED files
         for _i in mseed_filenames:
             filename = os.path.join(self.path, 'data', _i)
@@ -243,6 +249,34 @@ def test_is_mseed(self):
             is_mseed = _is_mseed(filename)
             self.assertFalse(is_mseed)
 
+        # Also test it from an open file.
+        for _i in mseed_filenames:
+            filename = os.path.join(self.path, 'data', _i)
+            with io.open(filename, "rb") as fh:
+                is_mseed = _is_mseed(fh)
+            self.assertTrue(is_mseed)
+        for _i in non_mseed_filenames:
+            filename = os.path.join(self.path, _i)
+            with io.open(filename, "rb") as fh:
+                is_mseed = _is_mseed(fh)
+            self.assertFalse(is_mseed)
+
+        # And from a BytesIO.
+        for _i in mseed_filenames:
+            filename = os.path.join(self.path, 'data', _i)
+            with io.open(filename, "rb") as fh:
+                with io.BytesIO(fh.read()) as buf:
+                    buf.seek(0, 0)
+                    is_mseed = _is_mseed(buf)
+            self.assertTrue(is_mseed)
+        for _i in non_mseed_filenames:
+            filename = os.path.join(self.path, _i)
+            with io.open(filename, "rb") as fh:
+                with io.BytesIO(fh.read()) as buf:
+                    buf.seek(0, 0)
+                    is_mseed = _is_mseed(buf)
+            self.assertFalse(is_mseed)
+
     def test_read_single_record_to_msr(self):
         """
         Tests readSingleRecordtoMSR against start and end times.
@@ -280,6 +314,34 @@ def test_read_file_via_mseed(self):
         for _i in range(5):
             self.assertEqual(stream[0].data[_i], data[_i])
 
+        # Make sure it can also read from open files.
+        with io.open(testfile, "rb") as fh:
+            stream = _read_mseed(fh)
+        stream.verify()
+        self.assertEqual(stream[0].stats.network, 'NL')
+        self.assertEqual(stream[0].stats['station'], 'HGN')
+        self.assertEqual(stream[0].stats.get('location'), '00')
+        self.assertEqual(stream[0].stats.npts, 11947)
+        self.assertEqual(stream[0].stats['sampling_rate'], 40.0)
+        self.assertEqual(stream[0].stats.get('channel'), 'BHZ')
+        for _i in range(5):
+            self.assertEqual(stream[0].data[_i], data[_i])
+
+        # And from BytesIO.
+        with io.open(testfile, "rb") as fh:
+            with io.BytesIO(fh.read()) as buf:
+                buf.seek(0, 0)
+                stream = _read_mseed(buf)
+        stream.verify()
+        self.assertEqual(stream[0].stats.network, 'NL')
+        self.assertEqual(stream[0].stats['station'], 'HGN')
+        self.assertEqual(stream[0].stats.get('location'), '00')
+        self.assertEqual(stream[0].stats.npts, 11947)
+        self.assertEqual(stream[0].stats['sampling_rate'], 40.0)
+        self.assertEqual(stream[0].stats.get('channel'), 'BHZ')
+        for _i in range(5):
+            self.assertEqual(stream[0].data[_i], data[_i])
+
     def test_read_partial_time_window_from_file(self):
         """
         Uses obspy.io.mseed.mseed._read_mseed to read only read a certain time
@@ -917,6 +979,7 @@ def test_is_valid_mseed(self):
         # fullseed starting with blockette 010
         file = os.path.join(self.path, 'data', 'fullseed.mseed')
         self.assertTrue(_is_mseed(file))
+        return
         # fullseed starting with blockette 008
         file = os.path.join(self.path, 'data', 'blockette008.mseed')
         self.assertTrue(_is_mseed(file))