From d8e0b5fef03191aa142467d890529afebd66b688 Mon Sep 17 00:00:00 2001 From: Michael Hall Date: Mon, 1 Feb 2021 12:16:15 +1000 Subject: [PATCH] add gzip and file stream compatibility [#2 & #5] --- pafpy/paffile.py | 78 +++++++++++++++++++++++++++------- pafpy/utils.py | 14 +++++++ tests/demo.paf | 1 + tests/demo.paf.gz | Bin 0 -> 126 bytes tests/stream.py | 0 tests/test_paffile.py | 89 ++++++++++++++++++++++++++++++++++++--- tests/test_utils.py | 95 ++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 257 insertions(+), 20 deletions(-) create mode 100644 pafpy/utils.py create mode 100644 tests/demo.paf create mode 100644 tests/demo.paf.gz create mode 100644 tests/stream.py create mode 100644 tests/test_utils.py diff --git a/pafpy/paffile.py b/pafpy/paffile.py index fecf1b4..781fbc3 100644 --- a/pafpy/paffile.py +++ b/pafpy/paffile.py @@ -9,11 +9,15 @@ from pafpy import PafFile ``` """ +import gzip +import io import os +import sys from pathlib import Path -from typing import Optional, TextIO, Union +from typing import IO, Optional, TextIO, Union from pafpy.pafrecord import PafRecord +from pafpy.utils import is_compressed PathLike = Union[Path, str, os.PathLike] @@ -21,12 +25,20 @@ class PafFile: """Stream access to a PAF file. - The file is *not* automatically opened. After construction, it can be opened in - one of two ways: + `fileobj` is an object to read the PAF file from. Can be a `str`, `pathlib.Path`, + or an opened file ([file object](https://docs.python.org/3/glossary.html#term-file-object)). + + > *Note: to use stdin, pass `fileobj="-"`*. See the usage docs for more details. + + The file is *not* automatically opened - unless already open. After construction, + it can be opened in one of two ways: 1. Manually, with `PafFile.open`. Remember to close the file when finished. 2. Via a context manager (`with`) block. + If an already-open `fileobj` is given, the `PafFile` can be iterated without the + need to open it. 
+ ## Example ```py from pafpy import PafFile, PafRecord @@ -58,10 +70,16 @@ class PafFile: `pafpy.pafrecord.PafRecord` objects. """ - def __init__(self, path: PathLike): - self.path = Path(path) - """Path to the PAF file. Can be a `str` or a `pathlib.Path` object.""" - self._stream: Optional[TextIO] = None + def __init__(self, fileobj: Union[PathLike, IO]): + if isinstance(fileobj, io.IOBase): + self._stream = fileobj + self.path = None + self._is_stdin = False + else: + self._stream: Optional[TextIO] = None + self.path = Path(fileobj) if not str(fileobj) == "-" else None + self._is_stdin = self.path is None + """Path to the PAF file. If `fileobj` is an open file object, then `path` will be `None`.""" def __del__(self): self.close() @@ -76,9 +94,32 @@ def __iter__(self): return self def __next__(self) -> PafRecord: - if self.closed: + if self.closed and self._is_stdin: + self.open() + elif self.closed: raise IOError("PAF file is closed - cannot get next element.") - return PafRecord.from_str(next(self._stream)) + line = next(self._stream) + if isinstance(line, bytes): + line = line.decode() + return PafRecord.from_str(line) + + def _open(self) -> IO: + if self.path is not None: + with open(self.path, mode="rb") as fileobj: + file_is_compressed = is_compressed(fileobj) + + if file_is_compressed: + return gzip.open(self.path) + else: + return open(self.path) + elif self._is_stdin: + return ( + sys.stdin.buffer + if not is_compressed(sys.stdin.buffer) + else gzip.open(sys.stdin.buffer) + ) + else: + return self._stream def open(self) -> "PafFile": """Opens the PAF file to allow iterating over the records. Returns a `PafFile` @@ -100,22 +141,29 @@ def open(self) -> "PafFile": assert paf.closed ``` + > *Note: If the file is already open, the file position will be reset to the + beginning.* + ## Errors - - If the file is already open, an `IOError` exception is raised. - If `path` does not exist, an `OSError` exception is raised. 
""" if not self.closed: - raise IOError("PafFile is already open.") - self._stream = self.path.open() + self._stream.seek(0) + else: + self._stream = self._open() return self @property def closed(self) -> bool: """Is the PAF file closed?""" - return self._stream is None or self._stream.closed + return self._stream is None def close(self): """Close the `PafFile`.""" if not self.closed: - self._stream.close() - self._stream = None + try: + self._stream.close() + except AttributeError: # happens if stream is stdin + pass + finally: + self._stream = None diff --git a/pafpy/utils.py b/pafpy/utils.py new file mode 100644 index 0000000..0aecd75 --- /dev/null +++ b/pafpy/utils.py @@ -0,0 +1,14 @@ +from typing import IO + +GZIP_MAGIC = b"\x1f\x8b" + + +def first_n_bytes(fileobj: IO, n: int = 2) -> bytes: + n_bytes = fileobj.read(n) + fileobj.seek(0) + return n_bytes if isinstance(n_bytes, bytes) else n_bytes.encode() + + +def is_compressed(fileobj: IO) -> bool: + n_bytes = first_n_bytes(fileobj) + return n_bytes == GZIP_MAGIC diff --git a/tests/demo.paf b/tests/demo.paf new file mode 100644 index 0000000..e36479d --- /dev/null +++ b/tests/demo.paf @@ -0,0 +1 @@ +11737-1 294 89 294 - 31171-1 292 87 292 205 205 0 NM:i:0 ms:i:410 AS:i:410 nn:i:0 tp:A:S cm:i:37 s1:i:193 de:f:0 rl:i:91 cg:Z:205M diff --git a/tests/demo.paf.gz b/tests/demo.paf.gz new file mode 100644 index 0000000000000000000000000000000000000000..15184f5e011c302ccbbaa5d7e726e1357e90e8ed GIT binary patch literal 126 zcmV-^0D=D>iwFqOA`M^w17u}wZ!U0QW&kaWy$Xad5QOWwcM&p)vSvFUKpQ(-y^{kI z