# Patch summary (commentary only; the patch text from the first "diff --git" header onward is unchanged).
#
# pafpy/paffile.py
#   - PafFile.__init__ takes `fileobj` instead of `path`: an io.IOBase instance is used directly as
#     the stream (path = None, _is_stdin = False); the string "-" selects stdin (path = None,
#     _is_stdin = True); any other value is wrapped in pathlib.Path.
#   - __next__ opens the stream on first use when reading stdin, and decodes lines that arrive as
#     bytes before passing them to PafRecord.from_str.
#   - New _open() helper: checks the first two bytes with is_compressed() and returns gzip.open(...)
#     on a gzip magic match, otherwise open(self.path) or sys.stdin.buffer; with neither a path nor
#     stdin it returns the existing self._stream.
#   - open() on an already-open PafFile now seeks the stream to 0 instead of raising IOError.
#   - `closed` now tests only `self._stream is None`; close() ignores AttributeError raised by the
#     stream's close() and always resets _stream to None.
# pafpy/utils.py (new file): GZIP_MAGIC, first_n_bytes(fileobj, n=2), is_compressed(fileobj).
# tests/: adds demo.paf, demo.paf.gz (binary), stream.py (no hunks shown) and test_utils.py;
#   extends test_paffile.py with constructor, gzip, file-object and stdin cases.
#
# NOTE(review): first_n_bytes() always calls fileobj.seek(0), and _open() passes sys.stdin.buffer
#   through is_compressed(); presumably this fails when stdin is a non-seekable pipe. The stdin test
#   only monkeypatches a BytesIO-backed TextIOWrapper, so it does not cover that case - TODO confirm.
# NOTE(review): open() calls self._stream.seek(0) on any already-open stream, including a
#   caller-supplied file object; the same seekability question applies - TODO confirm.
# NOTE(review): the constructor keyword is renamed from `path` to `fileobj`; the test in this patch
#   had to change PafFile(path="foo") to PafFile(fileobj="foo"), so keyword callers are affected.
# NOTE(review): `closed` no longer consults `self._stream.closed`, so a file object closed by the
#   caller would still be reported as open - verify this is intended.
# NOTE(review): in this copy the patch's line breaks appear collapsed (hunk headers and +/- lines
#   share physical lines); restore the original newlines before applying - TODO confirm against the
#   source commit.
diff --git a/pafpy/paffile.py b/pafpy/paffile.py index fecf1b4..781fbc3 100644 --- a/pafpy/paffile.py +++ b/pafpy/paffile.py @@ -9,11 +9,15 @@ from pafpy import PafFile ``` """ +import gzip +import io import os +import sys from pathlib import Path -from typing import Optional, TextIO, Union +from typing import IO, Optional, TextIO, Union from pafpy.pafrecord import PafRecord +from pafpy.utils import is_compressed PathLike = Union[Path, str, os.PathLike] @@ -21,12 +25,20 @@ class PafFile: """Stream access to a PAF file. - The file is *not* automatically opened. After construction, it can be opened in - one of two ways: + `fileobj` is an object to read the PAF file from. Can be a `str`, `pathlib.Path`, + or an opened file ([file object](https://docs.python.org/3/glossary.html#term-file-object)). + + > *Note: to use stdin, pass `fileobj="-"`*. See the usage docs for more details. + + The file is *not* automatically opened - unless already open. After construction, + it can be opened in one of two ways: 1. Manually, with `PafFile.open`. Remember to close the file when finished. 2. Via a context manager (`with`) block. + If an already-open `fileobj` is given, the `PafFile` can be iterated without the + need to open it. + ## Example ```py from pafpy import PafFile, PafRecord @@ -58,10 +70,16 @@ class PafFile: `pafpy.pafrecord.PafRecord` objects. """ - def __init__(self, path: PathLike): - self.path = Path(path) - """Path to the PAF file. Can be a `str` or a `pathlib.Path` object.""" - self._stream: Optional[TextIO] = None + def __init__(self, fileobj: Union[PathLike, IO]): + if isinstance(fileobj, io.IOBase): + self._stream = fileobj + self.path = None + self._is_stdin = False + else: + self._stream: Optional[TextIO] = None + self.path = Path(fileobj) if not str(fileobj) == "-" else None + self._is_stdin = self.path is None + """Path to the PAF file. 
If `fileobj` is an open file object, then `path` will be `None` .""" def __del__(self): self.close() @@ -76,9 +94,32 @@ def __iter__(self): return self def __next__(self) -> PafRecord: - if self.closed: + if self.closed and self._is_stdin: + self.open() + elif self.closed: raise IOError("PAF file is closed - cannot get next element.") - return PafRecord.from_str(next(self._stream)) + line = next(self._stream) + if isinstance(line, bytes): + line = line.decode() + return PafRecord.from_str(line) + + def _open(self) -> IO: + if self.path is not None: + with open(self.path, mode="rb") as fileobj: + file_is_compressed = is_compressed(fileobj) + + if file_is_compressed: + return gzip.open(self.path) + else: + return open(self.path) + elif self._is_stdin: + return ( + sys.stdin.buffer + if not is_compressed(sys.stdin.buffer) + else gzip.open(sys.stdin.buffer) + ) + else: + return self._stream def open(self) -> "PafFile": """Opens the PAF file to allow iterating over the records. Returns a `PafFile` @@ -100,22 +141,29 @@ def open(self) -> "PafFile": assert paf.closed ``` + > *Note: If the file is already open, the file position will be reset to the + beginning.* + ## Errors - - If the file is already open, an `IOError` exception is raised. - If `path` does not exist, an `OSError` exception is raised. 
""" if not self.closed: - raise IOError("PafFile is already open.") - self._stream = self.path.open() + self._stream.seek(0) + else: + self._stream = self._open() return self @property def closed(self) -> bool: """Is the PAF file closed?""" - return self._stream is None or self._stream.closed + return self._stream is None def close(self): """Close the `PafFile`.""" if not self.closed: - self._stream.close() - self._stream = None + try: + self._stream.close() + except AttributeError: # happens if stream is stdin + pass + finally: + self._stream = None diff --git a/pafpy/utils.py b/pafpy/utils.py new file mode 100644 index 0000000..0aecd75 --- /dev/null +++ b/pafpy/utils.py @@ -0,0 +1,14 @@ +from typing import IO + +GZIP_MAGIC = b"\x1f\x8b" + + +def first_n_bytes(fileobj: IO, n: int = 2) -> bytes: + n_bytes = fileobj.read(n) + fileobj.seek(0) + return n_bytes if isinstance(n_bytes, bytes) else n_bytes.encode() + + +def is_compressed(fileobj: IO) -> bool: + n_bytes = first_n_bytes(fileobj) + return n_bytes == GZIP_MAGIC diff --git a/tests/demo.paf b/tests/demo.paf new file mode 100644 index 0000000..e36479d --- /dev/null +++ b/tests/demo.paf @@ -0,0 +1 @@ +11737-1 294 89 294 - 31171-1 292 87 292 205 205 0 NM:i:0 ms:i:410 AS:i:410 nn:i:0 tp:A:S cm:i:37 s1:i:193 de:f:0 rl:i:91 cg:Z:205M diff --git a/tests/demo.paf.gz b/tests/demo.paf.gz new file mode 100644 index 0000000..15184f5 Binary files /dev/null and b/tests/demo.paf.gz differ diff --git a/tests/stream.py b/tests/stream.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_paffile.py b/tests/test_paffile.py index e363440..6a03100 100644 --- a/tests/test_paffile.py +++ b/tests/test_paffile.py @@ -1,3 +1,4 @@ +import io import tempfile from pathlib import Path @@ -6,6 +7,32 @@ from pafpy.paffile import PafFile from pafpy.pafrecord import PafRecord +TEST_DIR = Path(__file__).parent + + +class TestConstructor: + def test_fileobj_is_dash_uses_stdin(self): + fileobj = "-" + paf = PafFile(fileobj) + + 
assert paf._stream is None + assert paf.path is None + assert paf._is_stdin + + def test_fileobj_is_path(self): + fileobj = "path/to/file" + paf = PafFile(fileobj) + + assert paf.path == Path(fileobj) + assert paf.closed + + def test_file_object_given(self): + with tempfile.TemporaryFile() as fileobj: + paf = PafFile(fileobj) + + assert paf.path is None + assert not paf.closed + class TestClosed: def test_file_is_closed_returns_true(self): @@ -37,15 +64,19 @@ def test_file_does_not_exist_raises_error(self): with pytest.raises(OSError): paf.open() + def test_file_object_already_open_returns_itself(self): + fileobj = open(__file__, mode="rb") + paf = PafFile(fileobj) + assert paf.open()._stream == paf._stream + class TestContextManager: - def test_file_already_open_raises_error(self): + def test_file_already_open_resets_position(self): path = __file__ paf = PafFile(path).open() - with pytest.raises(IOError): - with paf: - pass + with paf: + assert paf._stream.tell() == 0 def test_file_not_open_opens_file_in_context_and_closes_afterwards(self): path = __file__ @@ -93,6 +124,54 @@ def test_for_loop_returns_pafrecords(self): assert actual == expected def test_call_next_on_closed_file_raises_error(self): - paf = PafFile(path="foo") + paf = PafFile(fileobj="foo") with pytest.raises(IOError): next(paf) + + +class TestIO: + def test_read_gzip_compressed(self): + path = TEST_DIR / "demo.paf.gz" + with PafFile(path) as paf: + record = next(paf) + + assert record.qname == "11737-1" + + def test_read_normal_file(self): + path = TEST_DIR / "demo.paf" + with PafFile(path) as paf: + record = next(paf) + + assert record.qname == "11737-1" + + def test_read_from_fileobj(self): + path = TEST_DIR / "demo.paf" + with open(path) as fileobj: + paf = PafFile(fileobj) + record = next(paf) + + assert record.qname == "11737-1" + + def test_read_from_stdin(self, monkeypatch): + fields = [ + "query_name", + "1239", + "65", + "1239", + "+", + "target_name", + "4378340", + "2555250", + 
"2556472", + "1139", + "1228", + "60", + ] + line = "\t".join(fields).encode() + fileobj = io.TextIOWrapper(io.BytesIO(line)) + monkeypatch.setattr("sys.stdin", fileobj) + path = "-" + paf = PafFile(path) + record = next(paf) + + assert record.qname == fields[0] diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..0f8cabe --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,95 @@ +from tempfile import TemporaryFile + +import pytest + +from pafpy.utils import GZIP_MAGIC, first_n_bytes, is_compressed + + +class TestFirstNBytes: + def test_empty_file_returns_empty(self): + contents = b"" + with TemporaryFile() as fileobj: + fileobj.write(contents) + fileobj.seek(0) + actual = first_n_bytes(fileobj) + expected = contents + + assert actual == expected + + def test_one_byte_returns_one_byte(self): + contents = b"1" + n = 2 + with TemporaryFile() as fileobj: + fileobj.write(contents) + fileobj.seek(0) + actual = first_n_bytes(fileobj, n=n) + expected = contents + + assert actual == expected + + def test_two_bytes_returns_two_bytes(self): + contents = b"12" + n = 2 + + with TemporaryFile() as fileobj: + fileobj.write(contents) + fileobj.seek(0) + actual = first_n_bytes(fileobj, n=n) + expected = contents + + assert actual == expected + + def test_more_bytes_returns_two_bytes(self): + contents = b"12345" + n = 2 + + with TemporaryFile() as fileobj: + fileobj.write(contents) + fileobj.seek(0) + actual = first_n_bytes(fileobj, n=n) + expected = contents[:n] + + assert actual == expected + + def test_text_stream_returns_bytes(self): + contents = "12345" + n = 3 + + with TemporaryFile("w+") as fileobj: + fileobj.write(contents) + fileobj.seek(0) + actual = first_n_bytes(fileobj, n=n) + expected = contents[:n].encode() + + assert actual == expected + + def test_nonreadable_object_raises_error(self): + fileobj = b"12345" + n = 3 + + with pytest.raises(AttributeError) as err: + first_n_bytes(fileobj, n=n) + assert err.match("has no attribute 
'read'") + + +class TestIsCompressed: + def test_empty_file(self): + contents = b"" + with TemporaryFile() as fileobj: + fileobj.write(contents) + fileobj.seek(0) + assert not is_compressed(fileobj) + + def test_non_compressed(self): + contents = b"not compressed" + with TemporaryFile() as fileobj: + fileobj.write(contents) + fileobj.seek(0) + assert not is_compressed(fileobj) + + def test_compressed(self): + contents = GZIP_MAGIC + b" is compressed" + with TemporaryFile() as fileobj: + fileobj.write(contents) + fileobj.seek(0) + assert is_compressed(fileobj)