Skip to content

Commit

Permalink
add gzip and file stream compatibility [#2 & #5]
Browse files Browse the repository at this point in the history
  • Loading branch information
mbhall88 committed Feb 1, 2021
1 parent ad9cb50 commit d8e0b5f
Show file tree
Hide file tree
Showing 7 changed files with 257 additions and 20 deletions.
78 changes: 63 additions & 15 deletions pafpy/paffile.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,24 +9,36 @@
from pafpy import PafFile
```
"""
import gzip
import io
import os
import sys
from pathlib import Path
from typing import Optional, TextIO, Union
from typing import IO, Optional, TextIO, Union

from pafpy.pafrecord import PafRecord
from pafpy.utils import is_compressed

PathLike = Union[Path, str, os.PathLike]


class PafFile:
"""Stream access to a PAF file.
The file is *not* automatically opened. After construction, it can be opened in
one of two ways:
`fileobj` is an object to read the PAF file from. Can be a `str`, `pathlib.Path`,
or an opened file ([file object](https://docs.python.org/3/glossary.html#term-file-object)).
> *Note: to use stdin, pass `fileobj="-"`*. See the usage docs for more details.
The file is *not* automatically opened - unless already open. After construction,
it can be opened in one of two ways:
1. Manually, with `PafFile.open`. Remember to close the file when finished.
2. Via a context manager (`with`) block.
If an already-open `fileobj` is given, the `PafFile` can be iterated without the
need to open it.
## Example
```py
from pafpy import PafFile, PafRecord
Expand Down Expand Up @@ -58,10 +70,16 @@ class PafFile:
`pafpy.pafrecord.PafRecord` objects.
"""

def __init__(self, path: PathLike):
self.path = Path(path)
"""Path to the PAF file. Can be a `str` or a `pathlib.Path` object."""
self._stream: Optional[TextIO] = None
def __init__(self, fileobj: Union[PathLike, IO]):
if isinstance(fileobj, io.IOBase):
self._stream = fileobj
self.path = None
self._is_stdin = False
else:
self._stream: Optional[TextIO] = None
self.path = Path(fileobj) if not str(fileobj) == "-" else None
self._is_stdin = self.path is None
"""Path to the PAF file. If `fileobj` is an open file object, then `path` will be `None` ."""

def __del__(self):
self.close()
Expand All @@ -76,9 +94,32 @@ def __iter__(self):
return self

def __next__(self) -> PafRecord:
if self.closed:
if self.closed and self._is_stdin:
self.open()
elif self.closed:
raise IOError("PAF file is closed - cannot get next element.")
return PafRecord.from_str(next(self._stream))
line = next(self._stream)
if isinstance(line, bytes):
line = line.decode()
return PafRecord.from_str(line)

def _open(self) -> IO:
    """Return the stream that records should be read from.

    - With a `path`: the file is probed for the gzip magic number and then
      opened with `gzip.open` if compressed, or the builtin `open` otherwise.
    - With stdin (`fileobj="-"`): `sys.stdin.buffer` is returned, wrapped in
      `gzip.open` if it is compressed.
    - Otherwise the stream already held by this object is returned unchanged.
    """
    if self.path is None:
        if not self._is_stdin:
            # An already-open file object was supplied at construction.
            return self._stream
        stdin_bytes = sys.stdin.buffer
        if is_compressed(stdin_bytes):
            return gzip.open(stdin_bytes)
        return stdin_bytes

    # Probe with a short-lived binary handle so the real handle starts fresh.
    with open(self.path, mode="rb") as probe:
        gzipped = is_compressed(probe)

    opener = gzip.open if gzipped else open
    return opener(self.path)

def open(self) -> "PafFile":
"""Opens the PAF file to allow iterating over the records. Returns a `PafFile`
Expand All @@ -100,22 +141,29 @@ def open(self) -> "PafFile":
assert paf.closed
```
> *Note: If the file is already open, the file position will be reset to the
beginning.*
## Errors
- If the file is already open, an `IOError` exception is raised.
- If `path` does not exist, an `OSError` exception is raised.
"""
if not self.closed:
raise IOError("PafFile is already open.")
self._stream = self.path.open()
self._stream.seek(0)
else:
self._stream = self._open()
return self

@property
def closed(self) -> bool:
"""Is the PAF file closed?"""
return self._stream is None or self._stream.closed
return self._stream is None

def close(self):
"""Close the `PafFile`."""
if not self.closed:
self._stream.close()
self._stream = None
try:
self._stream.close()
except AttributeError: # happens if stream is stdin
pass
finally:
self._stream = None
14 changes: 14 additions & 0 deletions pafpy/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from typing import IO

GZIP_MAGIC = b"\x1f\x8b"


def first_n_bytes(fileobj: IO, n: int = 2) -> bytes:
    """Return the first `n` bytes of `fileobj` without consuming them.

    Seekable streams are read and then rewound to position 0. Streams that
    report themselves as non-seekable but offer `peek` (for example a buffered
    pipe such as a piped `sys.stdin.buffer`) are inspected with `peek`, which
    does not advance the stream position, so nothing is lost.

    Text streams are supported: the characters read are encoded to `bytes`.

    Note: `peek` performs at most one raw read, so on a slow pipe it may
    return fewer than `n` bytes even though more data will arrive later.

    Raises `AttributeError` if `fileobj` has no `read` method.
    """
    seekable = getattr(fileobj, "seekable", None)
    # Objects without `seekable` are handled exactly as before (read + seek).
    can_seek = seekable() if callable(seekable) else True

    if not can_seek and hasattr(fileobj, "peek"):
        # peek may return more than requested; trim to the first n bytes.
        n_bytes = fileobj.peek(n)[:n]
    else:
        n_bytes = fileobj.read(n)
        fileobj.seek(0)

    return n_bytes if isinstance(n_bytes, bytes) else n_bytes.encode()


def is_compressed(fileobj: IO) -> bool:
    """Report whether `fileobj` starts with the gzip magic number."""
    return first_n_bytes(fileobj) == GZIP_MAGIC
1 change: 1 addition & 0 deletions tests/demo.paf
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
11737-1 294 89 294 - 31171-1 292 87 292 205 205 0 NM:i:0 ms:i:410 AS:i:410 nn:i:0 tp:A:S cm:i:37 s1:i:193 de:f:0 rl:i:91 cg:Z:205M
Binary file added tests/demo.paf.gz
Binary file not shown.
Empty file added tests/stream.py
Empty file.
89 changes: 84 additions & 5 deletions tests/test_paffile.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import io
import tempfile
from pathlib import Path

Expand All @@ -6,6 +7,32 @@
from pafpy.paffile import PafFile
from pafpy.pafrecord import PafRecord

TEST_DIR = Path(__file__).parent


class TestConstructor:
    """Checks the state `PafFile.__init__` sets up for each kind of input."""

    def test_fileobj_is_dash_uses_stdin(self):
        # "-" is the marker for stdin: nothing is opened and no path is kept.
        paf = PafFile("-")

        assert paf._stream is None
        assert paf.path is None
        assert paf._is_stdin

    def test_fileobj_is_path(self):
        location = "path/to/file"
        paf = PafFile(location)

        assert paf.path == Path(location)
        assert paf.closed

    def test_file_object_given(self):
        # An open file object is adopted as-is, so the PafFile starts open.
        with tempfile.TemporaryFile() as handle:
            paf = PafFile(handle)

            assert paf.path is None
            assert not paf.closed


class TestClosed:
def test_file_is_closed_returns_true(self):
Expand Down Expand Up @@ -37,15 +64,19 @@ def test_file_does_not_exist_raises_error(self):
with pytest.raises(OSError):
paf.open()

def test_file_object_already_open_returns_itself(self):
    """Opening a `PafFile` built from an open file object keeps that object."""
    # Use a context manager so the handle is closed even if an assert fails.
    with open(__file__, mode="rb") as fileobj:
        paf = PafFile(fileobj)
        reopened = paf.open()

        assert reopened._stream == paf._stream
        # The stream must still be the exact object that was supplied.
        assert paf._stream is fileobj


class TestContextManager:
def test_file_already_open_raises_error(self):
def test_file_already_open_resets_position(self):
path = __file__
paf = PafFile(path).open()

with pytest.raises(IOError):
with paf:
pass
with paf:
assert paf._stream.tell() == 0

def test_file_not_open_opens_file_in_context_and_closes_afterwards(self):
path = __file__
Expand Down Expand Up @@ -93,6 +124,54 @@ def test_for_loop_returns_pafrecords(self):
assert actual == expected

def test_call_next_on_closed_file_raises_error(self):
paf = PafFile(path="foo")
paf = PafFile(fileobj="foo")
with pytest.raises(IOError):
next(paf)


class TestIO:
    """End-to-end reads from each supported kind of input."""

    def test_read_gzip_compressed(self):
        with PafFile(TEST_DIR / "demo.paf.gz") as paf:
            first = next(paf)

        assert first.qname == "11737-1"

    def test_read_normal_file(self):
        with PafFile(TEST_DIR / "demo.paf") as paf:
            first = next(paf)

        assert first.qname == "11737-1"

    def test_read_from_fileobj(self):
        # An already-open handle can be iterated without calling open().
        with open(TEST_DIR / "demo.paf") as handle:
            first = next(PafFile(handle))

        assert first.qname == "11737-1"

    def test_read_from_stdin(self, monkeypatch):
        qname = "query_name"
        columns = [
            qname,
            "1239",
            "65",
            "1239",
            "+",
            "target_name",
            "4378340",
            "2555250",
            "2556472",
            "1139",
            "1228",
            "60",
        ]
        payload = "\t".join(columns).encode()
        # Stand in for stdin with a text wrapper over an in-memory byte buffer.
        fake_stdin = io.TextIOWrapper(io.BytesIO(payload))
        monkeypatch.setattr("sys.stdin", fake_stdin)

        first = next(PafFile("-"))

        assert first.qname == qname
95 changes: 95 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
from tempfile import TemporaryFile

import pytest

from pafpy.utils import GZIP_MAGIC, first_n_bytes, is_compressed


class TestFirstNBytes:
    """Tests for `first_n_bytes` on binary files, text files and bad input."""

    def test_empty_file_returns_empty(self):
        contents = b""
        with TemporaryFile() as fileobj:
            fileobj.write(contents)
            fileobj.seek(0)
            actual = first_n_bytes(fileobj)
        expected = contents

        assert actual == expected

    def test_one_byte_returns_one_byte(self):
        # Asking for more bytes than exist returns only what is there.
        contents = b"1"
        n = 2
        with TemporaryFile() as fileobj:
            fileobj.write(contents)
            fileobj.seek(0)
            actual = first_n_bytes(fileobj, n=n)
        expected = contents

        assert actual == expected

    def test_two_bytes_returns_two_bytes(self):
        contents = b"12"
        n = 2

        with TemporaryFile() as fileobj:
            fileobj.write(contents)
            fileobj.seek(0)
            actual = first_n_bytes(fileobj, n=n)
        expected = contents

        assert actual == expected

    def test_more_bytes_returns_two_bytes(self):
        contents = b"12345"
        n = 2

        with TemporaryFile() as fileobj:
            fileobj.write(contents)
            fileobj.seek(0)
            actual = first_n_bytes(fileobj, n=n)
        expected = contents[:n]

        assert actual == expected

    def test_text_stream_returns_bytes(self):
        # A text-mode stream yields str, which must come back encoded.
        contents = "12345"
        n = 3

        with TemporaryFile("w+") as fileobj:
            fileobj.write(contents)
            fileobj.seek(0)
            actual = first_n_bytes(fileobj, n=n)
        expected = contents[:n].encode()

        assert actual == expected

    def test_nonreadable_object_raises_error(self):
        fileobj = b"12345"
        n = 3

        # `match=` makes pytest check the message itself; an assertion placed
        # after the raising call inside the `with` body would never execute.
        with pytest.raises(AttributeError, match="has no attribute 'read'"):
            first_n_bytes(fileobj, n=n)


class TestIsCompressed:
    """Tests for gzip detection via the leading magic bytes."""

    def test_empty_file(self):
        with TemporaryFile() as handle:
            handle.write(b"")
            handle.seek(0)
            detected = is_compressed(handle)

        assert not detected

    def test_non_compressed(self):
        with TemporaryFile() as handle:
            handle.write(b"not compressed")
            handle.seek(0)
            detected = is_compressed(handle)

        assert not detected

    def test_compressed(self):
        # Only the leading magic bytes matter; the rest need not be valid gzip.
        payload = GZIP_MAGIC + b" is compressed"
        with TemporaryFile() as handle:
            handle.write(payload)
            handle.seek(0)
            detected = is_compressed(handle)

        assert detected

0 comments on commit d8e0b5f

Please sign in to comment.