Skip to content

Commit

Permalink
Use shared function for Content-Range parsing
Browse files Browse the repository at this point in the history
It turns out that HttpResourceHandle was also doing some parsing of the content-range header, so pull out a function that will work for both this and HttpResourcePath.size().
  • Loading branch information
dhirving committed Jan 17, 2024
1 parent 2d7dd50 commit 7fb06ee
Show file tree
Hide file tree
Showing 3 changed files with 113 additions and 45 deletions.
78 changes: 66 additions & 12 deletions python/lsst/resources/_resourceHandles/_httpResourceHandle.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,10 @@
__all__ = ("HttpReadResourceHandle",)

import io
import re
from collections.abc import Callable, Iterable
from logging import Logger
from typing import AnyStr
from typing import AnyStr, NamedTuple

import requests
from lsst.utils.timer import time_this
Expand Down Expand Up @@ -203,17 +204,13 @@ def read(self, size: int = -1) -> bytes:
# in the file and also the current position we have got to in the
# server.
if "Content-Range" in resp.headers:
content_range = resp.headers["Content-Range"]
units, range_string = content_range.split(" ")
if units == "bytes":
range, total = range_string.split("/")
if "-" in range:
_, end = range.split("-")
end_pos = int(end)
if total != "*" and end_pos >= int(total) - 1:
self._eof = True
else:
self._log.warning("Requested byte range from server but instead got: %s", content_range)
content_range = parse_content_range_header(resp.headers["Content-Range"])
if (
content_range.total is not None
and content_range.range_end is not None
and content_range.range_end >= content_range.total - 1
):
self._eof = True

# Try to guess that we overran the end. This will not help if we
# read exactly the number of bytes to get us to the end and so we
Expand All @@ -223,3 +220,60 @@ def read(self, size: int = -1) -> bytes:

self._current_position += len_content
return resp.content


class ContentRange(NamedTuple):
"""Represents the data in an HTTP Content-Range header."""

range_start: int | None
"""First byte of the zero-indexed, inclusive range returned by this
response `None` if the range was not available in the header.
"""
range_end: int | None
"""Last byte of the zero-indexed, inclusive range returned by this
response. `None` if the range was not available in the header.
"""
total: int | None
"""Total size of the file in bytes. `None` if the file size was not
available in the header.
"""


def parse_content_range_header(header: str) -> ContentRange:
"""Parse an HTTP 'Content-Range' header.
Parameters
----------
header : `str`
Value of an HTTP Content-Range header to be parsed.
Returns
-------
content_range : `ContentRange`
The byte range included in the response and the total file size.
Raises
------
ValueError
If the header was not in the expected format.
"""
# There are three possible formats for Content-Range. All of them start
# with optional whitespace and a unit, which for our purposes should always
# be "bytes".
prefix = r"^\s*bytes\s+"

# Content-Range: <unit> <range-start>-<range-end>/<size>
if (case1 := re.match(prefix + r"(\d+)-(\d+)/(\d+)", header)) is not None:
return ContentRange(
range_start=int(case1.group(1)), range_end=int(case1.group(2)), total=int(case1.group(3))
)

# Content-Range: <unit> <range-start>-<range-end>/*
if (case2 := re.match(prefix + r"(\d+)-(\d+)/\*", header)) is not None:
return ContentRange(range_start=int(case2.group(1)), range_end=int(case2.group(2)), total=None)

# Content-Range: <unit> */<size>
if (case3 := re.match(prefix + r"\*/(\d+)", header)) is not None:
return ContentRange(range_start=None, range_end=None, total=int(case3.group(1)))

raise ValueError(f"Content-Range header in unexpected format: '{header}'")
32 changes: 8 additions & 24 deletions python/lsst/resources/http.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
from urllib3.util.retry import Retry

from ._resourceHandles import ResourceHandleProtocol
from ._resourceHandles._httpResourceHandle import HttpReadResourceHandle
from ._resourceHandles._httpResourceHandle import HttpReadResourceHandle, parse_content_range_header
from ._resourcePath import ResourcePath

if TYPE_CHECKING:
Expand Down Expand Up @@ -828,12 +828,16 @@ def size(self) -> int:
# Content-Length is the length of the Range and not the full
# length of the file, so we have to parse Content-Range
# instead.
content_range = resp.headers.get("Content-Range")
if not content_range:
content_range_header = resp.headers.get("Content-Range")
if content_range_header is None:
raise ValueError(

Check warning on line 833 in python/lsst/resources/http.py

View check run for this annotation

Codecov / codecov/patch

python/lsst/resources/http.py#L833

Added line #L833 was not covered by tests
f"Response to GET request to {self} did not contain 'Content-Range' header"
)
return _get_size_from_content_range_header(content_range)
content_range = parse_content_range_header(content_range_header)
size = content_range.total
if size is None:
raise ValueError(f"Content-Range header for {self} did not include a total file size")

Check warning on line 839 in python/lsst/resources/http.py

View check run for this annotation

Codecov / codecov/patch

python/lsst/resources/http.py#L839

Added line #L839 was not covered by tests
return size

elif resp.status_code == requests.codes.not_found:
raise FileNotFoundError(
Expand Down Expand Up @@ -1766,23 +1770,3 @@ def name(self) -> str:
@property
def href(self) -> str:
return self._href


def _get_size_from_content_range_header(header: str) -> int:
"""Parse an HTTP 'Content-Range' header to determine the total file size.
Returns the size, or throws `ValueError` if the size can not be
determined.
"""
# The following three formats are allowed for Content-Range, where "<size>"
# is the total file size:
# Content-Range: <unit> <range-start>-<range-end>/<size>
# Content-Range: <unit> */<size>
# Content-Range: <unit> <range-start>-<range-end>/*
#
# Only the first two cases are handled by this regex.
# The last case with a '*' for the size is considered an error, because it
# means the size is unknown.
match = re.match(r"^\s*bytes\s+(?:\*|(?:\d+-\d+))/(\d+)", header)
if match is None:
raise ValueError(f"Content-Range header in unexpected format: '{header}'")
return int(match.group(1))
48 changes: 39 additions & 9 deletions tests/test_http.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,14 @@
import requests
import responses
from lsst.resources import ResourcePath
from lsst.resources._resourceHandles._httpResourceHandle import HttpReadResourceHandle
from lsst.resources._resourceHandles._httpResourceHandle import (
HttpReadResourceHandle,
parse_content_range_header,
)
from lsst.resources.http import (
BearerTokenAuth,
HttpResourcePathConfig,
SessionStore,
_get_size_from_content_range_header,
_is_protected,
_is_webdav_endpoint,
)
Expand Down Expand Up @@ -884,14 +886,42 @@ def test_sessions(self):
class TestContentRange(unittest.TestCase):
"""Test parsing of Content-Range header."""

def test_content_range_header_parsing(self):
self.assertEqual(_get_size_from_content_range_header("bytes 123-2555/12345"), 12345)
self.assertEqual(_get_size_from_content_range_header(" bytes 0-0/23456 "), 23456)
self.assertEqual(_get_size_from_content_range_header("bytes */5"), 5)
with self.assertRaises(ValueError):
_get_size_from_content_range_header("bytes 0-10/*")
def test_full_data(self):
parsed = parse_content_range_header("bytes 123-2555/12345")
self.assertEqual(parsed.range_start, 123)
self.assertEqual(parsed.range_end, 2555)
self.assertEqual(parsed.total, 12345)

parsed = parse_content_range_header(" bytes 0-0/5 ")
self.assertEqual(parsed.range_start, 0)
self.assertEqual(parsed.range_end, 0)
self.assertEqual(parsed.total, 5)

def test_empty_total(self):
parsed = parse_content_range_header("bytes 123-2555/*")
self.assertEqual(parsed.range_start, 123)
self.assertEqual(parsed.range_end, 2555)
self.assertIsNone(parsed.total)

parsed = parse_content_range_header(" bytes 0-0/* ")
self.assertEqual(parsed.range_start, 0)
self.assertEqual(parsed.range_end, 0)
self.assertIsNone(parsed.total)

def test_empty_range(self):
parsed = parse_content_range_header("bytes */12345")
self.assertIsNone(parsed.range_start)
self.assertIsNone(parsed.range_end)
self.assertEqual(parsed.total, 12345)

parsed = parse_content_range_header(" bytes */5 ")
self.assertIsNone(parsed.range_start)
self.assertIsNone(parsed.range_end)
self.assertEqual(parsed.total, 5)

def test_invalid_input(self):
with self.assertRaises(ValueError):
_get_size_from_content_range_header("pages 0-10/12")
parse_content_range_header("pages 0-10/12")


if __name__ == "__main__":
Expand Down

0 comments on commit 7fb06ee

Please sign in to comment.