Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DM-42522: Implement exists() and size() for S3 presigned URLs #77

Merged
merged 3 commits into from
Jan 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/changes/DM-42522.bugfix.rst
@@ -0,0 +1 @@
``HttpResourcePath.exists()`` and ``HttpResourcePath.size()`` now work for S3 HTTP URLs pre-signed for GET.
78 changes: 66 additions & 12 deletions python/lsst/resources/_resourceHandles/_httpResourceHandle.py
Expand Up @@ -14,9 +14,10 @@
__all__ = ("HttpReadResourceHandle",)

import io
import re
from collections.abc import Callable, Iterable
from logging import Logger
from typing import AnyStr
from typing import AnyStr, NamedTuple

import requests
from lsst.utils.timer import time_this
Expand Down Expand Up @@ -203,17 +204,13 @@ def read(self, size: int = -1) -> bytes:
# in the file and also the current position we have got to in the
# server.
if "Content-Range" in resp.headers:
content_range = resp.headers["Content-Range"]
units, range_string = content_range.split(" ")
if units == "bytes":
range, total = range_string.split("/")
if "-" in range:
_, end = range.split("-")
end_pos = int(end)
if total != "*" and end_pos >= int(total) - 1:
self._eof = True
else:
self._log.warning("Requested byte range from server but instead got: %s", content_range)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's a slight behavior change here. Previously if the units were anything but bytes it would only log a warning. Now it throws an exception. This should never happen in real life, and if it does the returned data is almost certainly wrong so we want to bail anyway.

content_range = parse_content_range_header(resp.headers["Content-Range"])
if (
content_range.total is not None
and content_range.range_end is not None
and content_range.range_end >= content_range.total - 1
):
self._eof = True

# Try to guess that we overran the end. This will not help if we
# read exactly the number of bytes to get us to the end and so we
Expand All @@ -223,3 +220,60 @@ def read(self, size: int = -1) -> bytes:

self._current_position += len_content
return resp.content


class ContentRange(NamedTuple):
"""Represents the data in an HTTP Content-Range header."""

range_start: int | None
"""First byte of the zero-indexed, inclusive range returned by this
response. `None` if the range was not available in the header.
"""
range_end: int | None
"""Last byte of the zero-indexed, inclusive range returned by this
response. `None` if the range was not available in the header.
"""
total: int | None
"""Total size of the file in bytes. `None` if the file size was not
available in the header.
"""


def parse_content_range_header(header: str) -> ContentRange:
"""Parse an HTTP 'Content-Range' header.

Parameters
----------
header : `str`
Value of an HTTP Content-Range header to be parsed.

Returns
-------
content_range : `ContentRange`
The byte range included in the response and the total file size.

Raises
------
ValueError
If the header was not in the expected format.
"""
# There are three possible formats for Content-Range. All of them start
# with optional whitespace and a unit, which for our purposes should always
# be "bytes".
prefix = r"^\s*bytes\s+"

# Content-Range: <unit> <range-start>-<range-end>/<size>
if (case1 := re.match(prefix + r"(\d+)-(\d+)/(\d+)", header)) is not None:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hopefully three separate regexes won't turn into a hotspot.

return ContentRange(
range_start=int(case1.group(1)), range_end=int(case1.group(2)), total=int(case1.group(3))
)

# Content-Range: <unit> <range-start>-<range-end>/*
if (case2 := re.match(prefix + r"(\d+)-(\d+)/\*", header)) is not None:
return ContentRange(range_start=int(case2.group(1)), range_end=int(case2.group(2)), total=None)

# Content-Range: <unit> */<size>
if (case3 := re.match(prefix + r"\*/(\d+)", header)) is not None:
return ContentRange(range_start=None, range_end=None, total=int(case3.group(1)))

raise ValueError(f"Content-Range header in unexpected format: '{header}'")
74 changes: 66 additions & 8 deletions python/lsst/resources/http.py
Expand Up @@ -34,6 +34,8 @@
except ImportError:
import xml.etree.ElementTree as eTree

from urllib.parse import parse_qs

import requests
from astropy import units as u
from lsst.utils.timer import time_this
Expand All @@ -42,7 +44,7 @@
from urllib3.util.retry import Retry

from ._resourceHandles import ResourceHandleProtocol
from ._resourceHandles._httpResourceHandle import HttpReadResourceHandle
from ._resourceHandles._httpResourceHandle import HttpReadResourceHandle, parse_content_range_header
from ._resourcePath import ResourcePath

if TYPE_CHECKING:
Expand Down Expand Up @@ -792,10 +794,8 @@
# request, even if the behavior for such a request against a
# directory is not specified, so it depends on the server
# implementation.
resp = self.metadata_session.head(
self.geturl(), timeout=self._config.timeout, allow_redirects=True, stream=False
)
return resp.status_code == requests.codes.ok # 200
resp = self._head_non_webdav_url()
return self._is_successful_non_webdav_head_request(resp)

# The remote endpoint is a webDAV server: send a PROPFIND request
# to determine if it exists.
Expand All @@ -814,16 +814,31 @@
if not self.is_webdav_endpoint:
# The remote is a plain HTTP server. Send a HEAD request to
# retrieve the size of the resource.
resp = self.metadata_session.head(
self.geturl(), timeout=self._config.timeout, allow_redirects=True, stream=False
)
resp = self._head_non_webdav_url()
if resp.status_code == requests.codes.ok: # 200
if "Content-Length" in resp.headers:
return int(resp.headers["Content-Length"])
else:
raise ValueError(
f"Response to HEAD request to {self} does not contain 'Content-Length' header"
)
elif resp.status_code == requests.codes.partial_content:
# 206, returned from a GET request with a Range header (used to
# emulate HEAD for presigned S3 URLs). In this case
# Content-Length is the length of the Range and not the full
# length of the file, so we have to parse Content-Range
# instead.
content_range_header = resp.headers.get("Content-Range")
if content_range_header is None:
raise ValueError(

Check warning on line 833 in python/lsst/resources/http.py

View check run for this annotation

Codecov / codecov/patch

python/lsst/resources/http.py#L833

Added line #L833 was not covered by tests
f"Response to GET request to {self} did not contain 'Content-Range' header"
)
content_range = parse_content_range_header(content_range_header)
size = content_range.total
if size is None:
raise ValueError(f"Content-Range header for {self} did not include a total file size")

Check warning on line 839 in python/lsst/resources/http.py

View check run for this annotation

Codecov / codecov/patch

python/lsst/resources/http.py#L839

Added line #L839 was not covered by tests
return size

elif resp.status_code == requests.codes.not_found:
raise FileNotFoundError(
f"Resource {self} does not exist, status: {resp.status_code} {resp.reason}"
Expand Down Expand Up @@ -852,6 +867,49 @@
f"Resource {self} does not exist, status: {resp.status_code} {resp.reason}"
)

def _head_non_webdav_url(self) -> requests.Response:
"""Return a response from a HTTP HEAD request for a non-WebDAV HTTP
URL.

Emulates HEAD using a 0-byte GET for presigned S3 URLs.
"""
if self._looks_like_presigned_s3_url():
# Presigned S3 URLs are signed for a single method only, so you
# can't call HEAD on a URL signed for GET. However, S3 does
# support Range requests, so you can ask for a 0-byte range with
# GET for a similar effect to HEAD.
#
# Note that some headers differ between a true HEAD request and the
# response returned by this GET, e.g. Content-Length will always be
# 0, and the status code is 206 instead of 200.
return self.metadata_session.get(
self.geturl(),
timeout=self._config.timeout,
allow_redirects=True,
stream=False,
headers={"Range": "bytes=0-0"},
)
else:
return self.metadata_session.head(

Check warning on line 893 in python/lsst/resources/http.py

View check run for this annotation

Codecov / codecov/patch

python/lsst/resources/http.py#L893

Added line #L893 was not covered by tests
self.geturl(), timeout=self._config.timeout, allow_redirects=True, stream=False
)

def _is_successful_non_webdav_head_request(self, resp: requests.Response) -> bool:
"""Return `True` if the status code in the response indicates a
successful HEAD or GET request.
"""
return resp.status_code in (
requests.codes.ok, # 200, from a normal HEAD or GET request
requests.codes.partial_content, # 206, returned from a GET request with a Range header.
)

def _looks_like_presigned_s3_url(self) -> bool:
"""Return `True` if this ResourcePath's URL is likely to be a presigned
S3 URL.
"""
query_params = parse_qs(self._uri.query)
return "Signature" in query_params and "Expires" in query_params

def mkdir(self) -> None:
"""Create the directory resource if it does not already exist."""
# Creating directories is only available on WebDAV back ends.
Expand Down
46 changes: 45 additions & 1 deletion tests/test_http.py
Expand Up @@ -36,7 +36,10 @@
import requests
import responses
from lsst.resources import ResourcePath
from lsst.resources._resourceHandles._httpResourceHandle import HttpReadResourceHandle
from lsst.resources._resourceHandles._httpResourceHandle import (
HttpReadResourceHandle,
parse_content_range_header,
)
from lsst.resources.http import (
BearerTokenAuth,
HttpResourcePathConfig,
Expand Down Expand Up @@ -880,5 +883,46 @@ def test_sessions(self):
self.assertEqual(session, store.get(ResourcePath(u)))


class TestContentRange(unittest.TestCase):
"""Test parsing of Content-Range header."""

def test_full_data(self):
parsed = parse_content_range_header("bytes 123-2555/12345")
self.assertEqual(parsed.range_start, 123)
self.assertEqual(parsed.range_end, 2555)
self.assertEqual(parsed.total, 12345)

parsed = parse_content_range_header(" bytes 0-0/5 ")
self.assertEqual(parsed.range_start, 0)
self.assertEqual(parsed.range_end, 0)
self.assertEqual(parsed.total, 5)

def test_empty_total(self):
parsed = parse_content_range_header("bytes 123-2555/*")
self.assertEqual(parsed.range_start, 123)
self.assertEqual(parsed.range_end, 2555)
self.assertIsNone(parsed.total)

parsed = parse_content_range_header(" bytes 0-0/* ")
self.assertEqual(parsed.range_start, 0)
self.assertEqual(parsed.range_end, 0)
self.assertIsNone(parsed.total)

def test_empty_range(self):
parsed = parse_content_range_header("bytes */12345")
self.assertIsNone(parsed.range_start)
self.assertIsNone(parsed.range_end)
self.assertEqual(parsed.total, 12345)

parsed = parse_content_range_header(" bytes */5 ")
self.assertIsNone(parsed.range_start)
self.assertIsNone(parsed.range_end)
self.assertEqual(parsed.total, 5)

def test_invalid_input(self):
with self.assertRaises(ValueError):
parse_content_range_header("pages 0-10/12")


if __name__ == "__main__":
unittest.main()
12 changes: 11 additions & 1 deletion tests/test_s3.py
Expand Up @@ -156,8 +156,18 @@ def test_url_signing(self):
# this test
test_data = b"test123"
ResourcePath(put_url).write(test_data)
retrieved = ResourcePath(get_url).read()
get_path = ResourcePath(get_url)
retrieved = get_path.read()
self.assertEqual(retrieved, test_data)
self.assertTrue(get_path.exists())
self.assertEqual(get_path.size(), len(test_data))

def test_nonexistent_presigned_url(self):
s3_path = self.root_uri.join("this-is-a-missing-file.txt")
get_url = s3_path.generate_presigned_get_url(expiration_time_seconds=3600)
# Check the HttpResourcePath implementation for presigned S3 urls.
# Nothing has been uploaded to this URL, so it shouldn't exist.
self.assertFalse(ResourcePath(get_url).exists())

def _check_presigned_url(self, url: str, expiration_time_seconds: int):
parsed = urlparse(url)
Expand Down