Skip to content

Commit

Permalink
Send S3 presigned URLs to the client
Browse files Browse the repository at this point in the history
In the currently planned use case for Butler client/server, the server uses S3 for storing artifacts and gives clients access to them by presigning URLs.
  • Loading branch information
dhirving committed Dec 8, 2023
1 parent 925c20b commit e6639aa
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 10 deletions.
15 changes: 14 additions & 1 deletion python/lsst/daf/butler/datastores/fileDatastore.py
Original file line number Diff line number Diff line change
Expand Up @@ -1998,10 +1998,23 @@ def get(
def prepare_get_for_external_client(self, ref: DatasetRef) -> FileDatastoreGetPayload:
# Docstring inherited

# 1 hour. Chosen somewhat arbitrarily -- this is long enough that the
# client should have time to download a large file with retries if
# needed, but short enough that it will become obvious quickly that
# these URLs expire.
# From a strictly technical standpoint there is no reason this
# shouldn't be a day or more, but there seems to be a political concern
# that end users might post presigned URLs, letting people without
# access rights download the data.
url_expiration_time_seconds = 1 * 60 * 60

Check warning on line 2009 in python/lsst/daf/butler/datastores/fileDatastore.py

View check run for this annotation

Codecov / codecov/patch

python/lsst/daf/butler/datastores/fileDatastore.py#L2009

Added line #L2009 was not covered by tests

def to_file_info_payload(info: DatasetLocationInformation) -> FileDatastoreGetPayloadFileInfo:
location, file_info = info
return FileDatastoreGetPayloadFileInfo(
url=location.uri.geturl(), datastoreRecords=file_info.to_simple()
url=location.uri.generate_presigned_get_url(
expiration_time_seconds=url_expiration_time_seconds
),
datastoreRecords=file_info.to_simple(),
)

return FileDatastoreGetPayload(
Expand Down
16 changes: 8 additions & 8 deletions python/lsst/daf/butler/datastores/fileDatastoreClient.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,18 @@
generate_datastore_get_information,
get_dataset_as_python_object_from_get_info,
)
from pydantic import AnyHttpUrl


class FileDatastoreGetPayloadFileInfo(_BaseModelCompat):
"""Information required to read a single file stored in `FileDatastore`"""

# TODO DM-41879: Allowing arbitrary URLs here is a severe security issue,
# since it allows the server to trick the client into fetching data from
# any file on its local filesystem or from remote storage using credentials
# lying around in the environment. This should be restricted to only
# HTTP, but we don't yet have a means of mocking out HTTP gets in tests.
url: str
"""An absolute URL that can be used to read the file"""
# This is intentionally restricted to HTTP for security reasons. Allowing
# arbitrary URLs here would allow the server to trick the client into
# fetching data from any file on its local filesystem or from remote
# storage using credentials lying around in the environment.
url: AnyHttpUrl
"""An HTTP URL that can be used to read the file"""

datastoreRecords: SerializedStoredFileInfo
"""`FileDatastore` metadata records for this file"""
Expand Down Expand Up @@ -76,7 +76,7 @@ def get_dataset_as_python_object(
The retrieved artifact, converted to a Python object
"""
fileLocations: list[DatasetLocationInformation] = [
(Location(None, file_info.url), StoredFileInfo.from_simple(file_info.datastoreRecords))
(Location(None, str(file_info.url)), StoredFileInfo.from_simple(file_info.datastoreRecords))
for file_info in payload.file_info
]

Expand Down
6 changes: 6 additions & 0 deletions python/lsst/daf/butler/tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,12 @@ def addDataset(
datasetType : ``DatasetType``, optional
The dataset type of the added dataset. If `None`, will use the
default dataset type.
Returns
-------
datasetRef : `DatasetRef`
A reference to the added dataset.
"""
if run:
self.butler.registry.registerCollection(run, type=CollectionType.RUN)
Expand Down
2 changes: 1 addition & 1 deletion tests/test_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ def test_get(self):
def _create_corrupted_dataset(repo: MetricTestRepo) -> DatasetRef:
run = "corrupted-run"
ref = repo.addDataset({"instrument": "DummyCamComp", "visit": 423}, run=run)
uris = repo.butler.getURIs(ref, run=run)
uris = repo.butler.getURIs(ref)
oneOfTheComponents = list(uris.componentURIs.values())[0]
oneOfTheComponents.write("corrupted data")
return ref

Check warning on line 251 in tests/test_server.py

View check run for this annotation

Codecov / codecov/patch

tests/test_server.py#L246-L251

Added lines #L246 - L251 were not covered by tests
Expand Down

0 comments on commit e6639aa

Please sign in to comment.