Skip to content

Commit

Permalink
Send S3 presigned URLs to the client
Browse files Browse the repository at this point in the history
In the currently planned use case for Butler client/server, the server uses S3 for storing artifacts and gives clients access to them by presigning URLs.
  • Loading branch information
dhirving committed Dec 5, 2023
1 parent 4158d79 commit 59d1e8b
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 9 deletions.
11 changes: 10 additions & 1 deletion python/lsst/daf/butler/datastores/fileDatastore.py
Original file line number Diff line number Diff line change
Expand Up @@ -1998,10 +1998,19 @@ def get(
def prepare_get_for_external_client(self, ref: DatasetRef) -> FileDatastoreGetPayload:
# Docstring inherited

# 8 hours. Chosen somewhat arbitrarily -- this is long enough that the
# client should have time to download a large file with retries if
# needed, but short enough that it will become obvious quickly that
# these URLs expire
url_expiration_time_seconds = 8 * 60 * 60

def to_file_info_payload(info: DatasetLocationInformation) -> FileDatastoreGetPayloadFileInfo:
location, file_info = info
return FileDatastoreGetPayloadFileInfo(
url=location.uri.geturl(), datastoreRecords=file_info.to_simple()
url=location.uri.generate_presigned_get_url(
expiration_time_seconds=url_expiration_time_seconds
),
datastoreRecords=file_info.to_simple(),
)

return FileDatastoreGetPayload(
Expand Down
16 changes: 8 additions & 8 deletions python/lsst/daf/butler/datastores/fileDatastoreClient.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,18 @@
generate_datastore_get_information,
get_dataset_as_python_object_from_get_info,
)
from pydantic import AnyHttpUrl


class FileDatastoreGetPayloadFileInfo(_BaseModelCompat):
"""Information required to read a single file stored in `FileDatastore`"""

# TODO DM-41879: Allowing arbitrary URLs here is a severe security issue,
# since it allows the server to trick the client into fetching data from
# any file on its local filesystem or from remote storage using credentials
# laying around in the environment. This should be restricted to only
# HTTP, but we don't yet have a means of mocking out HTTP gets in tests.
url: str
"""An absolute URL that can be used to read the file"""
# This is intentionally restricted to HTTP for security reasons. Allowing
# arbitrary URLs here would allow the server to trick the client into
# fetching data from any file on its local filesystem or from remote
# storage using credentials laying around in the environment.
url: AnyHttpUrl
"""An HTTP URL that can be used to read the file"""

datastoreRecords: SerializedStoredFileInfo
"""`FileDatastore` metadata records for this file"""
Expand Down Expand Up @@ -75,7 +75,7 @@ def get_dataset_as_python_object(
The retrieved artifact, converted to a Python object
"""
fileLocations: list[DatasetLocationInformation] = [
(Location(None, file_info.url), StoredFileInfo.from_simple(file_info.datastoreRecords))
(Location(None, str(file_info.url)), StoredFileInfo.from_simple(file_info.datastoreRecords))
for file_info in payload.file_info
]

Expand Down

0 comments on commit 59d1e8b

Please sign in to comment.