DM-42187: Add RemoteButler.getURIs #924

Merged · 3 commits · Dec 19, 2023
2 changes: 1 addition & 1 deletion .github/workflows/build_docs.yaml
@@ -49,7 +49,7 @@ jobs:
run: sudo apt-get install graphviz

- name: Install documenteer
run: pip install 'documenteer[pipelines]>=0.8'
run: pip install 'documenteer[pipelines]>=0.8,<1.0'

- name: Build documentation
working-directory: ./doc
12 changes: 10 additions & 2 deletions python/lsst/daf/butler/_butler.py
@@ -746,7 +746,6 @@ def getURIs(
"""
raise NotImplementedError()

@abstractmethod
def getURI(
self,
datasetRefOrType: DatasetRef | DatasetType | str,
@@ -808,7 +807,16 @@ def getURI(
Raised if a URI is requested for a dataset that consists of
multiple artifacts.
"""
raise NotImplementedError()
primary, components = self.getURIs(
datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs
)

if primary is None or components:
raise RuntimeError(
f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
"Use Butler.getURIs() instead."
)
return primary

@abstractmethod
def get_dataset_type(self, name: str) -> DatasetType:
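
Since the abstract ``getURI`` is now implemented in the base class in terms of ``getURIs``, every Butler implementation gets consistent behavior for both accessors. A short usage sketch (the repo path and construction details are hypothetical; the dataset names and data ID are the ones used in the tests below):

from lsst.daf.butler import Butler

butler = Butler.from_config("my-repo", collections=["ingest/run"])  # hypothetical repo

# Single-artifact dataset: getURI returns the lone ResourcePath.
uri = butler.getURI("test_int", dataId={"instrument": "DummyCamComp", "visit": 423})

# Disassembled composite: getURI raises RuntimeError, so call getURIs and
# inspect the per-component URIs instead (primaryURI will be None).
uris = butler.getURIs("test_metric_comp", dataId={"instrument": "DummyCamComp", "visit": 423})
for component, component_uri in uris.componentURIs.items():
    print(component, component_uri)
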
72 changes: 0 additions & 72 deletions python/lsst/daf/butler/direct_butler.py
@@ -1259,78 +1259,6 @@ def getURIs(
)
return self._datastore.getURIs(ref, predict)

def getURI(
self,
datasetRefOrType: DatasetRef | DatasetType | str,
/,
dataId: DataId | None = None,
*,
predict: bool = False,
collections: Any = None,
run: str | None = None,
**kwargs: Any,
) -> ResourcePath:
"""Return the URI to the Dataset.

Parameters
----------
datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
When `DatasetRef` the `dataId` should be `None`.
Otherwise the `DatasetType` or name thereof.
dataId : `dict` or `DataCoordinate`
A `dict` of `Dimension` link name, value pairs that label the
`DatasetRef` within a Collection. When `None`, a `DatasetRef`
should be provided as the first argument.
predict : `bool`
If `True`, allow URIs to be returned of datasets that have not
been written.
collections : Any, optional
Collections to be searched, overriding ``self.collections``.
Can be any of the types supported by the ``collections`` argument
to butler construction.
run : `str`, optional
Run to use for predictions, overriding ``self.run``.
**kwargs
Additional keyword arguments used to augment or construct a
`DataCoordinate`. See `DataCoordinate.standardize`
parameters.

Returns
-------
uri : `lsst.resources.ResourcePath`
URI pointing to the Dataset within the datastore. If the
Dataset does not exist in the datastore, and if ``predict`` is
`True`, the URI will be a prediction and will include a URI
fragment "#predicted".
If the datastore does not have entities that relate well
to the concept of a URI the returned URI string will be
descriptive. The returned URI is not guaranteed to be obtainable.

Raises
------
LookupError
A URI has been requested for a dataset that does not exist and
guessing is not allowed.
ValueError
Raised if a resolved `DatasetRef` was passed as an input, but it
differs from the one found in the registry.
TypeError
Raised if no collections were provided.
RuntimeError
Raised if a URI is requested for a dataset that consists of
multiple artifacts.
"""
primary, components = self.getURIs(
datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs
)

if primary is None or components:
raise RuntimeError(
f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
"Use Butler.getURIs() instead."
)
return primary

def get_dataset_type(self, name: str) -> DatasetType:
return self._registry.getDatasetType(name)

91 changes: 55 additions & 36 deletions python/lsst/daf/butler/remote_butler/_remote_butler.py
@@ -48,6 +48,7 @@
from .._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef, SerializedDatasetRef
from .._dataset_type import DatasetType, SerializedDatasetType
from .._storage_class import StorageClass
from ..datastore import DatasetRefURIs
from ..dimensions import DataCoordinate, DataIdValue, DimensionConfig, DimensionUniverse, SerializedDataId
from ..registry import MissingDatasetTypeError, NoDefaultCollectionError, RegistryDefaults
from ..registry.wildcards import CollectionWildcard
@@ -68,13 +69,16 @@
from .._limited_butler import LimitedButler
from .._query import Query
from .._timespan import Timespan
from ..datastore import DatasetRefURIs
from ..dimensions import DataId, DimensionGroup, DimensionRecord
from ..registry import CollectionArgType, Registry
from ..transfers import RepoExportContext


_AnyPydanticModel = TypeVar("_AnyPydanticModel", bound=_BaseModelCompat)
"""Generic type variable that accepts any Pydantic model class."""
_InputCollectionList = str | Sequence[str] | None
"""The possible types of the ``collections`` parameter of most Butler methods.
"""


class RemoteButler(Butler):
@@ -269,26 +273,7 @@
**kwargs: Any,
) -> Any:
# Docstring inherited.
if isinstance(datasetRefOrType, DatasetRef):
dataset_id = datasetRefOrType.id
response = self._get(f"get_file/{dataset_id}")
if response.status_code == 404:
raise LookupError(f"Dataset not found: {datasetRefOrType}")
else:
request = GetFileByDataIdRequestModel(
dataset_type_name=self._normalize_dataset_type_name(datasetRefOrType),
collections=self._normalize_collections(collections),
data_id=self._simplify_dataId(dataId, kwargs),
)
response = self._post("get_file_by_data_id", request)
if response.status_code == 404:
raise LookupError(
f"Dataset not found with DataId: {dataId} DatasetType: {datasetRefOrType}"
f" collections: {collections}"
)

response.raise_for_status()
model = self._parse_model(response, GetFileResponseModel)
model = self._get_file_info(datasetRefOrType, dataId, collections, kwargs)

# If the caller provided a DatasetRef or DatasetType, they may have
# overridden the storage class on it. We need to respect this, if they
@@ -313,21 +298,38 @@
component=componentOverride,
)

def getURIs(
def _get_file_info(
self,
datasetRefOrType: DatasetRef | DatasetType | str,
/,
dataId: DataId | None = None,
*,
predict: bool = False,
collections: Any = None,
run: str | None = None,
**kwargs: Any,
) -> DatasetRefURIs:
# Docstring inherited.
raise NotImplementedError()
dataId: DataId | None,
collections: _InputCollectionList,
kwargs: dict[str, DataIdValue],
) -> GetFileResponseModel:
"""Send a request to the server for the file URLs and metadata
associated with a dataset.
"""
if isinstance(datasetRefOrType, DatasetRef):
dataset_id = datasetRefOrType.id
response = self._get(f"get_file/{dataset_id}")
if response.status_code == 404:
raise LookupError(f"Dataset not found: {datasetRefOrType}")
else:
request = GetFileByDataIdRequestModel(
dataset_type_name=self._normalize_dataset_type_name(datasetRefOrType),
collections=self._normalize_collections(collections),
data_id=self._simplify_dataId(dataId, kwargs),
)
response = self._post("get_file_by_data_id", request)
if response.status_code == 404:
raise LookupError(
f"Dataset not found with DataId: {dataId} DatasetType: {datasetRefOrType}"
f" collections: {collections}"
)

def getURI(
response.raise_for_status()
return self._parse_model(response, GetFileResponseModel)

def getURIs(
self,
datasetRefOrType: DatasetRef | DatasetType | str,
/,
Expand All @@ -337,9 +339,26 @@
collections: Any = None,
run: str | None = None,
**kwargs: Any,
) -> ResourcePath:
) -> DatasetRefURIs:
# Docstring inherited.
raise NotImplementedError()
if predict or run:
raise NotImplementedError("Predict mode is not supported by RemoteButler")
Review comment (Member): We need to discuss whether we can make this work. Obviously a signed URL to a resource that doesn't exist is not all that helpful, but people like to know what the filename might look like. We can tell them that in theory -- maybe for predict mode we return the unsigned S3 URI? Can punt this to another ticket.
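
As a strawman for that discussion, a minimal sketch of a predict-mode fallback. This is purely hypothetical: ``_predict_unsigned_uri`` is an assumed helper, not a real API, and the "#predicted" fragment mirrors the convention documented for ``getURI``:

from lsst.daf.butler.datastore import DatasetRefURIs
from lsst.resources import ResourcePath


def predicted_uris(butler, ref) -> DatasetRefURIs:
    # Sketch only: a signed URL for a file that does not exist yet is useless,
    # so return the unsigned location, tagged with the "#predicted" fragment
    # that DirectButler attaches to predicted URIs.
    unsigned = butler._predict_unsigned_uri(ref)  # assumed helper, not a real API
    return DatasetRefURIs(primaryURI=ResourcePath(f"{unsigned}#predicted"))
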


response = self._get_file_info(datasetRefOrType, dataId, collections, kwargs)
file_info = response.file_info
if len(file_info) == 1:
return DatasetRefURIs(primaryURI=ResourcePath(str(file_info[0].url)))
else:
components = {}
for f in file_info:
component = f.datastoreRecords.component
if component is None:
raise ValueError(
f"DatasetId {response.dataset_ref.id} has a component file"
" with no component name defined"
)
components[component] = ResourcePath(str(f.url))
return DatasetRefURIs(componentURIs=components)

def get_dataset_type(self, name: str) -> DatasetType:
# In future implementation this should directly access the cache
@@ -627,7 +646,7 @@
"""Deserialize a Pydantic model from the body of an HTTP response."""
return model.model_validate_json(response.content)

def _normalize_collections(self, collections: str | Sequence[str] | None) -> CollectionList:
def _normalize_collections(self, collections: _InputCollectionList) -> CollectionList:
"""Convert the ``collections`` parameter in the format used by Butler
methods to a standardized format for the REST API.
"""
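
For orientation, the wire protocol that ``_get_file_info`` and ``getURIs`` rely on, reconstructed from the code above. The endpoint paths and field accesses come straight from the diff; the class and field names below are assumptions, not the server's actual Pydantic definitions:

from dataclasses import dataclass

# Requests issued by _get_file_info (the server root comes from RemoteButler's
# configuration):
#   GET  <server>/get_file/<dataset_id>   -- when a resolved DatasetRef is given
#   POST <server>/get_file_by_data_id     -- body: GetFileByDataIdRequestModel
# A 404 on either is mapped to LookupError; success yields a GetFileResponseModel.


@dataclass
class DatastoreRecords:  # assumed name; only .component is read by getURIs
    component: str | None  # component name, or None for a single-file dataset


@dataclass
class FileInfo:  # assumed name; getURIs reads .url and .datastoreRecords
    url: str  # (possibly signed) URL for one stored artifact
    datastoreRecords: DatastoreRecords


@dataclass
class GetFileResponse:  # stand-in for GetFileResponseModel
    dataset_ref_id: str  # the real model exposes dataset_ref.id
    file_info: list[FileInfo]  # one entry per artifact of the dataset
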
63 changes: 59 additions & 4 deletions tests/test_server.py
@@ -53,8 +53,10 @@
NoDefaultCollectionError,
StorageClassFactory,
)
from lsst.daf.butler.tests import DatastoreMock
from lsst.daf.butler.datastore import DatasetRefURIs
from lsst.daf.butler.tests import DatastoreMock, addDatasetType
from lsst.daf.butler.tests.utils import MetricsExample, MetricTestRepo, makeTestTempDir, removeTestTempDir
from lsst.resources import ResourcePath
from lsst.resources.http import HttpResourcePath

TESTDIR = os.path.abspath(os.path.dirname(__file__))
@@ -104,9 +106,11 @@ def setUpClass(cls):
configFile=os.path.join(TESTDIR, "config/basic/butler-s3store.yaml"),
forceConfigRoot=False,
)

# Add a file with corrupted data for testing error conditions
cls.dataset_with_corrupted_data = _create_corrupted_dataset(cls.repo)
# All of the datasets that come with MetricTestRepo are disassembled
# composites. Add a simple dataset for testing the common case.
Review comment (Member): I had forgotten that MetricTestRepo defaults to storage class StructuredCompositeReadComp. Easy to change the default if we wanted to, but also nice to know that getURIs should work.
cls.simple_dataset_ref = _create_simple_dataset(cls.repo.butler)

# Override the server's Butler initialization to point at our test repo
server_butler = Butler.from_config(cls.root, writeable=True)
@@ -131,8 +135,9 @@ def create_factory_dependency():
_make_test_client(app, raise_server_exceptions=False)
)

# Populate the test server. The DatastoreMock is required because the
# datasets referenced in these imports do not point at real files
# Populate the test server.
# The DatastoreMock is required because the datasets referenced in
# these imports do not point at real files.
DatastoreMock.apply(server_butler)
server_butler.import_(filename=os.path.join(TESTDIR, "data", "registry", "base.yaml"))
server_butler.import_(filename=os.path.join(TESTDIR, "data", "registry", "datasets.yaml"))
@@ -323,6 +328,50 @@ def check_sc_override(converted):
)
self.assertEqual(dataset_type_component_data, MetricTestRepo.METRICS_EXAMPLE_SUMMARY)

def test_getURIs_no_components(self):
# This dataset does not have components, and should return one URI.
def check_uri(uri: ResourcePath):
self.assertIsNotNone(uri)
self.assertEqual(uri.scheme, "https")
self.assertEqual(uri.read(), b"123")

uris = self.butler.getURIs(self.simple_dataset_ref)
self.assertEqual(len(uris.componentURIs), 0)
check_uri(uris.primaryURI)

check_uri(self.butler.getURI(self.simple_dataset_ref))

def test_getURIs_multiple_components(self):
# This dataset has multiple components, so we should get back multiple
# URIs.
dataset_type = "test_metric_comp"
data_id = {"instrument": "DummyCamComp", "visit": 423}
collections = "ingest/run"

def check_uris(uris: DatasetRefURIs):
self.assertIsNone(uris.primaryURI)
self.assertEqual(len(uris.componentURIs), 3)
path = uris.componentURIs["summary"]
self.assertEqual(path.scheme, "https")
data = path.read()
self.assertEqual(data, b"AM1: 5.2\nAM2: 30.6\n")

uris = self.butler.getURIs(dataset_type, dataId=data_id, collections=collections)
check_uris(uris)

# Calling getURI on a multi-file dataset raises an exception
with self.assertRaises(RuntimeError):
self.butler.getURI(dataset_type, dataId=data_id, collections=collections)

# getURIs does NOT respect component overrides on the DatasetRef,
# instead returning the parent's URIs. Unclear if this is "correct"
# from a conceptual point of view, but this matches DirectButler
# behavior.
Review comment (Member): There is only one URI for any of the components if not disassembled. I imagine no one has ever tried to request the URI of an individual component by passing in the ref, but it seems like it should return the one URI of the component.
ref = self.butler.find_dataset(dataset_type, data_id=data_id, collections=collections)
componentRef = ref.makeComponentRef("summary")
componentUris = self.butler.getURIs(componentRef)
check_uris(componentUris)
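
A sketch of the alternative behavior that review comment proposes; hypothetical, not what this PR implements:

# Hypothetical: for a composite stored as one file, a component ref would
# resolve to the single artifact containing all the components.
uri = self.butler.getURI(componentRef)  # today this raises RuntimeError
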


def _create_corrupted_dataset(repo: MetricTestRepo) -> DatasetRef:
run = "corrupted-run"
Expand All @@ -333,5 +382,11 @@ def _create_corrupted_dataset(repo: MetricTestRepo) -> DatasetRef:
return ref


def _create_simple_dataset(butler: Butler) -> DatasetRef:
dataset_type = addDatasetType(butler, "test_int", {"instrument", "visit"}, "int")
ref = butler.put(123, dataset_type, dataId={"instrument": "DummyCamComp", "visit": 423})
return ref


if __name__ == "__main__":
unittest.main()