DM-43845: Implement default data ID for RemoteButler #1003

Merged · 4 commits · Apr 26, 2024
18 changes: 18 additions & 0 deletions python/lsst/daf/butler/_exceptions.py
@@ -31,10 +31,12 @@
"CalibrationLookupError",
"CollectionCycleError",
"CollectionTypeError",
"DataIdValueError",
"DatasetNotFoundError",
"DimensionNameError",
"DatasetTypeNotSupportedError",
"EmptyQueryResultError",
"InconsistentDataIdError",
"InvalidQueryError",
"MissingDatasetTypeError",
"MissingCollectionError",
@@ -96,6 +98,12 @@ class CollectionTypeError(CollectionError, ButlerUserError):
error_type = "collection_type"


class DataIdValueError(DataIdError, ButlerUserError):
"""Exception raised when a value specified in a data ID does not exist."""

error_type = "data_id_value"


class DatasetNotFoundError(LookupError, ButlerUserError):
"""The requested dataset could not be found."""

@@ -116,6 +124,14 @@ class DimensionValueError(ValueError, ButlerUserError):
error_type = "dimension_value"


class InconsistentDataIdError(DataIdError, ButlerUserError):
"""Exception raised when a data ID contains contradictory key-value pairs,
according to dimension relationships.
"""

error_type = "inconsistent_data_id"


class InvalidQueryError(ButlerUserError):
"""Exception raised when a query is not valid."""

@@ -185,7 +201,9 @@ class UnknownButlerUserError(ButlerUserError):
CollectionTypeError,
DimensionNameError,
DimensionValueError,
DataIdValueError,
DatasetNotFoundError,
InconsistentDataIdError,
InvalidQueryError,
MissingCollectionError,
MissingDatasetTypeError,
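Because both exceptions now also derive from `ButlerUserError`, they carry an `error_type` tag ("data_id_value", "inconsistent_data_id") that the server can serialize and `RemoteButler` can re-raise as the same type on the client. A minimal client-side sketch, assuming a hypothetical repository alias and made-up data ID values:

```python
from lsst.daf.butler import Butler
from lsst.daf.butler.registry import DataIdValueError, InconsistentDataIdError

butler = Butler("my-repo")  # hypothetical repository alias

try:
    # A nonexistent instrument raises DataIdValueError; key-value pairs that
    # contradict the dimension relationships raise InconsistentDataIdError.
    butler.registry.expandDataId(instrument="NoSuchCam", detector=0)
except DataIdValueError as err:
    print(f"unknown data ID value: {err}")
except InconsistentDataIdError as err:
    print(f"contradictory data ID: {err}")
```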
4 changes: 2 additions & 2 deletions python/lsst/daf/butler/dimensions/_records.py
@@ -110,7 +110,7 @@ def _createSimpleRecordSubclass(definition: DimensionElement) -> type[SpecificSe
if definition.temporal:
members["timespan"] = (Timespan | None, ...) # type: ignore
if definition.spatial:
members["region"] = (str, ...)
members["region"] = (str | None, ...) # type: ignore

# For the new derived class name need to convert to camel case.
# so "day_obs" -> "DayObs".
@@ -460,7 +460,7 @@ def from_simple(
# assembled.
mapping = {k: getattr(record_model, k) for k in definition.schema.names}

if "region" in mapping:
if mapping.get("region") is not None:
mapping["region"] = lsst.sphgeom.Region.decode(bytes.fromhex(mapping["region"]))
if "hash" in mapping:
mapping["hash"] = bytes.fromhex(mapping["hash"].decode())
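The two hunks above are coupled: once `region` is typed `str | None`, a serialized record may carry an explicit `None`, so `from_simple` must test the value rather than mere key presence. A minimal sketch of the guard change, using an illustrative mapping rather than a real record payload:

```python
import lsst.sphgeom

# Illustrative serialized-record mapping; a real payload has more keys.
mapping = {"id": 42, "region": None}

# Old guard (`if "region" in mapping:`) would pass None to bytes.fromhex()
# and crash; the new guard lets None regions pass through untouched.
if mapping.get("region") is not None:
    mapping["region"] = lsst.sphgeom.Region.decode(bytes.fromhex(mapping["region"]))
```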
2 changes: 2 additions & 0 deletions python/lsst/daf/butler/registry/__init__.py
@@ -29,7 +29,9 @@
# used to be part of registry.
from .._exceptions import (
CollectionTypeError,
DataIdValueError,
DimensionNameError,
InconsistentDataIdError,
MissingCollectionError,
MissingDatasetTypeError,
)
18 changes: 16 additions & 2 deletions python/lsst/daf/butler/registry/_defaults.py
@@ -94,6 +94,21 @@ def __init__(self, collections: Any = None, run: str | None = None, infer: bool
self._infer = infer
self._kwargs = kwargs

@staticmethod
def from_data_id(data_id: DataCoordinate) -> RegistryDefaults:
"""Create a RegistryDefaults object with a specified ``dataId`` value
and no default collections.

Parameters
----------
data_id : `DataCoordinate`
The default data ID value.
"""
defaults = RegistryDefaults(None, None, False)
defaults.dataId = data_id
defaults._finished = True
return defaults

def __repr__(self) -> str:
collections = f"collections={self.collections!r}" if self.collections else ""
run = f"run={self.run!r}" if self.run else ""
@@ -168,6 +183,5 @@ def finish(self, registry: Registry | SqlRegistry) -> None:
dimensions are ever included in defaults.

This attribute may not be accessed before the defaults struct is
attached to a `Registry` instance. It always satisfies both ``hasFull``
and ``hasRecords``.
attached to a `Registry` instance. It always satisfies ``hasFull``.
"""
14 changes: 1 addition & 13 deletions python/lsst/daf/butler/registry/_exceptions.py
@@ -30,9 +30,7 @@
"ArgumentError",
"CollectionExpressionError",
"ConflictingDefinitionError",
"DataIdValueError",
"DatasetTypeExpressionError",
"InconsistentDataIdError",
"MissingSpatialOverlapError",
"NoDefaultCollectionError",
"OrphanedRecordError",
@@ -42,7 +40,7 @@
"UserExpressionSyntaxError",
)

from .._exceptions_legacy import CollectionError, DataIdError, RegistryError
from .._exceptions_legacy import CollectionError, RegistryError


class ArgumentError(RegistryError):
@@ -53,16 +51,6 @@ class DatasetTypeExpressionError(RegistryError):
"""Exception raised for an incorrect dataset type expression."""


class DataIdValueError(DataIdError):
"""Exception raised when a value specified in a data ID does not exist."""


class InconsistentDataIdError(DataIdError):
"""Exception raised when a data ID contains contradictory key-value pairs,
according to dimension relationships.
"""


class CollectionExpressionError(CollectionError):
"""Exception raised for an incorrect collection expression."""

@@ -37,10 +37,10 @@
from ..._column_categorization import ColumnCategorization
from ..._column_tags import DimensionKeyColumnTag, DimensionRecordColumnTag
from ..._dataset_type import DatasetType
from ..._exceptions import DataIdValueError
from ...dimensions import DimensionGroup, DimensionRecordSet, DimensionUniverse
from ...dimensions.record_cache import DimensionRecordCache
from .._collection_type import CollectionType
from .._exceptions import DataIdValueError
from ..interfaces import CollectionRecord, Database
from ._query_backend import QueryBackend
from ._sql_query_context import SqlQueryContext
11 changes: 7 additions & 4 deletions python/lsst/daf/butler/registry/sql_registry.py
@@ -48,7 +48,12 @@
from .._dataset_association import DatasetAssociation
from .._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
from .._dataset_type import DatasetType
from .._exceptions import CalibrationLookupError, DimensionNameError
from .._exceptions import (
CalibrationLookupError,
DataIdValueError,
DimensionNameError,
InconsistentDataIdError,
)
from .._named import NamedKeyMapping, NameLookupMapping
from .._storage_class import StorageClassFactory
from .._timespan import Timespan
@@ -72,9 +77,7 @@
CollectionType,
CollectionTypeError,
ConflictingDefinitionError,
DataIdValueError,
DatasetTypeError,
InconsistentDataIdError,
MissingDatasetTypeError,
NoDefaultCollectionError,
OrphanedRecordError,
@@ -1589,7 +1592,7 @@ def expandDataId(
)
if element.defines_relationships:
raise InconsistentDataIdError(
f"Could not fetch record for element {element_name} via keys {keys}, ",
f"Could not fetch record for element {element_name} via keys {keys}, "
"but it is marked as defining relationships; this means one or more dimensions are "
"have inconsistent values.",
)
10 changes: 7 additions & 3 deletions python/lsst/daf/butler/registry/tests/_registry.py
@@ -58,7 +58,13 @@
from ..._dataset_association import DatasetAssociation
from ..._dataset_ref import DatasetIdFactory, DatasetIdGenEnum, DatasetRef
from ..._dataset_type import DatasetType
from ..._exceptions import CollectionTypeError, MissingCollectionError, MissingDatasetTypeError
from ..._exceptions import (
CollectionTypeError,
DataIdValueError,
InconsistentDataIdError,
MissingCollectionError,
MissingDatasetTypeError,
)
from ..._exceptions_legacy import DatasetTypeError
from ..._storage_class import StorageClass
from ..._timespan import Timespan
@@ -70,9 +76,7 @@
ArgumentError,
CollectionError,
ConflictingDefinitionError,
DataIdValueError,
DatasetTypeExpressionError,
InconsistentDataIdError,
NoDefaultCollectionError,
OrphanedRecordError,
)
25 changes: 21 additions & 4 deletions python/lsst/daf/butler/remote_butler/_registry.py
@@ -61,7 +61,8 @@
from ..registry.queries import DataCoordinateQueryResults, DatasetQueryResults, DimensionRecordQueryResults
from ..remote_butler import RemoteButler
from ._collection_args import convert_collection_arg_to_glob_string_list
from .server_models import QueryCollectionsRequestModel
from ._http_connection import RemoteButlerHttpConnection, parse_model
from .server_models import ExpandDataIdRequestModel, ExpandDataIdResponseModel, QueryCollectionsRequestModel


class RemoteButlerRegistry(Registry):
@@ -71,10 +72,13 @@ class RemoteButlerRegistry(Registry):
----------
butler : `RemoteButler`
Butler instance to which this registry delegates operations.
connection : `RemoteButlerHttpConnection`
HTTP connection to the Butler server, used for data lookups.
"""

def __init__(self, butler: RemoteButler):
def __init__(self, butler: RemoteButler, connection: RemoteButlerHttpConnection):
self._butler = butler
self._connection = connection

def isWriteable(self) -> bool:
return self._butler.isWriteable()
@@ -244,8 +248,21 @@ def expandDataId(
withDefaults: bool = True,
**kwargs: Any,
) -> DataCoordinate:
# TODO DM-43845: Replace this stub with a real implementation.
return DataCoordinate.makeEmpty(self.dimensions)
standardized = DataCoordinate.standardize(
dataId,
graph=graph,
dimensions=dimensions,
universe=self.dimensions,
defaults=self.defaults.dataId if withDefaults else None,
**kwargs,
)
if standardized.hasRecords():
return standardized

request = ExpandDataIdRequestModel(data_id=standardized.to_simple().dataId)
response = self._connection.post("expand_data_id", request)
model = parse_model(response, ExpandDataIdResponseModel)
return DataCoordinate.from_simple(model.data_coordinate, self.dimensions)

def insertDimensionData(
self,
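The new implementation only round-trips to the server when necessary: `DataCoordinate.standardize` folds in `**kwargs` and, when `withDefaults` is set, the client-side default data ID, and a coordinate that already satisfies `hasRecords()` is returned as-is. Roughly, from the caller's perspective (values are made up and `registry` is assumed in scope):

```python
# First call: dimension records are fetched via the expand_data_id endpoint.
expanded = registry.expandDataId(instrument="HSC", detector=50)
assert expanded.hasRecords()

# Passing the already-expanded coordinate back in should short-circuit on
# the hasRecords() check, so no second HTTP request is issued.
same = registry.expandDataId(expanded)
```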
18 changes: 16 additions & 2 deletions python/lsst/daf/butler/remote_butler/_remote_butler.py
@@ -54,7 +54,7 @@
from .._storage_class import StorageClass, StorageClassFactory
from .._utilities.locked_object import LockedObject
from ..datastore import DatasetRefURIs
from ..dimensions import DataIdValue, DimensionConfig, DimensionUniverse
from ..dimensions import DataIdValue, DimensionConfig, DimensionUniverse, SerializedDataId
from ..queries import Query
from ..registry import (
CollectionArgType,
@@ -136,7 +136,7 @@ def __new__(
# Avoid a circular import by deferring this import.
from ._registry import RemoteButlerRegistry

self._registry = RemoteButlerRegistry(self)
self._registry = RemoteButlerRegistry(self, self._connection)

self._registry_defaults = RegistryDefaults(
options.collections, options.run, options.inferDefaults, **options.kwargs
@@ -274,6 +274,7 @@ def _get_file_info(
dataset_type_name=normalize_dataset_type_name(datasetRefOrType),
collections=self._normalize_collections(collections),
data_id=simplify_dataId(dataId, kwargs),
default_data_id=self._serialize_default_data_id(),
timespan=timespan,
)
response = self._connection.post("get_file_by_data_id", request)
@@ -361,6 +362,7 @@ def find_dataset(

query = FindDatasetRequestModel(
data_id=simplify_dataId(data_id, kwargs),
default_data_id=self._serialize_default_data_id(),
collections=self._normalize_collections(collections),
timespan=timespan,
dimension_records=dimension_records,
@@ -599,6 +601,18 @@ def _query_collections(self, query: QueryCollectionsRequestModel) -> QueryCollec
response = self._connection.post("query_collections", query)
return parse_model(response, QueryCollectionsResponseModel)

def _serialize_default_data_id(self) -> SerializedDataId:
"""Convert the default data ID to a serializable format."""
# In an ideal world, the default data ID would just get combined with
# the rest of the data ID on the client side instead of being sent
# separately to the server. Unfortunately, that requires knowledge of
# the DatasetType's dimensions, which we don't always have available on
# the client. Data ID values can be specified indirectly via "implied"
# dimensions, and which dimensions count as implied depends on which
# ones are required.

return self._registry_defaults.dataId.to_simple(minimal=True).dataId


def _to_file_payload(get_file_response: GetFileResponseModel) -> FileDatastoreGetPayload:
if get_file_response.artifact is None:
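The comment in `_serialize_default_data_id` is the crux of the design: only the server knows a dataset type's required-versus-implied dimensions, so the client cannot merge the default data ID itself and instead ships it alongside each request. The wire format is just the minimal key-value mapping; a sketch with an illustrative value:

```python
from lsst.daf.butler import DataCoordinate

# `universe` is assumed to be a DimensionUniverse; "HSC" is illustrative.
data_id = DataCoordinate.standardize({"instrument": "HSC"}, universe=universe)

serialized = data_id.to_simple(minimal=True).dataId
# A plain JSON-compatible mapping, e.g. {"instrument": "HSC"}, attached to
# request models as `default_data_id`.
```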
23 changes: 23 additions & 0 deletions python/lsst/daf/butler/remote_butler/server/handlers/_external.py
@@ -36,6 +36,8 @@
from lsst.daf.butler import Butler, CollectionType, DatasetRef, SerializedDatasetRef, SerializedDatasetType
from lsst.daf.butler.registry.interfaces import ChainedCollectionRecord
from lsst.daf.butler.remote_butler.server_models import (
ExpandDataIdRequestModel,
ExpandDataIdResponseModel,
FindDatasetRequestModel,
FindDatasetResponseModel,
GetCollectionInfoResponseModel,
@@ -47,6 +49,8 @@
)

from ...._exceptions import DatasetNotFoundError
from ....dimensions import DataCoordinate, SerializedDataId
from ....registry import RegistryDefaults
from .._dependencies import factory_dependency
from .._factory import Factory

@@ -139,6 +143,7 @@ def find_dataset(
factory: Factory = Depends(factory_dependency),
) -> FindDatasetResponseModel:
butler = factory.create_butler()
_set_default_data_id(butler, query.default_data_id)
ref = butler.find_dataset(
dataset_type,
query.data_id,
@@ -151,6 +156,16 @@
return FindDatasetResponseModel(dataset_ref=serialized_ref)


@external_router.post("/v1/expand_data_id", summary="Return full dimension records for a given data ID")
def expand_data_id(
request: ExpandDataIdRequestModel,
factory: Factory = Depends(factory_dependency),
) -> ExpandDataIdResponseModel:
butler = factory.create_butler()
coordinate = butler.registry.expandDataId(request.data_id)
return ExpandDataIdResponseModel(data_coordinate=coordinate.to_simple())


@external_router.get(
"/v1/get_file/{dataset_id}",
summary="Lookup via DatasetId (UUID) the information needed to download and use the files associated"
@@ -177,6 +192,7 @@ def get_file_by_data_id(
factory: Factory = Depends(factory_dependency),
) -> GetFileResponseModel:
butler = factory.create_butler()
_set_default_data_id(butler, request.default_data_id)
ref = butler._findDatasetRef(
datasetRefOrType=request.dataset_type_name,
dataId=request.data_id,
Expand Down Expand Up @@ -241,3 +257,10 @@ def query_collections(
includeChains=request.include_chains,
)
return QueryCollectionsResponseModel(collections=collections)


def _set_default_data_id(butler: Butler, data_id: SerializedDataId) -> None:
"""Set the default data ID values used for lookups in the given Butler."""
butler.registry.defaults = RegistryDefaults.from_data_id(
DataCoordinate.standardize(data_id, universe=butler.dimensions)
)
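
On the wire, the new endpoint is an ordinary POST on the external router. Assuming the router is mounted under an `/api/butler` prefix (an assumption; the mount point is not shown in this diff) and a made-up hostname, a raw call looks roughly like:

```python
import requests  # illustrative; the real client goes through RemoteButlerHttpConnection

response = requests.post(
    "https://butler.example.org/api/butler/v1/expand_data_id",  # hypothetical URL
    json={"data_id": {"instrument": "HSC", "detector": 50}},  # illustrative data ID
)
response.raise_for_status()
# The body carries the fully expanded coordinate, including dimension
# records, under the "data_coordinate" key.
print(response.json()["data_coordinate"])
```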