Skip to content

Commit

Permalink
Merge pull request #736 from lsst/tickets/DM-36312
Browse files Browse the repository at this point in the history
DM-36312: Deprecate support for component datasets in Registry
  • Loading branch information
TallJimbo committed Oct 5, 2022
2 parents 738e63c + c7f82e6 commit a21ee8f
Show file tree
Hide file tree
Showing 12 changed files with 164 additions and 92 deletions.
1 change: 1 addition & 0 deletions doc/changes/DM-36312.api.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Deprecate support for components in `Registry.query*` methods, per RFC-879.
20 changes: 16 additions & 4 deletions python/lsst/daf/butler/registries/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,7 +418,12 @@ def removeDatasetType(self, name: str) -> None:

def getDatasetType(self, name: str) -> DatasetType:
    # Docstring inherited from lsst.daf.butler.registry.Registry
    #
    # Accept both parent and component dataset type names.  The stale
    # pre-change lookup (`self._managers.datasets[name].datasetType`) has
    # been removed: it made the component-handling code below unreachable
    # and raised for component names such as "calexp.wcs".
    parent_name, component = DatasetType.splitDatasetTypeName(name)
    # Raises MissingDatasetTypeError (a KeyError subclass) if the parent
    # dataset type is not registered.
    storage = self._managers.datasets[parent_name]
    if component is None:
        return storage.datasetType
    # Derive the component dataset type from its registered parent.
    return storage.datasetType.makeComponentDatasetType(component)

def supportsIdGenerationMode(self, mode: DatasetIdGenEnum) -> bool:
# Docstring inherited from lsst.daf.butler.registry.Registry
Expand All @@ -435,9 +440,10 @@ def findDataset(
) -> Optional[DatasetRef]:
# Docstring inherited from lsst.daf.butler.registry.Registry
if isinstance(datasetType, DatasetType):
storage = self._managers.datasets[datasetType.name]
parent_name, component = datasetType.nameAndComponent()
else:
storage = self._managers.datasets[datasetType]
parent_name, component = DatasetType.splitDatasetTypeName(datasetType)
storage = self._managers.datasets[parent_name]
dataId = DataCoordinate.standardize(
dataId,
graph=storage.datasetType.dimensions,
Expand All @@ -460,6 +466,8 @@ def findDataset(
continue
result = storage.find(collectionRecord, dataId, timespan=timespan)
if result is not None:
if component is not None:
return result.makeComponentRef(component)
return result

return None
Expand Down Expand Up @@ -929,6 +937,10 @@ def _standardize_query_dataset_args(
parent datasets were not matched by the expression.
Fully-specified component datasets (`str` or `DatasetType`
instances) are always included.
Values other than `False` are deprecated, and only `False` will be
supported after v26. After v27 this argument will be removed
entirely.
mode : `str`, optional
The way in which datasets are being used in this query; one of:
Expand Down Expand Up @@ -1026,7 +1038,7 @@ def queryDatasets(
check=check,
datasets=[parent_dataset_type],
)
builder = self._makeQueryBuilder(summary, doomed_by=doomed_by)
builder = self._makeQueryBuilder(summary)
# Add the dataset subquery to the query, telling the QueryBuilder
# to include the rank of the selected collection in the results
# only if we need to findFirst. Note that if any of the
Expand Down
2 changes: 1 addition & 1 deletion python/lsst/daf/butler/registry/_exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ class DatasetTypeError(RegistryError):
"""Exception raised for problems with dataset types."""


class MissingDatasetTypeError(DatasetTypeError, KeyError):
    """Exception raised when a dataset type does not exist.

    Notes
    -----
    This also inherits from `KeyError` so that existing callers that catch
    `KeyError` when a dataset type lookup fails continue to work after
    registry internals switched to raising this more specific exception.
    """
Expand Down
30 changes: 27 additions & 3 deletions python/lsst/daf/butler/registry/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -639,8 +639,13 @@ def getDatasetType(self, name: str) -> DatasetType:
Raises
------
KeyError
Requested named DatasetType could not be found in registry.
MissingDatasetTypeError
Raised if the requested dataset type has not been registered.
Notes
-----
This method handles component dataset types automatically, though most
other registry operations do not.
"""
raise NotImplementedError()

Expand Down Expand Up @@ -712,7 +717,7 @@ def findDataset(
``self.defaults.collections`` is `None`.
LookupError
Raised if one or more data ID keys are missing.
KeyError
MissingDatasetTypeError
Raised if the dataset type does not exist.
MissingCollectionError
Raised if any of ``collections`` does not exist in the registry.
Expand All @@ -728,6 +733,9 @@ def findDataset(
reported consistently, regardless of the reason, and that adding
additional collections that do not contain a match to the search path
never changes the behavior.
This method handles component dataset types automatically, though most
other registry operations do not.
"""
raise NotImplementedError()

Expand Down Expand Up @@ -1213,6 +1221,10 @@ def queryDatasetTypes(
parent datasets were not matched by the expression.
Fully-specified component datasets (`str` or `DatasetType`
instances) are always included.
Values other than `False` are deprecated, and only `False` will be
supported after v26. After v27 this argument will be removed
entirely.
missing : `list` of `str`, optional
String dataset type names that were explicitly given (i.e. not
regular expression patterns) but not found will be appended to this
Expand Down Expand Up @@ -1348,6 +1360,10 @@ def queryDatasets(
if their parent datasets were not matched by the expression.
Fully-specified component datasets (`str` or `DatasetType`
instances) are always included.
Values other than `False` are deprecated, and only `False` will be
supported after v26. After v27 this argument will be removed
entirely.
bind : `Mapping`, optional
Mapping containing literal values that should be injected into the
``where`` expression, keyed by the identifiers they replace.
Expand Down Expand Up @@ -1459,6 +1475,10 @@ def queryDataIds(
if their parent datasets were not matched by the expression.
Fully-specified component datasets (`str` or `DatasetType`
instances) are always included.
Values other than `False` are deprecated, and only `False` will be
supported after v26. After v27 this argument will be removed
entirely.
bind : `Mapping`, optional
Mapping containing literal values that should be injected into the
``where`` expression, keyed by the identifiers they replace.
Expand Down Expand Up @@ -1547,6 +1567,10 @@ def queryDimensionRecords(
components : `bool`, optional
Whether to apply dataset expressions to components as well.
See `queryDataIds` for more information.
Values other than `False` are deprecated, and only `False` will be
supported after v26. After v27 this argument will be removed
entirely.
bind : `Mapping`, optional
Mapping containing literal values that should be injected into the
``where`` expression, keyed by the identifiers they replace.
Expand Down
27 changes: 18 additions & 9 deletions python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
"ByDimensionsDatasetRecordStorageManagerUUID",
)

import copy
import logging
import warnings
from collections import defaultdict
Expand Down Expand Up @@ -254,14 +253,7 @@ def remove(self, name: str) -> None:

def find(self, name: str) -> DatasetRecordStorage | None:
    # Docstring inherited from DatasetRecordStorageManager.
    #
    # Lookup is by exact (parent) dataset type name only; the old
    # component-splitting logic that copied the parent storage and swapped
    # in a component dataset type was removed here — component handling
    # now lives in the callers (e.g. ``resolve_wildcard``).  The stale
    # removed-body residue (which returned before reaching this line) has
    # been dropped.
    return self._byName.get(name)

def register(self, datasetType: DatasetType) -> tuple[DatasetRecordStorage, bool]:
# Docstring inherited from DatasetRecordStorageManager.
Expand Down Expand Up @@ -342,8 +334,18 @@ def resolve_wildcard(
) -> dict[DatasetType, list[str | None]]:
wildcard = DatasetTypeWildcard.from_expression(expression)
result: defaultdict[DatasetType, set[str | None]] = defaultdict(set)
# This message can be transformed into an error on DM-36303 after v26,
# and the components argument here (and in all callers) can be removed
# entirely on DM-36457 after v27.
deprecation_message = (
"Querying for component datasets via Registry query methods is deprecated in favor of using "
"DatasetRef and DatasetType methods on parent datasets. Only components=False will be supported "
"after v26, and the components argument will be removed after v27."
)
for name, dataset_type in wildcard.values.items():
parent_name, component_name = DatasetType.splitDatasetTypeName(name)
if component_name is not None:
warnings.warn(deprecation_message, FutureWarning)
if (found_storage := self.find(parent_name)) is not None:
found_parent = found_storage.datasetType
if component_name is not None:
Expand All @@ -366,6 +368,7 @@ def resolve_wildcard(
result[found_parent].add(component_name)
elif missing is not None:
missing.append(name)
already_warned = False
if wildcard.patterns is Ellipsis:
if explicit_only:
raise TypeError(
Expand All @@ -378,6 +381,9 @@ def resolve_wildcard(
result[storage.datasetType].update(
storage.datasetType.storageClass.allComponents().keys()
)
if storage.datasetType.storageClass.allComponents() and not already_warned:
warnings.warn(deprecation_message, FutureWarning)
already_warned = True
except KeyError as err:
_LOG.warning(
f"Could not load storage class {err} for {storage.datasetType.name}; "
Expand Down Expand Up @@ -414,6 +420,9 @@ def resolve_wildcard(
for p in wildcard.patterns
):
result[storage.datasetType].add(component_name)
if not already_warned:
warnings.warn(deprecation_message, FutureWarning)
already_warned = True
return {k: list(v) for k, v in result.items()}

def getDatasetRef(self, id: DatasetId) -> DatasetRef | None:
Expand Down
7 changes: 6 additions & 1 deletion python/lsst/daf/butler/registry/interfaces/_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import sqlalchemy.sql

from ...core import DataCoordinate, DatasetId, DatasetRef, DatasetType, SimpleQuery, Timespan, ddl
from .._exceptions import MissingDatasetTypeError
from ._versioning import VersionedExtension

if TYPE_CHECKING:
Expand Down Expand Up @@ -596,7 +597,7 @@ def __getitem__(self, name: str) -> DatasetRecordStorage:
"""
result = self.find(name)
if result is None:
raise KeyError(f"Dataset type with name '{name}' not found.")
raise MissingDatasetTypeError(f"Dataset type with name '{name}' not found.")
return result

@abstractmethod
Expand Down Expand Up @@ -680,6 +681,10 @@ def resolve_wildcard(
datasets were not matched by the expression. Fully-specified
component datasets (`str` or `DatasetType` instances) are always
included.
Values other than `False` are deprecated, and only `False` will be
supported after v26. After v27 this argument will be removed
entirely.
missing : `list` of `str`, optional
String dataset type names that were explicitly given (i.e. not
regular expression patterns) but not found will be appended to this
Expand Down
4 changes: 4 additions & 0 deletions python/lsst/daf/butler/registry/queries/_query_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,10 @@ def resolve_single_dataset_type_wildcard(
datasets were not matched by the expression. Fully-specified
component datasets (`str` or `DatasetType` instances) are always
included.
Values other than `False` are deprecated, and only `False` will be
supported after v26. After v27 this argument will be removed
entirely.
explicit_only : `bool`, optional
If `True`, require explicit `DatasetType` instances or `str` names,
with `re.Pattern` instances deprecated and ``...`` prohibited.
Expand Down
24 changes: 20 additions & 4 deletions python/lsst/daf/butler/registry/queries/_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,7 +360,12 @@ def constrain(self, query: SimpleQuery, columns: Callable[[str], sqlalchemy.sql.
)

def findDatasets(
self, datasetType: DatasetType | str, collections: Any, *, findFirst: bool = True
self,
datasetType: DatasetType | str,
collections: Any,
*,
findFirst: bool = True,
components: bool | None = None,
) -> ParentDatasetQueryResults:
"""Find datasets using the data IDs identified by this query.
Expand All @@ -380,6 +385,17 @@ def findDatasets(
dataset type appears (according to the order of ``collections``
passed in). If `True`, ``collections`` must not contain regular
expressions and may not be ``...``.
components : `bool`, optional
If `True`, apply all expression patterns to component dataset type
names as well. If `False`, never apply patterns to components. If
`None` (default), apply patterns to components only if their parent
datasets were not matched by the expression. Fully-specified
component datasets (`str` or `DatasetType` instances) are always
included.
Values other than `False` are deprecated, and only `False` will be
supported after v26. After v27 this argument will be removed
entirely.
Returns
-------
Expand All @@ -396,8 +412,8 @@ def findDatasets(
MissingDatasetTypeError
Raised if the given dataset type is not registered.
"""
parent_dataset_type, components = self._query.backend.resolve_single_dataset_type_wildcard(
datasetType, explicit_only=True
parent_dataset_type, components_found = self._query.backend.resolve_single_dataset_type_wildcard(
datasetType, components=components, explicit_only=True
)
if not parent_dataset_type.dimensions.issubset(self.graph):
raise ValueError(
Expand All @@ -415,7 +431,7 @@ def findDatasets(
return ParentDatasetQueryResults(
db=self._db,
query=query,
components=components,
components=components_found,
records=self._records,
datasetType=parent_dataset_type,
)
Expand Down

0 comments on commit a21ee8f

Please sign in to comment.