Merge pull request #651 from lsst/tickets/DM-33600
DM-33600: Improve error handling in registry query methods
andy-slac committed Feb 27, 2022
2 parents 837304b + e6d6b07 commit e5b471c
Showing 19 changed files with 559 additions and 138 deletions.
2 changes: 2 additions & 0 deletions doc/changes/DM-33600.api.rst
@@ -0,0 +1,2 @@
Registry methods now raise exceptions belonging to a class hierarchy rooted at `lsst.daf.butler.registry.RegistryError`.
See also :ref:`daf_butler_query_error_handling` for details.
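
As a minimal illustration of the hierarchy this entry describes (a sketch only; the class names are taken from elsewhere in this commit):

.. code-block:: python

    from lsst.daf.butler.registry import DataIdValueError, RegistryError

    # The specific registry exceptions share RegistryError as a common
    # base class, so catching the base covers all of them.
    assert issubclass(DataIdValueError, RegistryError)
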
2 changes: 1 addition & 1 deletion doc/changes/README.rst
@@ -20,5 +20,5 @@ An example file name would therefore look like ``DM-30291.misc.rst``.

If the change specifically concerns the registry or a datastore, the news fragment can be placed in the relevant subdirectory.

You can test how the content will be integrated into the release notes by running ``towncrier --draft --version=V.vv``.
You can test how the content will be integrated into the release notes by running ``towncrier build --draft --version=V.vv``.
``towncrier`` can be installed from PyPI or conda-forge.
25 changes: 25 additions & 0 deletions doc/lsst.daf.butler/queries.rst
@@ -410,3 +410,28 @@ Example of use of these two methods:
# Print ten latest visit records in reverse time order
for record in registry.queryDimensionRecords("visit").order_by("-timespan.begin").limit(10):
    print(record)

.. _daf_butler_query_error_handling:

Error handling with Registry methods
------------------------------------

`Registry` methods typically raise exceptions when they detect problems with input parameters.
The documentation for each method describes the exception classes it can raise and the conditions under which they are raised.
In most cases these exceptions belong to one of the special exception classes defined in the `lsst.daf.butler.registry` module, e.g. `~lsst.daf.butler.registry.DataIdError`, which have `~lsst.daf.butler.registry.RegistryError` as a common base class.
These exception classes are not exposed through the `lsst.daf.butler` module interface; to use them, they must be imported explicitly, e.g.:

.. code-block:: python

    from lsst.daf.butler.registry import DataIdError, UserExpressionError

While the class documentation should list the most commonly raised exceptions, a method may raise other exception types as well.
Code that needs to handle all failures from `Registry` methods should therefore be prepared for exceptions outside this hierarchy too.
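
As an illustrative sketch of this advice (not part of the documented API examples; the ``registry`` object and the data ID values are assumed):

.. code-block:: python

    from lsst.daf.butler.registry import DataIdError, RegistryError

    try:
        records = list(registry.queryDimensionRecords("detector", instrument="HSC"))
    except DataIdError as err:
        # A bad dimension name or data ID value in the arguments.
        print(f"Data ID problem: {err}")
    except RegistryError as err:
        # Any other exception from the registry hierarchy.
        print(f"Registry error: {err}")
    except Exception as err:
        # As noted above, some exceptions may fall outside the hierarchy.
        print(f"Unexpected error: {err}")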

A few of the `Registry` query methods (`~Registry.queryDataIds`, `~Registry.queryDatasets`, and `~Registry.queryDimensionRecords`) return result objects.
These objects are iterables over the corresponding record types, and typically they represent a non-empty result set.
In some cases these methods can return an empty result without raising an exception, for example when a combination of constraints excludes all existing records.
Result classes implement an ``explain_no_results()`` method that can be used to try to identify the reason for an empty result.
It returns a list of strings, each a human-readable message describing a possible reason for the empty result.
This method does not always work reliably and can return an empty list even when the result is empty.
In particular, it cannot analyze a user expression to identify which part of that expression is responsible for an empty result.
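
A sketch of how an empty result might be diagnosed (the dataset type and collection names are hypothetical):

.. code-block:: python

    data_ids = registry.queryDataIds(["visit"], datasets="raw", collections="HSC/raw/all")
    if not list(data_ids):
        # explain_no_results() may itself return an empty list; see the caveats above.
        for message in data_ids.explain_no_results():
            print(message)
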
16 changes: 10 additions & 6 deletions python/lsst/daf/butler/_butler.py
@@ -90,7 +90,11 @@
CollectionSearch,
CollectionType,
ConflictingDefinitionError,
DataIdError,
DataIdValueError,
DatasetIdGenEnum,
DimensionNameError,
InconsistentDataIdError,
Registry,
RegistryConfig,
RegistryDefaults,
@@ -828,7 +832,7 @@ def _rewrite_data_id(
never_found = set(not_dimensions) - matched_dims

if never_found:
raise ValueError(f"Unrecognized keyword args given: {never_found}")
raise DimensionNameError(f"Unrecognized keyword args given: {never_found}")

# There is a chance we have allocated a single dataId item
# to multiple dimensions. Need to decide which should be retained.
@@ -890,8 +894,8 @@ def _rewrite_data_id(
# Get the actual record and compare with these values.
try:
recs = list(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId))
except LookupError:
raise ValueError(
except DataIdError:
raise DataIdValueError(
f"Could not find dimension '{dimensionName}'"
f" with dataId {newDataId} as part of comparing with"
f" record values {byRecord[dimensionName]}"
@@ -902,7 +906,7 @@ def _rewrite_data_id(
if (recval := getattr(recs[0], k)) != v:
errmsg.append(f"{k}({recval} != {v})")
if errmsg:
raise ValueError(
raise InconsistentDataIdError(
f"Dimension {dimensionName} in dataId has explicit value"
" inconsistent with records: " + ", ".join(errmsg)
)
@@ -928,12 +932,12 @@ def _rewrite_data_id(
log.debug("Received %d records from constraints of %s", len(records), str(values))
for r in records:
log.debug("- %s", str(r))
raise ValueError(
raise InconsistentDataIdError(
f"DataId specification for dimension {dimensionName} is not"
f" uniquely constrained to a single dataset by {values}."
f" Got {len(records)} results."
)
raise ValueError(
raise InconsistentDataIdError(
f"DataId specification for dimension {dimensionName} matched no"
f" records when constrained by {values}"
)
6 changes: 3 additions & 3 deletions python/lsst/daf/butler/registries/remote.py
@@ -442,7 +442,7 @@ def queryCollections(
collections = response.json()
return collections

def queryDatasets(
def queryDatasets( # type: ignore
self,
datasetType: Any,
*,
@@ -488,7 +488,7 @@ def queryDatasets(
DatasetRef.from_simple(SerializedDatasetRef(**r), universe=self.dimensions) for r in simple_refs
)

def queryDataIds(
def queryDataIds( # type: ignore
self,
dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str],
*,
@@ -537,7 +537,7 @@ def queryDataIds(
dataIds=dataIds, graph=DimensionGraph(self.dimensions, names=cleaned_dimensions)
)

def queryDimensionRecords(
def queryDimensionRecords( # type: ignore
self,
element: Union[DimensionElement, str],
*,
77 changes: 52 additions & 25 deletions python/lsst/daf/butler/registries/sql.py
@@ -56,10 +56,18 @@
)
from ..core.utils import transactional
from ..registry import (
ArgumentError,
CollectionExpressionError,
CollectionSearch,
CollectionType,
CollectionTypeError,
ConflictingDefinitionError,
DataIdValueError,
DatasetTypeError,
DatasetTypeExpressionError,
DimensionNameError,
InconsistentDataIdError,
NoDefaultCollectionError,
OrphanedRecordError,
Registry,
RegistryConfig,
@@ -348,7 +356,7 @@ def getCollectionChain(self, parent: str) -> CollectionSearch:
# Docstring inherited from lsst.daf.butler.registry.Registry
record = self._managers.collections.find(parent)
if record.type is not CollectionType.CHAINED:
raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
raise CollectionTypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
assert isinstance(record, ChainedCollectionRecord)
return record.children

@@ -357,7 +365,7 @@ def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = Fals
# Docstring inherited from lsst.daf.butler.registry.Registry
record = self._managers.collections.find(parent)
if record.type is not CollectionType.CHAINED:
raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
raise CollectionTypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
assert isinstance(record, ChainedCollectionRecord)
children = CollectionSearch.fromExpression(children)
if children != record.children or flatten:
@@ -425,7 +433,7 @@ def findDataset(
)
if collections is None:
if not self.defaults.collections:
raise TypeError(
raise NoDefaultCollectionError(
"No collections provided to findDataset, and no defaults from registry construction."
)
collections = self.defaults.collections
@@ -455,20 +463,22 @@ def insertDatasets(
if isinstance(datasetType, DatasetType):
storage = self._managers.datasets.find(datasetType.name)
if storage is None:
raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
raise DatasetTypeError(f"DatasetType '{datasetType}' has not been registered.")
else:
storage = self._managers.datasets.find(datasetType)
if storage is None:
raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
raise DatasetTypeError(f"DatasetType with name '{datasetType}' has not been registered.")
if run is None:
if self.defaults.run is None:
raise TypeError(
raise NoDefaultCollectionError(
"No run provided to insertDatasets, and no default from registry construction."
)
run = self.defaults.run
runRecord = self._managers.collections.find(run)
if runRecord.type is not CollectionType.RUN:
raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.")
raise CollectionTypeError(
f"Given collection is of type {runRecord.type.name}; RUN collection required."
)
assert isinstance(runRecord, RunRecord)
progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
if expand:
@@ -510,13 +520,13 @@ def _importDatasets(
# find dataset type
datasetTypes = set(dataset.datasetType for dataset in datasets)
if len(datasetTypes) != 1:
raise ValueError(f"Multiple dataset types in input datasets: {datasetTypes}")
raise DatasetTypeError(f"Multiple dataset types in input datasets: {datasetTypes}")
datasetType = datasetTypes.pop()

# get storage handler for this dataset type
storage = self._managers.datasets.find(datasetType.name)
if storage is None:
raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
raise DatasetTypeError(f"DatasetType '{datasetType}' has not been registered.")

# find run name
runs = set(dataset.run for dataset in datasets)
@@ -525,14 +535,14 @@
run = runs.pop()
if run is None:
if self.defaults.run is None:
raise TypeError(
raise NoDefaultCollectionError(
"No run provided to ingestDatasets, and no default from registry construction."
)
run = self.defaults.run

runRecord = self._managers.collections.find(run)
if runRecord.type is not CollectionType.RUN:
raise TypeError(
raise CollectionTypeError(
f"Given collection '{runRecord.name}' is of type {runRecord.type.name};"
" RUN collection required."
)
@@ -588,7 +598,9 @@ def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
progress = Progress("lsst.daf.butler.Registry.associate", level=logging.DEBUG)
collectionRecord = self._managers.collections.find(collection)
if collectionRecord.type is not CollectionType.TAGGED:
raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.")
raise CollectionTypeError(
f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED."
)
for datasetType, refsForType in progress.iter_item_chunks(
DatasetRef.groupByType(refs).items(), desc="Associating datasets by type"
):
@@ -609,7 +621,7 @@ def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
progress = Progress("lsst.daf.butler.Registry.disassociate", level=logging.DEBUG)
collectionRecord = self._managers.collections.find(collection)
if collectionRecord.type is not CollectionType.TAGGED:
raise TypeError(
raise CollectionTypeError(
f"Collection '{collection}' has type {collectionRecord.type.name}; expected TAGGED."
)
for datasetType, refsForType in progress.iter_item_chunks(
@@ -681,9 +693,14 @@ def expandDataId(
defaults = None
else:
defaults = self.defaults.dataId
standardized = DataCoordinate.standardize(
dataId, graph=graph, universe=self.dimensions, defaults=defaults, **kwargs
)
try:
standardized = DataCoordinate.standardize(
dataId, graph=graph, universe=self.dimensions, defaults=defaults, **kwargs
)
except KeyError as exc:
# This means either a kwarg has an unrecognized name or a
# required dimension is missing.
raise DimensionNameError(str(exc)) from exc
if standardized.hasRecords():
return standardized
if records is None:
@@ -700,7 +717,9 @@
if record is ...:
if isinstance(element, Dimension) and keys.get(element.name) is None:
if element in standardized.graph.required:
raise LookupError(f"No value or null value for required dimension {element.name}.")
raise DimensionNameError(
f"No value or null value for required dimension {element.name}."
)
keys[element.name] = None
record = None
else:
@@ -724,7 +743,7 @@
)
else:
if element in standardized.graph.required:
raise LookupError(
raise DataIdValueError(
f"Could not fetch record for required dimension {element.name} via keys {keys}."
)
if element.alwaysJoin:
Expand Down Expand Up @@ -784,7 +803,10 @@ def queryDatasetTypes(
missing: Optional[List[str]] = None,
) -> Iterator[DatasetType]:
# Docstring inherited from lsst.daf.butler.registry.Registry
wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
try:
wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
except TypeError as exc:
raise DatasetTypeExpressionError(f"Invalid dataset type expression '{expression}'") from exc
unknownComponentsMessage = (
"Could not find definition for storage class %s for dataset type %r;"
" if it has components they will not be included in dataset type query results."
@@ -856,7 +878,10 @@ def queryCollections(
# Right now the datasetTypes argument is completely ignored, but that
# is consistent with its [lack of] guarantees. DM-24939 or a follow-up
# ticket will take care of that.
query = CollectionQuery.fromExpression(expression)
try:
query = CollectionQuery.fromExpression(expression)
except TypeError as exc:
raise CollectionExpressionError(f"Invalid collection expression '{expression}'") from exc
collectionTypes = ensure_iterable(collectionTypes)
for record in query.iter(
self._managers.collections,
@@ -923,7 +948,7 @@ def queryDatasets(
# Standardize the collections expression.
if collections is None:
if not self.defaults.collections:
raise TypeError(
raise NoDefaultCollectionError(
"No collections provided to findDataset, and no defaults from registry construction."
)
collections = self.defaults.collections
@@ -1040,7 +1065,9 @@ def queryDataIds(
if datasets is not None:
if not collections:
if not self.defaults.collections:
raise TypeError(f"Cannot pass 'datasets' (='{datasets}') without 'collections'.")
raise NoDefaultCollectionError(
f"Cannot pass 'datasets' (='{datasets}') without 'collections'."
)
collections = self.defaults.collections
else:
# Preprocess collections expression in case the original
@@ -1057,7 +1084,7 @@
datasetType = self.getDatasetType(parentDatasetTypeName)
standardizedDatasetTypes.add(datasetType)
elif collections:
raise TypeError(f"Cannot pass 'collections' (='{collections}') without 'datasets'.")
raise ArgumentError(f"Cannot pass 'collections' (='{collections}') without 'datasets'.")

def query_factory(
order_by: Optional[Iterable[str]] = None, limit: Optional[Tuple[int, Optional[int]]] = None
@@ -1105,7 +1132,7 @@ def queryDimensionRecords(
try:
element = self.dimensions[element]
except KeyError as e:
raise KeyError(
raise DimensionNameError(
f"No such dimension '{element}', available dimensions: "
+ str(self.dimensions.getStaticElements())
) from e
Expand Down Expand Up @@ -1133,7 +1160,7 @@ def queryDatasetAssociations(
# Docstring inherited from lsst.daf.butler.registry.Registry
if collections is None:
if not self.defaults.collections:
raise TypeError(
raise NoDefaultCollectionError(
"No collections provided to findDataset, and no defaults from registry construction."
)
collections = self.defaults.collections