Handle dataset components in queryDimensions and queryDatasets.
TallJimbo committed May 22, 2020
1 parent bee62ec commit 97367b7
Showing 2 changed files with 121 additions and 10 deletions.
84 changes: 74 additions & 10 deletions python/lsst/daf/butler/registry/_registry.py
@@ -26,11 +26,13 @@
"Registry",
)

+from collections import defaultdict
import contextlib
from dataclasses import dataclass
import sys
from typing import (
Any,
+Dict,
Iterable,
Iterator,
List,
@@ -1135,6 +1137,7 @@ def queryDimensions(self, dimensions: Union[Iterable[Union[Dimension, str]], Dim
collections: Any = None,
where: Optional[str] = None,
expand: bool = True,
+components: Optional[bool] = None,
**kwds) -> Iterator[DataCoordinate]:
"""Query for and iterate over data IDs matching user-provided criteria.
@@ -1173,6 +1176,13 @@ def queryDimensions(self, dimensions: Union[Iterable[Union[Dimension, str]], Dim
expand : `bool`, optional
If `True` (default) yield `ExpandedDataCoordinate` instead of
minimal `DataCoordinate` base-class instances.
+components : `bool`, optional
+If `True`, apply all dataset expression patterns to component
+dataset type names as well. If `False`, never apply patterns to
+components. If `None` (default), apply patterns to components only
+if their parent datasets were not matched by the expression.
+Fully-specified component datasets (`str` or `DatasetType`
+instances) are always included.
kwds
Additional keyword arguments are forwarded to
`DataCoordinate.standardize` when processing the ``dataId``
@@ -1187,14 +1197,21 @@ def queryDimensions(self, dimensions: Union[Iterable[Union[Dimension, str]], Dim
"""
dimensions = iterable(dimensions)
standardizedDataId = self.expandDataId(dataId, **kwds)
-standardizedDatasetTypes = []
+standardizedDatasetTypes = set()
requestedDimensionNames = set(self.dimensions.extract(dimensions).names)
if datasets is not None:
if collections is None:
raise TypeError("Cannot pass 'datasets' without 'collections'.")
-for datasetType in self.queryDatasetTypes(datasets):
+for datasetType in self.queryDatasetTypes(datasets, components=components):
requestedDimensionNames.update(datasetType.dimensions.names)
-standardizedDatasetTypes.append(datasetType)
+# If any matched dataset type is a component, just operate on
+# its parent instead, because Registry doesn't know anything
+# about what components exist, and here (unlike queryDatasets)
+# we don't care about returning them.
+parentDatasetTypeName, componentName = datasetType.nameAndComponent()
+if componentName is not None:
+datasetType = self.getDatasetType(parentDatasetTypeName)
+standardizedDatasetTypes.add(datasetType)
# Preprocess collections expression in case the original included
# single-pass iterators (we'll want to use it multiple times
# below).
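
For illustration, here is a sketch (not part of the diff itself) of how the parent-resolution logic above behaves, assuming a registry populated like the tests in this commit: a "permabias" parent dataset type with a "wcs" component, in the collection "imported_g".

    # Component dataset constraints are resolved to their parent types,
    # so these two queries yield the same detector data IDs.
    byComponent = set(registry.queryDimensions(["detector"],
                                               datasets=["permabias.wcs"],
                                               collections="imported_g",
                                               expand=False))
    byParent = set(registry.queryDimensions(["detector"],
                                            datasets=["permabias"],
                                            collections="imported_g",
                                            expand=False))
    assert byComponent == byParent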
@@ -1225,6 +1242,7 @@ def queryDatasets(self, datasetType: Any, *,
where: Optional[str] = None,
deduplicate: bool = False,
expand: bool = True,
+components: Optional[bool] = None,
**kwds) -> Iterator[DatasetRef]:
"""Query for and iterate over dataset references matching user-provided
criteria.
@@ -1266,6 +1284,13 @@ def queryDatasets(self, datasetType: Any, *,
expand : `bool`, optional
If `True` (default) attach `ExpandedDataCoordinate` instead of
minimal `DataCoordinate` base-class instances.
+components : `bool`, optional
+If `True`, apply all dataset expression patterns to component
+dataset type names as well. If `False`, never apply patterns to
+components. If `None` (default), apply patterns to components only
+if their parent datasets were not matched by the expression.
+Fully-specified component datasets (`str` or `DatasetType`
+instances) are always included.
kwds
Additional keyword arguments are forwarded to
`DataCoordinate.standardize` when processing the ``dataId``
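
As an illustrative sketch of the three ``components`` modes (not from the diff; the dataset type names come from the tests below):

    import re
    pattern = re.compile(r".+bias\.wcs")
    # components=True: patterns are also tried against component dataset
    # type names, so this matches "permabias.wcs".
    matched = registry.queryDatasetTypes(pattern, components=True)
    # components=False: patterns never match components, and no parent
    # type name contains ".wcs", so this matches nothing.
    unmatched = registry.queryDatasetTypes(pattern, components=False)
    # components=None (the default): components are tried only when the
    # parent itself did not match; "permabias" does not match the
    # pattern, so "permabias.wcs" is matched just as with True.
    default = registry.queryDatasetTypes(pattern)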
@@ -1304,15 +1329,54 @@ def queryDatasets(self, datasetType: Any, *,
collections = CollectionQuery.fromExpression(collections)
# Standardize and expand the data ID provided as a constraint.
standardizedDataId = self.expandDataId(dataId, **kwds)
-# If the datasetType passed isn't actually a DatasetType, expand it
-# (it could be an expression that yields multiple DatasetTypes) and
-# recurse.

+# We can only query directly if given a non-component DatasetType
+# instance. If we were given an expression or str or a component
+# DatasetType instance, we'll populate this dict, recurse, and return.
+# If we already have a non-component DatasetType, it will remain None
+# and we'll run the query directly.
+composition: Optional[
+Dict[
+DatasetType,  # parent dataset type
+List[Optional[str]]  # component name, or None for parent
+]
+] = None
if not isinstance(datasetType, DatasetType):
-for trueDatasetType in self.queryDatasetTypes(datasetType):
-yield from self.queryDatasets(trueDatasetType, collections=collections,
-dimensions=dimensions, dataId=standardizedDataId,
-where=where, deduplicate=deduplicate)
+# We were given a dataset type expression (which may be as simple
+# as a str). Loop over all matching dataset types, delegating handling
+# of the `components` argument to queryDatasetTypes, as we populate
+# the composition dict.
+composition = defaultdict(list)
+for trueDatasetType in self.queryDatasetTypes(datasetType, components=components):
+parentName, componentName = trueDatasetType.nameAndComponent()
+if componentName is not None:
+parentDatasetType = self.getDatasetType(parentName)
+composition[parentDatasetType].append(componentName)
+else:
+composition[trueDatasetType].append(None)
+elif datasetType.isComponent():
+# We were given a true DatasetType instance, but it's a component.
+# The composition dict will have exactly one item.
+parentName, componentName = datasetType.nameAndComponent()
+parentDatasetType = self.getDatasetType(parentName)
+composition = {parentDatasetType: [componentName]}
+if composition is not None:
+# We need to recurse. Do that once for each parent dataset type.
+for parentDatasetType, componentNames in composition.items():
+for parentRef in self.queryDatasets(parentDatasetType, collections=collections,
+dimensions=dimensions, dataId=standardizedDataId,
+where=where, deduplicate=deduplicate):
+# Loop over components, yielding one ref for each one requested
+# (the parent ref itself when the entry is None).
+for componentName in componentNames:
+if componentName is None:
+yield parentRef
+else:
+yield parentRef.makeComponentRef(componentName)
+return
+# If we get here, there's no need to recurse (or we are already
+# recursing; there can only ever be one level of recursion).

# The full set of dimensions in the query is the combination of those
# needed for the DatasetType and those explicitly requested, if any.
requestedDimensionNames = set(datasetType.dimensions.names)
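A sketch of what the composition bookkeeping above produces (not from the diff; again assuming the "permabias" dataset type from the tests):

    import re
    # With components=True and a pattern matching both the parent and one
    # of its components, the composition dict ends up with an entry like
    # {permabiasType: [None, "wcs"]}: each parent ref found by the
    # recursive query is yielded as-is for the None entry, and via
    # makeComponentRef("wcs") for the component entry.
    refs = list(registry.queryDatasets(re.compile(r"permabias(\.wcs)?"),
                                       collections="imported_g",
                                       components=True))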
47 changes: 47 additions & 0 deletions python/lsst/daf/butler/registry/tests/_registry.py
@@ -32,6 +32,7 @@

from ...core import (
DataCoordinate,
+DatasetRef,
DatasetType,
DimensionGraph,
NamedValueSet,
@@ -457,6 +458,52 @@ def testDatasetTypeComponentQueries(self):
NamedValueSet(registry.queryDatasetTypes(re.compile(r".+bias\.wcs"), components=True)).names
)

+def testComponentLookups(self):
+"""Test searching for component datasets via their parents.
+"""
+registry = self.makeRegistry()
+self.loadData(registry, "base.yaml")
+self.loadData(registry, "datasets.yaml")
+# Test getting the child dataset type (which does still exist in the
+# Registry), and check for consistency with
+# DatasetRef.makeComponentRef.
+collection = "imported_g"
+parentType = registry.getDatasetType("permabias")
+childType = registry.getDatasetType("permabias.wcs")
+parentRefResolved = registry.findDataset(parentType, collections=collection,
+instrument="Cam1", detector=1)
+self.assertIsInstance(parentRefResolved, DatasetRef)
+self.assertEqual(childType, parentRefResolved.makeComponentRef("wcs").datasetType)
+# Search for a single dataset with findDataset.
+childRef1 = registry.findDataset("permabias.wcs", collections=collection,
+dataId=parentRefResolved.dataId)
+self.assertEqual(childRef1, parentRefResolved.makeComponentRef("wcs"))
+# Search for detector data IDs constrained by component dataset
+# existence with queryDimensions.
+dataIds = set(registry.queryDimensions(
+["detector"],
+datasets=["permabias.wcs"],
+collections=collection,
+expand=False,
+))
+self.assertEqual(
+dataIds,
+{
+DataCoordinate.standardize(instrument="Cam1", detector=d, graph=parentType.dimensions)
+for d in (1, 2, 3)
+}
+)
+# Search for multiple datasets of a single type with queryDatasets.
+childRefs2 = set(registry.queryDatasets(
+"permabias.wcs",
+collections=collection,
+expand=False,
+))
+self.assertEqual(
+{ref.unresolved() for ref in childRefs2},
+{DatasetRef(childType, dataId) for dataId in dataIds}
+)

def testCollections(self):
"""Tests for registry methods that manage collections.
"""
