Skip to content

Commit

Permalink
Separate dataset dimensions from requested dimensions in queries.
Browse files Browse the repository at this point in the history
The dimensions passed directly to queryDataIds were always intended to
be the dimensions the caller wanted back, which may not be the same as
the dimensions used to constrain the query.  We'd partially handled
that distinction in the past, but we had been rolling dataset
constraint dimensions into the requested set.  This was most obvious
in queryDimensionRecords, which didn't quite handle this properly, and
hence couldn't handle constraints on datasets that have dimensions
beyond the dimension records' own.  Fixing queryDataIds to track
dataset-constraint dimensions separately solves that problem, too.
  • Loading branch information
TallJimbo committed Nov 4, 2021
1 parent 1393936 commit 1f0a257
Show file tree
Hide file tree
Showing 6 changed files with 41 additions and 6 deletions.
1 change: 1 addition & 0 deletions doc/changes/DM-32454.bugfix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Make it possible to run `queryDimensionRecords` while constraining on the existence of a dataset whose dimensions are not a subset of the record element's dependencies (e.g. `raw` and `exposure`).
6 changes: 3 additions & 3 deletions python/lsst/daf/butler/registries/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -911,6 +911,7 @@ def queryDatasets(self, datasetType: Any, *,
bind=bind,
defaults=self.defaults.dataId,
check=check,
datasets=[datasetType],
)
builder = self._makeQueryBuilder(summary)
# Add the dataset subquery to the query, telling the QueryBuilder to
Expand All @@ -936,7 +937,6 @@ def queryDataIds(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimens
standardizedDataId = self.expandDataId(dataId, **kwargs)
standardizedDatasetTypes = set()
requestedDimensions = self.dimensions.extract(dimensions)
queryDimensionNames = set(requestedDimensions.names)
if datasets is not None:
if not collections:
if not self.defaults.collections:
Expand All @@ -948,7 +948,6 @@ def queryDataIds(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimens
# times below).
collections = CollectionQuery.fromExpression(collections)
for datasetType in self.queryDatasetTypes(datasets, components=components):
queryDimensionNames.update(datasetType.dimensions.names)
# If any matched dataset type is a component, just operate on
# its parent instead, because Registry doesn't know anything
# about what components exist, and here (unlike queryDatasets)
Expand All @@ -961,12 +960,13 @@ def queryDataIds(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimens
raise TypeError(f"Cannot pass 'collections' (='{collections}') without 'datasets'.")

summary = queries.QuerySummary(
requested=DimensionGraph(self.dimensions, names=queryDimensionNames),
requested=requestedDimensions,
dataId=standardizedDataId,
expression=where,
bind=bind,
defaults=self.defaults.dataId,
check=check,
datasets=standardizedDatasetTypes,
)
builder = self._makeQueryBuilder(summary)
for datasetType in standardizedDatasetTypes:
Expand Down
2 changes: 1 addition & 1 deletion python/lsst/daf/butler/registry/queries/_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ def joinDataset(self, datasetType: DatasetType, collections: Any, *,
the inner join that would normally be present), the full query will
return no results.
"""
assert datasetType.dimensions.issubset(self.summary.requested)
assert datasetType in self.summary.datasets
if isResult and findFirst:
collections = CollectionSearch.fromExpression(collections)
else:
Expand Down
4 changes: 3 additions & 1 deletion python/lsst/daf/butler/registry/queries/_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
)
from ..interfaces import Database
from ._query import Query
from ._structs import QuerySummary


class DataCoordinateQueryResults(DataCoordinateIterable):
Expand Down Expand Up @@ -295,7 +296,8 @@ def findDatasets(self, datasetType: Union[DatasetType, str], collections: Any, *
f"the DataCoordinateQueryResult used as input to the search, but "
f"{datasetType.name} has dimensions {datasetType.dimensions}, while the input "
f"dimensions are {self.graph}.")
builder = self._query.makeBuilder()
summary = QuerySummary(self.graph, whereRegion=self._query.whereRegion, datasets=[datasetType])
builder = self._query.makeBuilder(summary)
if datasetType.isComponent():
# We were given a true DatasetType instance, but it's a component.
parentName, componentName = datasetType.nameAndComponent()
Expand Down
17 changes: 16 additions & 1 deletion python/lsst/daf/butler/registry/queries/_structs.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
__all__ = ["QuerySummary", "RegistryManagers"] # other classes here are local to subpackage

from dataclasses import dataclass
from typing import AbstractSet, Any, Iterator, List, Mapping, Optional, Type, Union
from typing import AbstractSet, Any, Iterable, Iterator, List, Mapping, Optional, Type, Union

from sqlalchemy.sql import ColumnElement

Expand Down Expand Up @@ -266,6 +266,12 @@ class QuerySummary:
query expression, keyed by the identifiers they replace.
defaults : `DataCoordinate`, optional
A data ID containing default for governor dimensions.
datasets : `Iterable` [ `DatasetType` ], optional
Dataset types whose searches may be joined into the query. Callers
must still call `QueryBuilder.joinDataset` explicitly to control how
that join happens (e.g. which collections are searched), but by
declaring them here first we can ensure that the query includes the
right dimensions for those joins.
check : `bool`
If `True` (default) check the query for consistency. This may reject
some valid queries that resemble common mistakes (e.g. queries for
Expand All @@ -277,6 +283,7 @@ def __init__(self, requested: DimensionGraph, *,
whereRegion: Optional[Region] = None,
bind: Optional[Mapping[str, Any]] = None,
defaults: Optional[DataCoordinate] = None,
datasets: Iterable[DatasetType] = (),
check: bool = True):
self.requested = requested
if expression is None:
Expand All @@ -287,6 +294,7 @@ def __init__(self, requested: DimensionGraph, *,
raise TypeError("New bind parameters passed, but expression is already a QueryWhereExpression.")
self.where = expression.attach(self.requested, dataId=dataId, region=whereRegion, defaults=defaults,
check=check)
self.datasets = NamedValueSet(datasets).freeze()

requested: DimensionGraph
"""Dimensions whose primary keys should be included in the result rows of
Expand All @@ -298,6 +306,11 @@ def __init__(self, requested: DimensionGraph, *,
query (`QueryWhereClause`).
"""

datasets: NamedValueAbstractSet[DatasetType]
"""Dataset types whose searches may be joined into the query
(`NamedValueAbstractSet` [ `DatasetType` ]).
"""

@property
def universe(self) -> DimensionUniverse:
"""All known dimensions (`DimensionUniverse`).
Expand Down Expand Up @@ -364,6 +377,8 @@ def mustHaveKeysJoined(self) -> DimensionGraph:
dataset.
"""
names = set(self.requested.names | self.where.dimensions.names)
for dataset_type in self.datasets:
names.update(dataset_type.dimensions.names)
return DimensionGraph(self.universe, names=names)

@property # type: ignore
Expand Down
17 changes: 17 additions & 0 deletions python/lsst/daf/butler/registry/tests/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -2467,3 +2467,20 @@ def testQueryResultSummaries(self):
for message in messages
)
)

def testDatasetConstrainedDimensionRecordQueries(self):
    """Check that queryDimensionRecords succeeds when the dataset used
    as a constraint has dimensions that extend beyond those of the
    requested dimension element.
    """
    registry = self.makeRegistry()
    for data_file in ("base.yaml", "datasets.yaml"):
        self.loadData(registry, data_file)
    # Request physical_filter records while constraining on the
    # existence of 'flat' datasets; 'flat' carries dimensions beyond
    # physical_filter's own (presumably detector — confirm against
    # datasets.yaml), which is exactly the case this commit fixes.
    results = registry.queryDimensionRecords(
        "physical_filter",
        datasets=["flat"],
        collections="imported_r",
    )
    found_names = {rec.name for rec in results}
    self.assertEqual(found_names, {"Cam1-R1", "Cam1-R2"})

0 comments on commit 1f0a257

Please sign in to comment.