Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DM-32454: Separate dataset dimensions from requested dimensions in queries. #593

Merged
merged 1 commit into the base branch on Nov 5, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/changes/DM-32454.bugfix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Make it possible to run `queryDimensionRecords` while constraining on the existence of a dataset whose dimensions are not a subset of the record element's dependencies (e.g. `raw` and `exposure`).
6 changes: 3 additions & 3 deletions python/lsst/daf/butler/registries/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -923,6 +923,7 @@ def queryDatasets(self, datasetType: Any, *,
bind=bind,
defaults=self.defaults.dataId,
check=check,
datasets=[datasetType],
)
builder = self._makeQueryBuilder(summary)
# Add the dataset subquery to the query, telling the QueryBuilder to
Expand All @@ -948,7 +949,6 @@ def queryDataIds(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimens
standardizedDataId = self.expandDataId(dataId, **kwargs)
standardizedDatasetTypes = set()
requestedDimensions = self.dimensions.extract(dimensions)
queryDimensionNames = set(requestedDimensions.names)
missing: List[str] = []
if datasets is not None:
if not collections:
Expand All @@ -961,7 +961,6 @@ def queryDataIds(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimens
# times below).
collections = CollectionQuery.fromExpression(collections)
for datasetType in self.queryDatasetTypes(datasets, components=components, missing=missing):
queryDimensionNames.update(datasetType.dimensions.names)
# If any matched dataset type is a component, just operate on
# its parent instead, because Registry doesn't know anything
# about what components exist, and here (unlike queryDatasets)
Expand All @@ -974,12 +973,13 @@ def queryDataIds(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimens
raise TypeError(f"Cannot pass 'collections' (='{collections}') without 'datasets'.")

summary = queries.QuerySummary(
requested=DimensionGraph(self.dimensions, names=queryDimensionNames),
requested=requestedDimensions,
dataId=standardizedDataId,
expression=where,
bind=bind,
defaults=self.defaults.dataId,
check=check,
datasets=standardizedDatasetTypes,
)
builder = self._makeQueryBuilder(
summary,
Expand Down
2 changes: 1 addition & 1 deletion python/lsst/daf/butler/registry/queries/_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ def joinDataset(self, datasetType: DatasetType, collections: Any, *,
the inner join that would normally be present), the full query will
return no results.
"""
assert datasetType.dimensions.issubset(self.summary.requested)
assert datasetType in self.summary.datasets
if isResult and findFirst:
collections = CollectionSearch.fromExpression(collections)
else:
Expand Down
4 changes: 3 additions & 1 deletion python/lsst/daf/butler/registry/queries/_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
)
from ..interfaces import Database
from ._query import Query
from ._structs import QuerySummary


class DataCoordinateQueryResults(DataCoordinateIterable):
Expand Down Expand Up @@ -295,7 +296,6 @@ def findDatasets(self, datasetType: Union[DatasetType, str], collections: Any, *
f"the DataCoordinateQueryResult used as input to the search, but "
f"{datasetType.name} has dimensions {datasetType.dimensions}, while the input "
f"dimensions are {self.graph}.")
builder = self._query.makeBuilder()
if datasetType.isComponent():
# We were given a true DatasetType instance, but it's a component.
parentName, componentName = datasetType.nameAndComponent()
Expand All @@ -304,6 +304,8 @@ def findDatasets(self, datasetType: Union[DatasetType, str], collections: Any, *
components = [componentName]
else:
components = [None]
summary = QuerySummary(self.graph, whereRegion=self._query.whereRegion, datasets=[datasetType])
builder = self._query.makeBuilder(summary)
builder.joinDataset(datasetType, collections=collections, findFirst=findFirst)
query = builder.finish(joinMissing=False)
return ParentDatasetQueryResults(db=self._db, query=query, components=components,
Expand Down
17 changes: 16 additions & 1 deletion python/lsst/daf/butler/registry/queries/_structs.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
__all__ = ["QuerySummary", "RegistryManagers"] # other classes here are local to subpackage

from dataclasses import dataclass
from typing import AbstractSet, Any, Iterator, List, Mapping, Optional, Type, Union
from typing import AbstractSet, Any, Iterable, Iterator, List, Mapping, Optional, Type, Union

from sqlalchemy.sql import ColumnElement

Expand Down Expand Up @@ -266,6 +266,12 @@ class QuerySummary:
query expression, keyed by the identifiers they replace.
defaults : `DataCoordinate`, optional
A data ID containing default for governor dimensions.
datasets : `Iterable` [ `DatasetType` ], optional
Dataset types whose searches may be joined into the query. Callers
must still call `QueryBuilder.joinDataset` explicitly to control how
that join happens (e.g. which collections are searched), but by
declaring them here first we can ensure that the query includes the
right dimensions for those joins.
check : `bool`
If `True` (default) check the query for consistency. This may reject
some valid queries that resemble common mistakes (e.g. queries for
Expand All @@ -277,6 +283,7 @@ def __init__(self, requested: DimensionGraph, *,
whereRegion: Optional[Region] = None,
bind: Optional[Mapping[str, Any]] = None,
defaults: Optional[DataCoordinate] = None,
datasets: Iterable[DatasetType] = (),
check: bool = True):
self.requested = requested
if expression is None:
Expand All @@ -287,6 +294,7 @@ def __init__(self, requested: DimensionGraph, *,
raise TypeError("New bind parameters passed, but expression is already a QueryWhereExpression.")
self.where = expression.attach(self.requested, dataId=dataId, region=whereRegion, defaults=defaults,
check=check)
self.datasets = NamedValueSet(datasets).freeze()

requested: DimensionGraph
"""Dimensions whose primary keys should be included in the result rows of
Expand All @@ -298,6 +306,11 @@ def __init__(self, requested: DimensionGraph, *,
query (`QueryWhereClause`).
"""

datasets: NamedValueAbstractSet[DatasetType]
"""Dataset types whose searches may be joined into the query
(`NamedValueAbstractSet` [ `DatasetType` ]).
"""

@property
def universe(self) -> DimensionUniverse:
"""All known dimensions (`DimensionUniverse`).
Expand Down Expand Up @@ -364,6 +377,8 @@ def mustHaveKeysJoined(self) -> DimensionGraph:
dataset.
"""
names = set(self.requested.names | self.where.dimensions.names)
for dataset_type in self.datasets:
names.update(dataset_type.dimensions.names)
return DimensionGraph(self.universe, names=names)

@property # type: ignore
Expand Down
17 changes: 17 additions & 0 deletions python/lsst/daf/butler/registry/tests/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -2474,3 +2474,20 @@ def testQueryResultSummaries(self):
for message in messages
)
)

def testDatasetConstrainedDimensionRecordQueries(self):
    """Check that queryDimensionRecords succeeds when the dataset
    constraint's dimensions extend beyond those of the requested
    dimension element.
    """
    registry = self.makeRegistry()
    for data_file in ("base.yaml", "datasets.yaml"):
        self.loadData(registry, data_file)
    # Constrain a physical_filter record query on the existence of
    # "flat" datasets; flat's dimensions are not a subset of
    # physical_filter's (NOTE(review): presumably they also include
    # detector — confirm against the test data), which is exactly the
    # case this query used to reject.
    results = registry.queryDimensionRecords(
        "physical_filter",
        datasets=["flat"],
        collections="imported_r",
    )
    found_names = {record.name for record in results}
    self.assertEqual(found_names, {"Cam1-R1", "Cam1-R2"})