Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

DM-26590: Check query expressions for completeness and consistency #433

Merged
merged 5 commits on
Nov 17, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
25 changes: 23 additions & 2 deletions python/lsst/daf/butler/registry/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -1338,6 +1338,7 @@ def queryDatasets(self, datasetType: Any, *,
where: Optional[str] = None,
findFirst: bool = False,
components: Optional[bool] = None,
check: bool = True,
**kwargs: Any) -> queries.DatasetQueryResults:
"""Query for and iterate over dataset references matching user-provided
criteria.
Expand Down Expand Up @@ -1385,6 +1386,11 @@ def queryDatasets(self, datasetType: Any, *,
if their parent datasets were not matched by the expression.
Fully-specified component datasets (`str` or `DatasetType`
instances) are always included.
check : `bool`, optional
If `True` (default) check the query for consistency before
executing it. This may reject some valid queries that resemble
common mistakes (e.g. queries for visits without specifying an
instrument).
**kwargs
Additional keyword arguments are forwarded to
`DataCoordinate.standardize` when processing the ``dataId``
Expand Down Expand Up @@ -1462,7 +1468,8 @@ def queryDatasets(self, datasetType: Any, *,
dimensions=dimensions,
dataId=standardizedDataId,
where=where,
findFirst=findFirst
findFirst=findFirst,
check=check,
)
if isinstance(parentResults, queries.ParentDatasetQueryResults):
chain.append(
Expand All @@ -1486,6 +1493,7 @@ def queryDatasets(self, datasetType: Any, *,
requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
dataId=standardizedDataId,
expression=where,
check=check,
)
builder = self.makeQueryBuilder(summary)
# Add the dataset subquery to the query, telling the QueryBuilder to
Expand All @@ -1504,6 +1512,7 @@ def queryDataIds(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimens
collections: Any = None,
where: Optional[str] = None,
components: Optional[bool] = None,
check: bool = True,
**kwargs: Any) -> queries.DataCoordinateQueryResults:
"""Query for data IDs matching user-provided criteria.

Expand Down Expand Up @@ -1546,6 +1555,11 @@ def queryDataIds(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimens
if their parent datasets were not matched by the expression.
Fully-specified component datasets (`str` or `DatasetType`
instances) are always included.
check : `bool`, optional
If `True` (default) check the query for consistency before
executing it. This may reject some valid queries that resemble
common mistakes (e.g. queries for visits without specifying an
instrument).
**kwargs
Additional keyword arguments are forwarded to
`DataCoordinate.standardize` when processing the ``dataId``
Expand Down Expand Up @@ -1592,6 +1606,7 @@ def queryDataIds(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimens
requested=DimensionGraph(self.dimensions, names=queryDimensionNames),
dataId=standardizedDataId,
expression=where,
check=check,
)
builder = self.makeQueryBuilder(summary)
for datasetType in standardizedDatasetTypes:
Expand All @@ -1605,6 +1620,7 @@ def queryDimensionRecords(self, element: Union[DimensionElement, str], *,
collections: Any = None,
where: Optional[str] = None,
components: Optional[bool] = None,
check: bool = True,
**kwargs: Any) -> Iterator[DimensionRecord]:
"""Query for dimension information matching user-provided criteria.

Expand All @@ -1630,6 +1646,11 @@ def queryDimensionRecords(self, element: Union[DimensionElement, str], *,
components : `bool`, optional
Whether to apply dataset expressions to components as well.
See `queryDataIds` for more information.
check : `bool`, optional
If `True` (default) check the query for consistency before
executing it. This may reject some valid queries that resemble
common mistakes (e.g. queries for visits without specifying an
instrument).
**kwargs
Additional keyword arguments are forwarded to
`DataCoordinate.standardize` when processing the ``dataId``
Expand All @@ -1644,7 +1665,7 @@ def queryDimensionRecords(self, element: Union[DimensionElement, str], *,
if not isinstance(element, DimensionElement):
element = self.dimensions[element]
dataIds = self.queryDataIds(element.graph, dataId=dataId, datasets=datasets, collections=collections,
where=where, components=components, **kwargs)
where=where, components=components, check=check, **kwargs)
return iter(self._dimensions[element].fetch(dataIds))

def queryDatasetAssociations(
Expand Down
20 changes: 10 additions & 10 deletions python/lsst/daf/butler/registry/queries/_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,12 +386,12 @@ def _addWhereClause(self) -> None:
For internal use by `QueryBuilder` only; will be called (and should
only by called) by `finish`.
"""
if self.summary.expression.tree is not None:
if self.summary.where.tree is not None:
visitor = ClauseVisitor(self.summary.universe, self._columns, self._elements)
self._simpleQuery.where.append(self.summary.expression.tree.visit(visitor))
self._simpleQuery.where.append(self.summary.where.tree.visit(visitor))
for dimension, columnsInQuery in self._columns.keys.items():
if dimension in self.summary.dataId.graph:
givenKey = self.summary.dataId[dimension]
if dimension in self.summary.where.dataId.graph:
givenKey = self.summary.where.dataId[dimension]
# Add a WHERE term for each column that corresponds to each
# key. This is redundant with the JOIN ON clauses that make
# them equal to each other, but more constraints have a chance
Expand All @@ -401,22 +401,22 @@ def _addWhereClause(self) -> None:
else:
# Dimension is not fully identified, but it might be a skypix
# dimension that's constrained by a given region.
if self.summary.whereRegion is not None and isinstance(dimension, SkyPixDimension):
if self.summary.where.region is not None and isinstance(dimension, SkyPixDimension):
# We know the region now.
givenSkyPixIds: List[int] = []
for begin, end in dimension.pixelization.envelope(self.summary.whereRegion):
for begin, end in dimension.pixelization.envelope(self.summary.where.region):
givenSkyPixIds.extend(range(begin, end))
for columnInQuery in columnsInQuery:
self._simpleQuery.where.append(columnInQuery.in_(givenSkyPixIds))
# If we are given an dataId with a timespan, and there are one or more
# timespans in the query that aren't given, add a WHERE expression for
# each of them.
if self.summary.dataId.graph.temporal and self.summary.temporal:
if self.summary.where.dataId.graph.temporal and self.summary.temporal:
# Timespan is known now.
givenInterval = self.summary.dataId.timespan
givenInterval = self.summary.where.dataId.timespan
assert givenInterval is not None
for element, intervalInQuery in self._columns.timespans.items():
assert element not in self.summary.dataId.graph.elements
assert element not in self.summary.where.dataId.graph.elements
self._simpleQuery.where.append(intervalInQuery.overlaps(givenInterval))

def finish(self, joinMissing: bool = True) -> Query:
Expand Down Expand Up @@ -445,7 +445,7 @@ def finish(self, joinMissing: bool = True) -> Query:
return EmptyQuery(self.summary.requested.universe, managers=self._managers)
return DirectQuery(graph=self.summary.requested,
uniqueness=DirectQueryUniqueness.NOT_UNIQUE,
whereRegion=self.summary.dataId.region,
whereRegion=self.summary.where.dataId.region,
simpleQuery=self._simpleQuery,
columns=self._columns,
managers=self._managers)