DM-38915: Fix handling of empty collection lists #835

Merged · 2 commits · May 4, 2023
2 changes: 2 additions & 0 deletions doc/changes/DM-38915.bugfix.md
@@ -0,0 +1,2 @@
+A few registry methods treated an empty collection list the same way as `None`, which meant that the registry-default run collection was used.
+This has been fixed: such queries now always return an empty result set, with explicit "doomed by" messages.
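A minimal sketch of the fixed behavior, assuming a repository at "repo" with a registered dataset type "raw" (both names are illustrative):

from lsst.daf.butler import Butler

butler = Butler("repo")

# Before this fix, an empty list could silently fall back to the registry
# default collections; now it matches nothing.
refs = butler.registry.queryDatasets("raw", collections=[])
print(list(refs))  # []
for message in refs.explain_no_results():
    print(message)  # e.g. "No datasets can be found because collection list is empty."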
2 changes: 1 addition & 1 deletion doc/lsst.daf.butler/queries.rst
@@ -44,7 +44,7 @@ Arguments that specify one or more collections are similar to those for dataset
- `str` values (the full collection name);
- `str` values using glob wildcard syntax which will be converted to `re.Pattern`;
- `re.Pattern` values (matched to the collection name, via `~re.Pattern.fullmatch`);
-- iterables of any of the above;
+- iterables of any of the above; an empty iterable cannot match anything, so these methods always return an empty result set in that case (see the sketch below);
- the special value "``...``", which matches all collections;

Collection expressions are processed by the `~registry.wildcards.CollectionWildcard` class.
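A minimal sketch of the accepted expressions, assuming a `registry` handle and hypothetical dataset-type and collection names:

import re

registry.queryDatasets("calexp", collections="HSC/runs/RC2")  # exact name
registry.queryDatasets("calexp", collections="HSC/runs/*")  # glob, converted to re.Pattern
registry.queryDatasets("calexp", collections=re.compile("HSC/runs/.*"))  # matched via fullmatch
registry.queryDatasets("calexp", collections=["HSC/raw/all", "HSC/calib"])  # iterable
registry.queryDatasets("calexp", collections=...)  # all collections
registry.queryDatasets("calexp", collections=[])  # empty: always an empty, "doomed" result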
27 changes: 16 additions & 11 deletions python/lsst/daf/butler/registries/remote.py
@@ -30,6 +30,7 @@
import httpx
from lsst.daf.butler import __version__
from lsst.resources import ResourcePath, ResourcePathExpression
+from lsst.utils.ellipsis import Ellipsis
from lsst.utils.introspection import get_full_type_name
from lsst.utils.iteration import ensure_iterable

@@ -69,6 +70,7 @@

if TYPE_CHECKING:
from .._butlerConfig import ButlerConfig
+from ..registry._registry import CollectionArgType
from ..registry.interfaces import CollectionRecord, DatastoreRegistryBridgeManager


@@ -335,7 +337,7 @@
datasetType: DatasetType | str,
dataId: DataId | None = None,
*,
-collections: Any = None,
+collections: CollectionArgType | None = None,
timespan: Timespan | None = None,
**kwargs: Any,
) -> DatasetRef | None:
@@ -524,7 +526,7 @@
self,
datasetType: Any,
*,
-collections: Any = None,
+collections: CollectionArgType | None = None,
dimensions: Iterable[Dimension | str] | None = None,
dataId: DataId | None = None,
where: str = "",
@@ -538,12 +540,13 @@
if dimensions is not None:
dimensions = [str(d) for d in ensure_iterable(dimensions)]

+collections_param: ExpressionQueryParameter | None = None
if collections is not None:
-collections = ExpressionQueryParameter.from_expression(collections)
+collections_param = ExpressionQueryParameter.from_expression(collections)

parameters = QueryDatasetsModel(
datasetType=ExpressionQueryParameter.from_expression(datasetType),
-collections=collections,
+collections=collections_param,
dimensions=dimensions,
dataId=self._simplify_dataId(dataId),
where=where,
@@ -572,7 +575,7 @@
*,
dataId: DataId | None = None,
datasets: Any = None,
-collections: Any = None,
+collections: CollectionArgType | None = None,
where: str = "",
components: bool | None = None,
bind: Mapping[str, Any] | None = None,
@@ -582,15 +585,16 @@
# Docstring inherited from lsst.daf.butler.registry.Registry
cleaned_dimensions = [str(d) for d in ensure_iterable(dimensions)]

+collections_param: ExpressionQueryParameter | None = None
if collections is not None:
-collections = ExpressionQueryParameter.from_expression(collections)
+collections_param = ExpressionQueryParameter.from_expression(collections)

if datasets is not None:
datasets = DatasetsQueryParameter.from_expression(datasets)

parameters = QueryDataIdsModel(
dimensions=cleaned_dimensions,
dataId=self._simplify_dataId(dataId),
-collections=collections,
+collections=collections_param,
datasets=datasets,
where=where,
components=components,
@@ -621,23 +625,24 @@
*,
dataId: DataId | None = None,
datasets: Any = None,
-collections: Any = None,
+collections: CollectionArgType | None = None,
where: str = "",
components: bool | None = None,
bind: Mapping[str, Any] | None = None,
check: bool = True,
**kwargs: Any,
) -> Iterator[DimensionRecord]:
# Docstring inherited from lsst.daf.butler.registry.Registry
+collections_param: ExpressionQueryParameter | None = None
if collections is not None:
-collections = ExpressionQueryParameter.from_expression(collections)
+collections_param = ExpressionQueryParameter.from_expression(collections)

if datasets is not None:
datasets = DatasetsQueryParameter.from_expression(datasets)

parameters = QueryDimensionRecordsModel(
dataId=self._simplify_dataId(dataId),
datasets=datasets,
-collections=collections,
+collections=collections_param,
where=where,
components=components,
bind=bind,
@@ -661,7 +666,7 @@
def queryDatasetAssociations(
self,
datasetType: str | DatasetType,
-collections: Any = ...,
+collections: CollectionArgType | None = Ellipsis,
*,
collectionTypes: Iterable[CollectionType] = CollectionType.all(),
flattenChains: bool = False,
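A common thread in the remote.py hunks above: the converted expression gets its own `collections_param` variable instead of rebinding the `collections` parameter, since rebinding to an `ExpressionQueryParameter` would no longer satisfy the new `CollectionArgType | None` annotation under a strict type checker. A minimal sketch of the pattern, with illustrative types standing in for the real daf_butler classes:

def query(collections: list[str] | None = None) -> dict[str, str | None]:
    # Keep the converted form in its own, differently typed name so the
    # parameter's annotation stays accurate for the type checker.
    collections_param: str | None = None
    if collections is not None:
        collections_param = ",".join(collections)
    return {"collections": collections_param}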
54 changes: 34 additions & 20 deletions python/lsst/daf/butler/registries/sql.py
@@ -45,6 +45,7 @@
import sqlalchemy
from lsst.daf.relation import LeafRelation, Relation
from lsst.resources import ResourcePathExpression
+from lsst.utils.ellipsis import Ellipsis
from lsst.utils.iteration import ensure_iterable

from ..core import (
@@ -96,6 +97,7 @@

if TYPE_CHECKING:
from .._butlerConfig import ButlerConfig
+from ..registry._registry import CollectionArgType
from ..registry.interfaces import (
CollectionRecord,
Database,
@@ -452,7 +454,7 @@ def findDataset(
datasetType: Union[DatasetType, str],
dataId: Optional[DataId] = None,
*,
-collections: Any = None,
+collections: CollectionArgType | None = None,
timespan: Optional[Timespan] = None,
**kwargs: Any,
) -> Optional[DatasetRef]:
@@ -465,6 +467,8 @@ def findDataset(
collections = self.defaults.collections
backend = queries.SqlQueryBackend(self._db, self._managers)
collection_wildcard = CollectionWildcard.from_expression(collections, require_ordered=True)
+if collection_wildcard.empty():
+    return None
matched_collections = backend.resolve_collection_wildcard(collection_wildcard)
parent_dataset_type, components = backend.resolve_single_dataset_type_wildcard(
datasetType, components_deprecated=False
@@ -1037,7 +1041,7 @@ def _standardize_query_data_id_args(
def _standardize_query_dataset_args(
self,
datasets: Any,
-collections: Any,
+collections: CollectionArgType | None,
components: bool | None,
mode: Literal["find_first"] | Literal["find_all"] | Literal["constrain"] = "constrain",
*,
@@ -1091,16 +1095,17 @@ def _standardize_query_dataset_args(
Processed collection expression.
"""
composition: dict[DatasetType, list[str | None]] = {}
+collection_wildcard: CollectionWildcard | None = None
if datasets is not None:
-if not collections:
+if collections is None:
if not self.defaults.collections:
raise NoDefaultCollectionError("No collections, and no registry default collections.")
-collections = self.defaults.collections
+collection_wildcard = CollectionWildcard.from_expression(self.defaults.collections)
else:
-collections = CollectionWildcard.from_expression(collections)
-if mode == "find_first" and collections.patterns:
+collection_wildcard = CollectionWildcard.from_expression(collections)
+if mode == "find_first" and collection_wildcard.patterns:
raise TypeError(
f"Collection pattern(s) {collections.patterns} not allowed in this context."
f"Collection pattern(s) {collection_wildcard.patterns} not allowed in this context."
)
missing: list[str] = []
composition = self._managers.datasets.resolve_wildcard(
@@ -1115,14 +1120,16 @@
)
doomed_by.extend(f"Dataset type {name} is not registered." for name in missing)
elif collections:
+# I think this check should actually be `collections is not None`,
+# but it looks like some CLI scripts use empty tuple as default.
raise ArgumentError(f"Cannot pass 'collections' (='{collections}') without 'datasets'.")
-return composition, collections
+return composition, collection_wildcard

def queryDatasets(
self,
datasetType: Any,
*,
collections: Any = None,
collections: CollectionArgType | None = None,
dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
dataId: Optional[DataId] = None,
where: str = "",
@@ -1135,13 +1142,16 @@ def queryDatasets(
# Docstring inherited from lsst.daf.butler.registry.Registry
doomed_by: list[str] = []
data_id = self._standardize_query_data_id_args(dataId, doomed_by=doomed_by, **kwargs)
-dataset_composition, collections = self._standardize_query_dataset_args(
+dataset_composition, collection_wildcard = self._standardize_query_dataset_args(
datasetType,
collections,
components,
mode="find_first" if findFirst else "find_all",
doomed_by=doomed_by,
)
+if collection_wildcard is not None and collection_wildcard.empty():
+    doomed_by.append("No datasets can be found because collection list is empty.")
+    return queries.ChainedDatasetQueryResults([], doomed_by=doomed_by)
parent_results: list[queries.ParentDatasetQueryResults] = []
for parent_dataset_type, components_for_parent in dataset_composition.items():
# The full set of dimensions in the query is the combination of
@@ -1168,7 +1178,7 @@
# only if we need to findFirst. Note that if any of the
# collections are actually wildcard expressions, and
# findFirst=True, this will raise TypeError for us.
-builder.joinDataset(parent_dataset_type, collections, isResult=True, findFirst=findFirst)
+builder.joinDataset(parent_dataset_type, collection_wildcard, isResult=True, findFirst=findFirst)
query = builder.finish()
parent_results.append(
queries.ParentDatasetQueryResults(
@@ -1193,7 +1203,7 @@ def queryDataIds(
*,
dataId: Optional[DataId] = None,
datasets: Any = None,
-collections: Any = None,
+collections: CollectionArgType | None = None,
where: str = "",
components: Optional[bool] = None,
bind: Optional[Mapping[str, Any]] = None,
@@ -1205,9 +1215,11 @@
requestedDimensions = self.dimensions.extract(dimensions)
doomed_by: list[str] = []
data_id = self._standardize_query_data_id_args(dataId, doomed_by=doomed_by, **kwargs)
-dataset_composition, collections = self._standardize_query_dataset_args(
+dataset_composition, collection_wildcard = self._standardize_query_dataset_args(
datasets, collections, components, doomed_by=doomed_by
)
+if collection_wildcard is not None and collection_wildcard.empty():
+    doomed_by.append("No data coordinates can be found because collection list is empty.")
summary = queries.QuerySummary(
requested=requestedDimensions,
column_types=self._managers.column_types,
@@ -1220,7 +1232,7 @@
)
builder = self._makeQueryBuilder(summary, doomed_by=doomed_by)
for datasetType in dataset_composition.keys():
-builder.joinDataset(datasetType, collections, isResult=False)
+builder.joinDataset(datasetType, collection_wildcard, isResult=False)
query = builder.finish()

return queries.DataCoordinateQueryResults(query)
@@ -1231,7 +1243,7 @@ def queryDimensionRecords(
*,
dataId: Optional[DataId] = None,
datasets: Any = None,
-collections: Any = None,
+collections: CollectionArgType | None = None,
where: str = "",
components: Optional[bool] = None,
bind: Optional[Mapping[str, Any]] = None,
@@ -1249,9 +1261,11 @@
) from e
doomed_by: list[str] = []
data_id = self._standardize_query_data_id_args(dataId, doomed_by=doomed_by, **kwargs)
-dataset_composition, collections = self._standardize_query_dataset_args(
+dataset_composition, collection_wildcard = self._standardize_query_dataset_args(
datasets, collections, components, doomed_by=doomed_by
)
+if collection_wildcard is not None and collection_wildcard.empty():
+    doomed_by.append("No dimension records can be found because collection list is empty.")
summary = queries.QuerySummary(
requested=element.graph,
column_types=self._managers.column_types,
@@ -1264,14 +1278,14 @@
)
builder = self._makeQueryBuilder(summary, doomed_by=doomed_by)
for datasetType in dataset_composition.keys():
-builder.joinDataset(datasetType, collections, isResult=False)
+builder.joinDataset(datasetType, collection_wildcard, isResult=False)
query = builder.finish().with_record_columns(element)
return queries.DatabaseDimensionRecordQueryResults(query, element)

def queryDatasetAssociations(
self,
datasetType: Union[str, DatasetType],
-collections: Any = ...,
+collections: CollectionArgType | None = Ellipsis,
*,
collectionTypes: Iterable[CollectionType] = CollectionType.all(),
flattenChains: bool = False,
@@ -1284,13 +1298,13 @@ def queryDatasetAssociations(
"and no defaults from registry construction."
)
collections = self.defaults.collections
-collections = CollectionWildcard.from_expression(collections)
+collection_wildcard = CollectionWildcard.from_expression(collections)
backend = queries.SqlQueryBackend(self._db, self._managers)
parent_dataset_type, _ = backend.resolve_single_dataset_type_wildcard(datasetType, components=False)
timespan_tag = DatasetColumnTag(parent_dataset_type.name, "timespan")
collection_tag = DatasetColumnTag(parent_dataset_type.name, "collection")
for parent_collection_record in backend.resolve_collection_wildcard(
-collections,
+collection_wildcard,
collection_types=frozenset(collectionTypes),
flatten_chains=flattenChains,
):
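The sql.py changes share one shape: normalize the user expression to a `CollectionWildcard` up front, and if that wildcard is empty, short-circuit with a "doomed by" message instead of falling through to the registry defaults. A self-contained sketch of that control flow (the `Wildcard` class is a stand-in for the real `CollectionWildcard`):

from dataclasses import dataclass, field

@dataclass
class Wildcard:
    """Stand-in for CollectionWildcard: explicit names plus regex patterns."""

    strings: list[str] = field(default_factory=list)
    patterns: list[str] = field(default_factory=list)

    def empty(self) -> bool:
        return not self.strings and not self.patterns

def query_datasets(wildcard: Wildcard | None, doomed_by: list[str]) -> list[str]:
    if wildcard is not None and wildcard.empty():
        # An empty collection list can never match anything; record why the
        # query is doomed instead of silently using registry defaults.
        doomed_by.append("No datasets can be found because collection list is empty.")
        return []
    return ["<matching dataset refs>"]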