Handle dataset components in queryDimensions and queryDatasets.
TallJimbo committed May 22, 2020
1 parent bee62ec commit 97367b7
Showing 2 changed files with 121 additions and 10 deletions.
84 changes: 74 additions & 10 deletions python/lsst/daf/butler/registry/_registry.py
@@ -26,11 +26,13 @@
"Registry",
)

+from collections import defaultdict
import contextlib
from dataclasses import dataclass
import sys
from typing import (
Any,
+Dict,
Iterable,
Iterator,
List,
@@ -1135,6 +1137,7 @@ def queryDimensions(self, dimensions: Union[Iterable[Union[Dimension, str]], Dim
collections: Any = None,
where: Optional[str] = None,
expand: bool = True,
+components: Optional[bool] = None,
**kwds) -> Iterator[DataCoordinate]:
"""Query for and iterate over data IDs matching user-provided criteria.
@@ -1173,6 +1176,13 @@ def queryDimensions(self, dimensions: Union[Iterable[Union[Dimension, str]], Dim
expand : `bool`, optional
If `True` (default) yield `ExpandedDataCoordinate` instead of
minimal `DataCoordinate` base-class instances.
+components : `bool`, optional
+If `True`, apply all dataset expression patterns to component
+dataset type names as well. If `False`, never apply patterns to
+components. If `None` (default), apply patterns to components only
+if their parent datasets were not matched by the expression.
+Fully-specified component datasets (`str` or `DatasetType`
+instances) are always included.
kwds
Additional keyword arguments are forwarded to
`DataCoordinate.standardize` when processing the ``dataId``
@@ -1187,14 +1197,21 @@ def queryDimensions(self, dimensions: Union[Iterable[Union[Dimension, str]], Dim
"""
dimensions = iterable(dimensions)
standardizedDataId = self.expandDataId(dataId, **kwds)
-standardizedDatasetTypes = []
+standardizedDatasetTypes = set()
requestedDimensionNames = set(self.dimensions.extract(dimensions).names)
if datasets is not None:
if collections is None:
raise TypeError("Cannot pass 'datasets' without 'collections'.")
-for datasetType in self.queryDatasetTypes(datasets):
+for datasetType in self.queryDatasetTypes(datasets, components=components):
requestedDimensionNames.update(datasetType.dimensions.names)
-standardizedDatasetTypes.append(datasetType)
+# If any matched dataset type is a component, just operate on
+# its parent instead, because Registry doesn't know anything
+# about what components exist, and here (unlike queryDatasets)
+# we don't care about returning them.
+parentDatasetTypeName, componentName = datasetType.nameAndComponent()
+if componentName is not None:
+datasetType = self.getDatasetType(parentDatasetTypeName)
+standardizedDatasetTypes.add(datasetType)
# Preprocess collections expression in case the original included
# single-pass iterators (we'll want to use it multiple times
# below).
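
For illustration, here is a sketch (not part of the diff itself) of how the parent-resolution logic above behaves, assuming a registry populated like the tests in this commit: a "permabias" parent dataset type with a "wcs" component, in the collection "imported_g".

    # Component dataset constraints are resolved to their parent types,
    # so these two queries yield the same detector data IDs.
    byComponent = set(registry.queryDimensions(["detector"],
                                               datasets=["permabias.wcs"],
                                               collections="imported_g",
                                               expand=False))
    byParent = set(registry.queryDimensions(["detector"],
                                            datasets=["permabias"],
                                            collections="imported_g",
                                            expand=False))
    assert byComponent == byParent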
@@ -1225,6 +1242,7 @@ def queryDatasets(self, datasetType: Any, *,
where: Optional[str] = None,
deduplicate: bool = False,
expand: bool = True,
+components: Optional[bool] = None,
**kwds) -> Iterator[DatasetRef]:
"""Query for and iterate over dataset references matching user-provided
criteria.
@@ -1266,6 +1284,13 @@ def queryDatasets(self, datasetType: Any, *,
expand : `bool`, optional
If `True` (default) attach `ExpandedDataCoordinate` instead of
minimal `DataCoordinate` base-class instances.
+components : `bool`, optional
+If `True`, apply all dataset expression patterns to component
+dataset type names as well. If `False`, never apply patterns to
+components. If `None` (default), apply patterns to components only
+if their parent datasets were not matched by the expression.
+Fully-specified component datasets (`str` or `DatasetType`
+instances) are always included.
kwds
Additional keyword arguments are forwarded to
`DataCoordinate.standardize` when processing the ``dataId``
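
As an illustrative sketch of the three ``components`` modes (not from the diff; the dataset type names come from the tests below):

    import re
    pattern = re.compile(r".+bias\.wcs")
    # components=True: patterns are also tried against component dataset
    # type names, so this matches "permabias.wcs".
    matched = registry.queryDatasetTypes(pattern, components=True)
    # components=False: patterns never match components, and no parent
    # type name contains ".wcs", so this matches nothing.
    unmatched = registry.queryDatasetTypes(pattern, components=False)
    # components=None (the default): components are tried only when the
    # parent itself did not match; "permabias" does not match the
    # pattern, so "permabias.wcs" is matched just as with True.
    default = registry.queryDatasetTypes(pattern)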
@@ -1304,15 +1329,54 @@ def queryDatasets(self, datasetType: Any, *,
collections = CollectionQuery.fromExpression(collections)
# Standardize and expand the data ID provided as a constraint.
standardizedDataId = self.expandDataId(dataId, **kwds)
-# If the datasetType passed isn't actually a DatasetType, expand it
-# (it could be an expression that yields multiple DatasetTypes) and
-# recurse.

+# We can only query directly if given a non-component DatasetType
+# instance. If we were given an expression or str or a component
+# DatasetType instance, we'll populate this dict, recurse, and return.
+# If we already have a non-component DatasetType, it will remain None
+# and we'll run the query directly.
+composition: Optional[
+Dict[
+DatasetType,  # parent dataset type
+List[Optional[str]]  # component name, or None for parent
+]
+] = None
if not isinstance(datasetType, DatasetType):
-for trueDatasetType in self.queryDatasetTypes(datasetType):
-yield from self.queryDatasets(trueDatasetType, collections=collections,
-dimensions=dimensions, dataId=standardizedDataId,
-where=where, deduplicate=deduplicate)
+# We were given a dataset type expression (which may be as simple
+# as a str). Loop over all matching dataset types, delegating handling
+# of the `components` argument to queryDatasetTypes, as we populate
+# the composition dict.
+composition = defaultdict(list)
+for trueDatasetType in self.queryDatasetTypes(datasetType, components=components):
+parentName, componentName = trueDatasetType.nameAndComponent()
+if componentName is not None:
+parentDatasetType = self.getDatasetType(parentName)
+composition[parentDatasetType].append(componentName)
+else:
+composition[trueDatasetType].append(None)
+elif datasetType.isComponent():
+# We were given a true DatasetType instance, but it's a component.
+# The composition dict will have exactly one item.
+parentName, componentName = datasetType.nameAndComponent()
+parentDatasetType = self.getDatasetType(parentName)
+composition = {parentDatasetType: [componentName]}
+if composition is not None:
+# We need to recurse. Do that once for each parent dataset type.
+for parentDatasetType, componentNames in composition.items():
+for parentRef in self.queryDatasets(parentDatasetType, collections=collections,
+dimensions=dimensions, dataId=standardizedDataId,
+where=where, deduplicate=deduplicate):
+# Loop over components, yielding one ref for each one requested
+# (the parent ref itself when the entry is None).
+for componentName in componentNames:
+if componentName is None:
+yield parentRef
+else:
+yield parentRef.makeComponentRef(componentName)
+return
+# If we get here, there's no need to recurse (or we are already
+# recursing; there can only ever be one level of recursion).

# The full set of dimensions in the query is the combination of those
# needed for the DatasetType and those explicitly requested, if any.
requestedDimensionNames = set(datasetType.dimensions.names)
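A sketch of what the composition bookkeeping above produces (not from the diff; again assuming the "permabias" dataset type from the tests):

    import re
    # With components=True and a pattern matching both the parent and one
    # of its components, the composition dict ends up with an entry like
    # {permabiasType: [None, "wcs"]}: each parent ref found by the
    # recursive query is yielded as-is for the None entry, and via
    # makeComponentRef("wcs") for the component entry.
    refs = list(registry.queryDatasets(re.compile(r"permabias(\.wcs)?"),
                                       collections="imported_g",
                                       components=True))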
47 changes: 47 additions & 0 deletions python/lsst/daf/butler/registry/tests/_registry.py
@@ -32,6 +32,7 @@

from ...core import (
DataCoordinate,
+DatasetRef,
DatasetType,
DimensionGraph,
NamedValueSet,
@@ -457,6 +458,52 @@ def testDatasetTypeComponentQueries(self):
NamedValueSet(registry.queryDatasetTypes(re.compile(r".+bias\.wcs"), components=True)).names
)

+def testComponentLookups(self):
+"""Test searching for component datasets via their parents.
+"""
+registry = self.makeRegistry()
+self.loadData(registry, "base.yaml")
+self.loadData(registry, "datasets.yaml")
+# Test getting the child dataset type (which does still exist in the
+# Registry), and check for consistency with
+# DatasetRef.makeComponentRef.
+collection = "imported_g"
+parentType = registry.getDatasetType("permabias")
+childType = registry.getDatasetType("permabias.wcs")
+parentRefResolved = registry.findDataset(parentType, collections=collection,
+instrument="Cam1", detector=1)
+self.assertIsInstance(parentRefResolved, DatasetRef)
+self.assertEqual(childType, parentRefResolved.makeComponentRef("wcs").datasetType)
+# Search for a single dataset with findDataset.
+childRef1 = registry.findDataset("permabias.wcs", collections=collection,
+dataId=parentRefResolved.dataId)
+self.assertEqual(childRef1, parentRefResolved.makeComponentRef("wcs"))
+# Search for detector data IDs constrained by component dataset
+# existence with queryDimensions.
+dataIds = set(registry.queryDimensions(
+["detector"],
+datasets=["permabias.wcs"],
+collections=collection,
+expand=False,
+))
+self.assertEqual(
+dataIds,
+{
+DataCoordinate.standardize(instrument="Cam1", detector=d, graph=parentType.dimensions)
+for d in (1, 2, 3)
+}
+)
+# Search for multiple datasets of a single type with queryDatasets.
+childRefs2 = set(registry.queryDatasets(
+"permabias.wcs",
+collections=collection,
+expand=False,
+))
+self.assertEqual(
+{ref.unresolved() for ref in childRefs2},
+{DatasetRef(childType, dataId) for dataId in dataIds}
+)

def testCollections(self):
"""Tests for registry methods that manage collections.
"""
