Skip to content

Commit

Permalink
Merge pull request #914 from lsst/tickets/DM-41761
Browse files Browse the repository at this point in the history
DM-41761: Add Butler.query property
  • Loading branch information
andy-slac committed Dec 7, 2023
2 parents 3243903 + 0e06fc3 commit d41daf1
Show file tree
Hide file tree
Showing 20 changed files with 3,524 additions and 212 deletions.
3 changes: 3 additions & 0 deletions doc/changes/DM-41761.api.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Added the `Butler._query` context manager, which will support building complex queries for data in Butler.
For now `Butler._query` provides access to just three convenience methods similar to query methods in `Registry`.
This new API should be considered experimental and potentially unstable; its use should be limited to downstream middleware code for now.
2 changes: 2 additions & 0 deletions python/lsst/daf/butler/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@
from ._named import *
from ._quantum import *
from ._quantum_backed import *
from ._query import *
from ._query_results import *
from ._storage_class import *
from ._storage_class_delegate import *
from ._timespan import *
Expand Down
304 changes: 291 additions & 13 deletions python/lsst/daf/butler/_butler.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@
__all__ = ["Butler"]

from abc import abstractmethod
from collections.abc import Collection, Iterable, Sequence
from collections.abc import Collection, Iterable, Mapping, Sequence
from contextlib import AbstractContextManager
from typing import Any, TextIO
from typing import TYPE_CHECKING, Any, TextIO

from lsst.resources import ResourcePath, ResourcePathExpression
from lsst.utils import doImportType
Expand All @@ -41,19 +41,25 @@
from ._butler_config import ButlerConfig
from ._butler_repo_index import ButlerRepoIndex
from ._config import Config, ConfigSubset
from ._dataset_existence import DatasetExistence
from ._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
from ._dataset_type import DatasetType
from ._deferredDatasetHandle import DeferredDatasetHandle
from ._file_dataset import FileDataset
from ._limited_butler import LimitedButler
from ._storage_class import StorageClass
from ._timespan import Timespan
from .datastore import DatasetRefURIs, Datastore
from .dimensions import DataId, DimensionConfig
from .registry import Registry, RegistryConfig, _RegistryFactory
from .datastore import Datastore
from .dimensions import DimensionConfig
from .registry import RegistryConfig, _RegistryFactory
from .repo_relocation import BUTLER_ROOT_TAG
from .transfers import RepoExportContext

if TYPE_CHECKING:
from ._dataset_existence import DatasetExistence
from ._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
from ._dataset_type import DatasetType
from ._deferredDatasetHandle import DeferredDatasetHandle
from ._file_dataset import FileDataset
from ._query import Query
from ._storage_class import StorageClass
from ._timespan import Timespan
from .datastore import DatasetRefURIs
from .dimensions import DataCoordinate, DataId, DimensionGroup, DimensionRecord
from .registry import CollectionArgType, Registry
from .transfers import RepoExportContext

_LOG = getLogger(__name__)

Expand Down Expand Up @@ -1357,3 +1363,275 @@ def registry(self) -> Registry:
will be replaced by equivalent `Butler` methods.
"""
raise NotImplementedError()

@abstractmethod
def _query(self) -> AbstractContextManager[Query]:
    """Return a context manager wrapping a `Query` object, which is used
    to build and execute complex queries against data in this Butler.
    """
    raise NotImplementedError()

@abstractmethod
def _query_data_ids(
    self,
    dimensions: DimensionGroup | Iterable[str] | str,
    *,
    data_id: DataId | None = None,
    where: str = "",
    bind: Mapping[str, Any] | None = None,
    expanded: bool = False,
    order_by: Iterable[str] | str | None = None,
    limit: int | None = None,
    offset: int | None = None,
    explain: bool = True,
    **kwargs: Any,
) -> list[DataCoordinate]:
    """Query for data IDs matching user-provided criteria.

    Parameters
    ----------
    dimensions : `DimensionGroup`, `str`, or \
            `~collections.abc.Iterable` [`str`]
        The dimensions of the data IDs to yield, as either `DimensionGroup`
        instances or `str`. Will be automatically expanded to a complete
        `DimensionGroup`.
    data_id : `dict` or `DataCoordinate`, optional
        A data ID whose key-value pairs are used as equality constraints
        in the query.
    where : `str`, optional
        A string expression similar to a SQL WHERE clause. May involve
        any column of a dimension table or (as a shortcut for the primary
        key column of a dimension table) dimension name. See
        :ref:`daf_butler_dimension_expressions` for more information.
    bind : `~collections.abc.Mapping`, optional
        Mapping containing literal values that should be injected into the
        ``where`` expression, keyed by the identifiers they replace.
        Values of collection type can be expanded in some cases; see
        :ref:`daf_butler_dimension_expressions_identifiers` for more
        information.
    expanded : `bool`, optional
        If `True` (default is `False`) then returned data IDs will have
        dimension records.
    order_by : `~collections.abc.Iterable` [`str`] or `str`, optional
        Names of the columns/dimensions to use for ordering returned data
        IDs. Column name can be prefixed with minus (``-``) to use
        descending ordering.
    limit : `int`, optional
        Upper limit on the number of returned records.
    offset : `int`, optional
        The number of records to skip before returning at most ``limit``
        records. If ``offset`` is specified then ``limit`` must be
        specified as well.
    explain : `bool`, optional
        If `True` (default) then `EmptyQueryResultError` exception is
        raised when resulting list is empty. The exception contains
        non-empty list of strings explaining possible causes for empty
        result.
    **kwargs
        Additional keyword arguments are forwarded to
        `DataCoordinate.standardize` when processing the ``data_id``
        argument (and may be used to provide a constraining data ID even
        when the ``data_id`` argument is `None`).

    Returns
    -------
    dataIds : `list` [`DataCoordinate`]
        Data IDs matching the given query parameters. These are always
        guaranteed to identify all dimensions (`DataCoordinate.hasFull`
        returns `True`).

    Raises
    ------
    lsst.daf.butler.registry.DataIdError
        Raised when ``data_id`` or keyword arguments specify unknown
        dimensions or values, or when they contain inconsistent values.
    lsst.daf.butler.registry.UserExpressionError
        Raised when ``where`` expression is invalid.
    lsst.daf.butler.EmptyQueryResultError
        Raised when query generates empty result and ``explain`` is set to
        `True`.
    TypeError
        Raised when the arguments are incompatible, e.g. ``offset`` is
        specified, but ``limit`` is not.
    """
    raise NotImplementedError()

@abstractmethod
def _query_datasets(
    self,
    dataset_type: Any,
    collections: CollectionArgType | None = None,
    *,
    find_first: bool = True,
    data_id: DataId | None = None,
    where: str = "",
    bind: Mapping[str, Any] | None = None,
    expanded: bool = False,
    explain: bool = True,
    **kwargs: Any,
) -> list[DatasetRef]:
    """Query for dataset references matching user-provided criteria.

    Parameters
    ----------
    dataset_type : dataset type expression
        An expression that fully or partially identifies the dataset types
        to be queried. Allowed types include `DatasetType`, `str`,
        `re.Pattern`, and iterables thereof. The special value ``...`` can
        be used to query all dataset types. See
        :ref:`daf_butler_dataset_type_expressions` for more information.
    collections : collection expression, optional
        An expression that identifies the collections to search, such as a
        `str` (for full matches or partial matches via globs), `re.Pattern`
        (for partial matches), or iterable thereof. ``...`` can be used to
        search all collections (actually just all `~CollectionType.RUN`
        collections, because this will still find all datasets).
        If not provided, the default collections are used. See
        :ref:`daf_butler_collection_expressions` for more information.
    find_first : `bool`, optional
        If `True` (default), for each result data ID, only yield one
        `DatasetRef` of each `DatasetType`, from the first collection in
        which a dataset of that dataset type appears (according to the
        order of ``collections`` passed in). If `True`, ``collections``
        must not contain regular expressions and may not be ``...``.
    data_id : `dict` or `DataCoordinate`, optional
        A data ID whose key-value pairs are used as equality constraints
        in the query.
    where : `str`, optional
        A string expression similar to a SQL WHERE clause. May involve
        any column of a dimension table or (as a shortcut for the primary
        key column of a dimension table) dimension name. See
        :ref:`daf_butler_dimension_expressions` for more information.
    bind : `~collections.abc.Mapping`, optional
        Mapping containing literal values that should be injected into the
        ``where`` expression, keyed by the identifiers they replace.
        Values of collection type can be expanded in some cases; see
        :ref:`daf_butler_dimension_expressions_identifiers` for more
        information.
    expanded : `bool`, optional
        If `True` (default is `False`) then returned data IDs will have
        dimension records.
    explain : `bool`, optional
        If `True` (default) then `EmptyQueryResultError` exception is
        raised when resulting list is empty. The exception contains
        non-empty list of strings explaining possible causes for empty
        result.
    **kwargs
        Additional keyword arguments are forwarded to
        `DataCoordinate.standardize` when processing the ``data_id``
        argument (and may be used to provide a constraining data ID even
        when the ``data_id`` argument is `None`).

    Returns
    -------
    refs : `list` [`DatasetRef`]
        Dataset references matching the given query criteria. Nested data
        IDs are guaranteed to include values for all implied dimensions
        (i.e. `DataCoordinate.hasFull` will return `True`), but will
        include dimension records (`DataCoordinate.hasRecords`) only when
        ``expanded`` is `True`.

    Raises
    ------
    lsst.daf.butler.registry.DatasetTypeExpressionError
        Raised when ``dataset_type`` expression is invalid.
    lsst.daf.butler.registry.DataIdError
        Raised when ``data_id`` or keyword arguments specify unknown
        dimensions or values, or when they contain inconsistent values.
    lsst.daf.butler.registry.UserExpressionError
        Raised when ``where`` expression is invalid.
    lsst.daf.butler.EmptyQueryResultError
        Raised when query generates empty result and ``explain`` is set to
        `True`.
    TypeError
        Raised when the arguments are incompatible, such as when a
        collection wildcard is passed when ``find_first`` is `True`, or
        when ``collections`` is `None` and default butler collections are
        not defined.

    Notes
    -----
    When multiple dataset types are queried in a single call, the
    results of this operation are equivalent to querying for each dataset
    type separately in turn, and no information about the relationships
    between datasets of different types is included.
    """
    raise NotImplementedError()

@abstractmethod
def _query_dimension_records(
    self,
    element: str,
    *,
    data_id: DataId | None = None,
    where: str = "",
    bind: Mapping[str, Any] | None = None,
    order_by: Iterable[str] | str | None = None,
    limit: int | None = None,
    offset: int | None = None,
    explain: bool = True,
    **kwargs: Any,
) -> list[DimensionRecord]:
    """Query for dimension information matching user-provided criteria.

    Parameters
    ----------
    element : `str`
        The name of a dimension element to obtain records for.
    data_id : `dict` or `DataCoordinate`, optional
        A data ID whose key-value pairs are used as equality constraints
        in the query.
    where : `str`, optional
        A string expression similar to a SQL WHERE clause. See
        `queryDataIds` and :ref:`daf_butler_dimension_expressions` for more
        information.
    bind : `~collections.abc.Mapping`, optional
        Mapping containing literal values that should be injected into the
        ``where`` expression, keyed by the identifiers they replace.
        Values of collection type can be expanded in some cases; see
        :ref:`daf_butler_dimension_expressions_identifiers` for more
        information.
    order_by : `~collections.abc.Iterable` [`str`] or `str`, optional
        Names of the columns/dimensions to use for ordering returned data
        IDs. Column name can be prefixed with minus (``-``) to use
        descending ordering.
    limit : `int`, optional
        Upper limit on the number of returned records.
    offset : `int`, optional
        The number of records to skip before returning at most ``limit``
        records. If ``offset`` is specified then ``limit`` must be
        specified as well.
    explain : `bool`, optional
        If `True` (default) then `EmptyQueryResultError` exception is
        raised when resulting list is empty. The exception contains
        non-empty list of strings explaining possible causes for empty
        result.
    **kwargs
        Additional keyword arguments are forwarded to
        `DataCoordinate.standardize` when processing the ``data_id``
        argument (and may be used to provide a constraining data ID even
        when the ``data_id`` argument is `None`).

    Returns
    -------
    records : `list`[`DimensionRecord`]
        Dimension records matching the given query parameters.

    Raises
    ------
    lsst.daf.butler.registry.DataIdError
        Raised when ``data_id`` or keyword arguments specify unknown
        dimensions or values, or when they contain inconsistent values.
    lsst.daf.butler.registry.UserExpressionError
        Raised when ``where`` expression is invalid.
    lsst.daf.butler.EmptyQueryResultError
        Raised when query generates empty result and ``explain`` is set to
        `True`.
    TypeError
        Raised when the arguments are incompatible, e.g. ``offset`` is
        specified, but ``limit`` is not.
    """
    raise NotImplementedError()
20 changes: 19 additions & 1 deletion python/lsst/daf/butler/_exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Specialized Butler exceptions."""
__all__ = ("DatasetTypeNotSupportedError", "ValidationError")
__all__ = ("DatasetTypeNotSupportedError", "EmptyQueryResultError", "ValidationError")


class DatasetTypeNotSupportedError(RuntimeError):
Expand All @@ -43,3 +43,21 @@ class ValidationError(RuntimeError):
"""Some sort of validation error has occurred."""

pass


class EmptyQueryResultError(Exception):
    """Exception raised when query methods return an empty result and
    ``explain`` flag is set.

    Parameters
    ----------
    reasons : `list` [`str`]
        List of possible reasons for an empty query result.
    """

    def __init__(self, reasons: list[str]):
        # Forward to the base class so that ``args`` is populated; without
        # this the exception does not survive a pickle round-trip.
        super().__init__(reasons)
        # Possible reasons for an empty query result, one string per reason.
        self.reasons = reasons

    def __str__(self) -> str:
        # There may be multiple reasons, format them into multiple lines.
        return "Possible reasons for empty result:\n" + "\n".join(self.reasons)

0 comments on commit d41daf1

Please sign in to comment.