Skip to content

Commit

Permalink
Merge pull request #914 from lsst/tickets/DM-41761
Browse files Browse the repository at this point in the history
DM-41761: Add Butler.query property
  • Loading branch information
andy-slac committed Dec 7, 2023
2 parents 3243903 + 0e06fc3 commit d41daf1
Show file tree
Hide file tree
Showing 20 changed files with 3,524 additions and 212 deletions.
3 changes: 3 additions & 0 deletions doc/changes/DM-41761.api.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Added the `Butler._query` context manager, which will support building complex queries for data in Butler.
For now `Butler._query` provides access to just three convenience methods similar to query methods in `Registry`.
This new API should be considered experimental and potentially unstable; its use should be limited to downstream middleware code for now.
2 changes: 2 additions & 0 deletions python/lsst/daf/butler/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@
from ._named import *
from ._quantum import *
from ._quantum_backed import *
from ._query import *
from ._query_results import *
from ._storage_class import *
from ._storage_class_delegate import *
from ._timespan import *
Expand Down
304 changes: 291 additions & 13 deletions python/lsst/daf/butler/_butler.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@
__all__ = ["Butler"]

from abc import abstractmethod
from collections.abc import Collection, Iterable, Sequence
from collections.abc import Collection, Iterable, Mapping, Sequence
from contextlib import AbstractContextManager
from typing import Any, TextIO
from typing import TYPE_CHECKING, Any, TextIO

from lsst.resources import ResourcePath, ResourcePathExpression
from lsst.utils import doImportType
Expand All @@ -41,19 +41,25 @@
from ._butler_config import ButlerConfig
from ._butler_repo_index import ButlerRepoIndex
from ._config import Config, ConfigSubset
from ._dataset_existence import DatasetExistence
from ._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
from ._dataset_type import DatasetType
from ._deferredDatasetHandle import DeferredDatasetHandle
from ._file_dataset import FileDataset
from ._limited_butler import LimitedButler
from ._storage_class import StorageClass
from ._timespan import Timespan
from .datastore import DatasetRefURIs, Datastore
from .dimensions import DataId, DimensionConfig
from .registry import Registry, RegistryConfig, _RegistryFactory
from .datastore import Datastore
from .dimensions import DimensionConfig
from .registry import RegistryConfig, _RegistryFactory
from .repo_relocation import BUTLER_ROOT_TAG
from .transfers import RepoExportContext

if TYPE_CHECKING:
from ._dataset_existence import DatasetExistence
from ._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
from ._dataset_type import DatasetType
from ._deferredDatasetHandle import DeferredDatasetHandle
from ._file_dataset import FileDataset
from ._query import Query
from ._storage_class import StorageClass
from ._timespan import Timespan
from .datastore import DatasetRefURIs
from .dimensions import DataCoordinate, DataId, DimensionGroup, DimensionRecord
from .registry import CollectionArgType, Registry
from .transfers import RepoExportContext

_LOG = getLogger(__name__)

Expand Down Expand Up @@ -1357,3 +1363,275 @@ def registry(self) -> Registry:
will be replaced by equivalent `Butler` methods.
"""
raise NotImplementedError()

@abstractmethod
def _query(self) -> AbstractContextManager[Query]:
    """Return a context manager wrapping a `Query` object, which is used
    to build and execute complex queries against data in this Butler.
    """
    raise NotImplementedError()

@abstractmethod
def _query_data_ids(
    self,
    dimensions: DimensionGroup | Iterable[str] | str,
    *,
    data_id: DataId | None = None,
    where: str = "",
    bind: Mapping[str, Any] | None = None,
    expanded: bool = False,
    order_by: Iterable[str] | str | None = None,
    limit: int | None = None,
    offset: int | None = None,
    explain: bool = True,
    **kwargs: Any,
) -> list[DataCoordinate]:
    """Query for data IDs matching user-provided criteria.

    Parameters
    ----------
    dimensions : `DimensionGroup`, `str`, or \
            `~collections.abc.Iterable` [`str`]
        The dimensions of the data IDs to yield, as either `DimensionGroup`
        instances or `str`. Will be automatically expanded to a complete
        `DimensionGroup`.
    data_id : `dict` or `DataCoordinate`, optional
        A data ID whose key-value pairs are used as equality constraints
        in the query.
    where : `str`, optional
        A string expression similar to a SQL WHERE clause. May involve
        any column of a dimension table or (as a shortcut for the primary
        key column of a dimension table) dimension name. See
        :ref:`daf_butler_dimension_expressions` for more information.
    bind : `~collections.abc.Mapping`, optional
        Mapping containing literal values that should be injected into the
        ``where`` expression, keyed by the identifiers they replace.
        Values of collection type can be expanded in some cases; see
        :ref:`daf_butler_dimension_expressions_identifiers` for more
        information.
    expanded : `bool`, optional
        If `True` (default is `False`) then returned data IDs will have
        dimension records.
    order_by : `~collections.abc.Iterable` [`str`] or `str`, optional
        Names of the columns/dimensions to use for ordering returned data
        IDs. Column name can be prefixed with minus (``-``) to use
        descending ordering.
    limit : `int`, optional
        Upper limit on the number of returned records.
    offset : `int`, optional
        The number of records to skip before returning at most ``limit``
        records. If ``offset`` is specified then ``limit`` must be
        specified as well.
    explain : `bool`, optional
        If `True` (default) then `EmptyQueryResultError` exception is
        raised when resulting list is empty. The exception contains
        non-empty list of strings explaining possible causes for empty
        result.
    **kwargs
        Additional keyword arguments are forwarded to
        `DataCoordinate.standardize` when processing the ``data_id``
        argument (and may be used to provide a constraining data ID even
        when the ``data_id`` argument is `None`).

    Returns
    -------
    dataIds : `list` [`DataCoordinate`]
        Data IDs matching the given query parameters. These are always
        guaranteed to identify all dimensions (`DataCoordinate.hasFull`
        returns `True`).

    Raises
    ------
    lsst.daf.butler.registry.DataIdError
        Raised when ``data_id`` or keyword arguments specify unknown
        dimensions or values, or when they contain inconsistent values.
    lsst.daf.butler.registry.UserExpressionError
        Raised when ``where`` expression is invalid.
    lsst.daf.butler.EmptyQueryResultError
        Raised when query generates empty result and ``explain`` is set to
        `True`.
    TypeError
        Raised when the arguments are incompatible, e.g. ``offset`` is
        specified, but ``limit`` is not.
    """
    raise NotImplementedError()

@abstractmethod
def _query_datasets(
    self,
    dataset_type: Any,
    collections: CollectionArgType | None = None,
    *,
    find_first: bool = True,
    data_id: DataId | None = None,
    where: str = "",
    bind: Mapping[str, Any] | None = None,
    expanded: bool = False,
    explain: bool = True,
    **kwargs: Any,
) -> list[DatasetRef]:
    """Query for dataset references matching user-provided criteria.

    Parameters
    ----------
    dataset_type : dataset type expression
        An expression that fully or partially identifies the dataset types
        to be queried. Allowed types include `DatasetType`, `str`,
        `re.Pattern`, and iterables thereof. The special value ``...`` can
        be used to query all dataset types. See
        :ref:`daf_butler_dataset_type_expressions` for more information.
    collections : collection expression, optional
        An expression that identifies the collections to search, such as a
        `str` (for full matches or partial matches via globs), `re.Pattern`
        (for partial matches), or iterable thereof. ``...`` can be used to
        search all collections (actually just all `~CollectionType.RUN`
        collections, because this will still find all datasets).
        If not provided, the default collections are used. See
        :ref:`daf_butler_collection_expressions` for more information.
    find_first : `bool`, optional
        If `True` (default), for each result data ID, only yield one
        `DatasetRef` of each `DatasetType`, from the first collection in
        which a dataset of that dataset type appears (according to the
        order of ``collections`` passed in). If `True`, ``collections``
        must not contain regular expressions and may not be ``...``.
    data_id : `dict` or `DataCoordinate`, optional
        A data ID whose key-value pairs are used as equality constraints
        in the query.
    where : `str`, optional
        A string expression similar to a SQL WHERE clause. May involve
        any column of a dimension table or (as a shortcut for the primary
        key column of a dimension table) dimension name. See
        :ref:`daf_butler_dimension_expressions` for more information.
    bind : `~collections.abc.Mapping`, optional
        Mapping containing literal values that should be injected into the
        ``where`` expression, keyed by the identifiers they replace.
        Values of collection type can be expanded in some cases; see
        :ref:`daf_butler_dimension_expressions_identifiers` for more
        information.
    expanded : `bool`, optional
        If `True` (default is `False`) then returned data IDs will have
        dimension records.
    explain : `bool`, optional
        If `True` (default) then `EmptyQueryResultError` exception is
        raised when resulting list is empty. The exception contains
        non-empty list of strings explaining possible causes for empty
        result.
    **kwargs
        Additional keyword arguments are forwarded to
        `DataCoordinate.standardize` when processing the ``data_id``
        argument (and may be used to provide a constraining data ID even
        when the ``data_id`` argument is `None`).

    Returns
    -------
    refs : `list` [`DatasetRef`]
        Dataset references matching the given query criteria. Nested data
        IDs are guaranteed to include values for all implied dimensions
        (i.e. `DataCoordinate.hasFull` will return `True`), but will
        include dimension records (`DataCoordinate.hasRecords`) only when
        ``expanded`` is `True`.

    Raises
    ------
    lsst.daf.butler.registry.DatasetTypeExpressionError
        Raised when ``dataset_type`` expression is invalid.
    lsst.daf.butler.registry.DataIdError
        Raised when ``data_id`` or keyword arguments specify unknown
        dimensions or values, or when they contain inconsistent values.
    lsst.daf.butler.registry.UserExpressionError
        Raised when ``where`` expression is invalid.
    lsst.daf.butler.EmptyQueryResultError
        Raised when query generates empty result and ``explain`` is set to
        `True`.
    TypeError
        Raised when the arguments are incompatible, such as when a
        collection wildcard is passed when ``find_first`` is `True`, or
        when ``collections`` is `None` and default butler collections are
        not defined.

    Notes
    -----
    When multiple dataset types are queried in a single call, the
    results of this operation are equivalent to querying for each dataset
    type separately in turn, and no information about the relationships
    between datasets of different types is included.
    """
    raise NotImplementedError()

@abstractmethod
def _query_dimension_records(
    self,
    element: str,
    *,
    data_id: DataId | None = None,
    where: str = "",
    bind: Mapping[str, Any] | None = None,
    order_by: Iterable[str] | str | None = None,
    limit: int | None = None,
    offset: int | None = None,
    explain: bool = True,
    **kwargs: Any,
) -> list[DimensionRecord]:
    """Query for dimension information matching user-provided criteria.

    Parameters
    ----------
    element : `str`
        The name of a dimension element to obtain records for.
    data_id : `dict` or `DataCoordinate`, optional
        A data ID whose key-value pairs are used as equality constraints
        in the query.
    where : `str`, optional
        A string expression similar to a SQL WHERE clause. See
        `queryDataIds` and :ref:`daf_butler_dimension_expressions` for more
        information.
    bind : `~collections.abc.Mapping`, optional
        Mapping containing literal values that should be injected into the
        ``where`` expression, keyed by the identifiers they replace.
        Values of collection type can be expanded in some cases; see
        :ref:`daf_butler_dimension_expressions_identifiers` for more
        information.
    order_by : `~collections.abc.Iterable` [`str`] or `str`, optional
        Names of the columns/dimensions to use for ordering returned data
        IDs. Column name can be prefixed with minus (``-``) to use
        descending ordering.
    limit : `int`, optional
        Upper limit on the number of returned records.
    offset : `int`, optional
        The number of records to skip before returning at most ``limit``
        records. If ``offset`` is specified then ``limit`` must be
        specified as well.
    explain : `bool`, optional
        If `True` (default) then `EmptyQueryResultError` exception is
        raised when resulting list is empty. The exception contains
        non-empty list of strings explaining possible causes for empty
        result.
    **kwargs
        Additional keyword arguments are forwarded to
        `DataCoordinate.standardize` when processing the ``data_id``
        argument (and may be used to provide a constraining data ID even
        when the ``data_id`` argument is `None`).

    Returns
    -------
    records : `list`[`DimensionRecord`]
        Dimension records matching the given query parameters.

    Raises
    ------
    lsst.daf.butler.registry.DataIdError
        Raised when ``data_id`` or keyword arguments specify unknown
        dimensions or values, or when they contain inconsistent values.
    lsst.daf.butler.registry.UserExpressionError
        Raised when ``where`` expression is invalid.
    lsst.daf.butler.EmptyQueryResultError
        Raised when query generates empty result and ``explain`` is set to
        `True`.
    TypeError
        Raised when the arguments are incompatible, e.g. ``offset`` is
        specified, but ``limit`` is not.
    """
    raise NotImplementedError()
20 changes: 19 additions & 1 deletion python/lsst/daf/butler/_exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Specialized Butler exceptions."""
__all__ = ("DatasetTypeNotSupportedError", "ValidationError")
__all__ = ("DatasetTypeNotSupportedError", "EmptyQueryResultError", "ValidationError")


class DatasetTypeNotSupportedError(RuntimeError):
Expand All @@ -43,3 +43,21 @@ class ValidationError(RuntimeError):
"""Some sort of validation error has occurred."""

pass


class EmptyQueryResultError(Exception):
    """Exception raised when query methods return an empty result and
    ``explain`` flag is set.

    Parameters
    ----------
    reasons : `list` [`str`]
        List of possible reasons for an empty query result.
    """

    def __init__(self, reasons: list[str]):
        # Forward to the base class so that ``args`` is populated; without
        # this the exception does not survive a pickle round-trip.
        super().__init__(reasons)
        # Possible reasons for an empty query result, one string per reason.
        self.reasons = reasons

    def __str__(self) -> str:
        # There may be multiple reasons, format them into multiple lines.
        return "Possible reasons for empty result:\n" + "\n".join(self.reasons)

0 comments on commit d41daf1

Please sign in to comment.