DM-43146: query system updates from the DM-42740 review #971

Merged · 7 commits · Mar 6, 2024
9 changes: 1 addition & 8 deletions python/lsst/daf/butler/__init__.py
@@ -79,14 +79,7 @@
from .progress import Progress

# Only import the main public symbols from queries
from .queries import (
ChainedDatasetQueryResults,
DataCoordinateQueryResults,
DatasetQueryResults,
DimensionRecordQueryResults,
Query,
SingleTypeDatasetQueryResults,
)
from .queries import DataCoordinateQueryResults, DatasetRefQueryResults, DimensionRecordQueryResults, Query

# Do not import or lift symbols from 'server' or 'server_models'.
# Import the registry subpackage directly for other symbols.
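After this change only four query symbols remain importable from the package root; a minimal sketch of the post-PR import, assuming the renamed public API is final:

# Sketch of the slimmed public API; DatasetRefQueryResults replaces the
# old SingleTypeDatasetQueryResults / ChainedDatasetQueryResults pair.
from lsst.daf.butler import (
    DataCoordinateQueryResults,
    DatasetRefQueryResults,
    DimensionRecordQueryResults,
    Query,
)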
13 changes: 4 additions & 9 deletions python/lsst/daf/butler/_butler.py
@@ -32,7 +32,6 @@
from abc import abstractmethod
from collections.abc import Collection, Iterable, Mapping, Sequence
from contextlib import AbstractContextManager
from types import EllipsisType
from typing import TYPE_CHECKING, Any, TextIO

from lsst.resources import ResourcePath, ResourcePathExpression
@@ -1531,7 +1530,7 @@ def _query_data_ids(

def _query_datasets(
self,
dataset_type: str | Iterable[str] | DatasetType | Iterable[DatasetType] | EllipsisType,
dataset_type: str | DatasetType,
collections: str | Iterable[str] | None = None,
*,
find_first: bool = True,
@@ -1546,12 +1545,8 @@ def _query_datasets(

Parameters
----------
dataset_type : dataset type expression
An expression that fully or partially identifies the dataset types
to be queried. Allowed types include `DatasetType`, `str`, and
iterables thereof. The special value ``...`` can be used to query
all dataset types. See :ref:`daf_butler_dataset_type_expressions`
for more information.
dataset_type : `str` or `DatasetType`
Dataset type object or name to search for.
collections : collection expression, optional
A collection name or iterable of collection names to search. If not
provided, the default collections are used. See
@@ -1592,7 +1587,7 @@

Returns
-------
refs : `.queries.DatasetQueryResults`
refs : `.queries.DatasetRefQueryResults`
Dataset references matching the given query criteria. Nested data
IDs are guaranteed to include values for all implied dimensions
(i.e. `DataCoordinate.hasFull` will return `True`).
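For orientation, a hedged sketch of how the narrowed signature is called; `_query_datasets` is private, and the repository path, collection, and dataset type names below are hypothetical:

from lsst.daf.butler import Butler

butler = Butler.from_config("/path/to/repo")  # hypothetical repo

# A single dataset type (str or DatasetType) is now required; iterables
# and the ``...`` wildcard are no longer accepted by this method.
refs = butler._query_datasets("calexp", collections=["HSC/runs/RC2"], find_first=True)
for ref in refs:
    print(ref.dataId, ref.run)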
7 changes: 7 additions & 0 deletions python/lsst/daf/butler/column_spec.py
@@ -38,6 +38,7 @@
"RegionColumnSpec",
"TimespanColumnSpec",
"ColumnType",
"COLLECTION_NAME_MAX_LENGTH",
)

import textwrap
@@ -58,6 +59,12 @@
]


COLLECTION_NAME_MAX_LENGTH = 64
# TODO: DM-42541 would be a good opportunity to move this constant to a
# better home; this file is the least-bad home I can think of for now. Note
# that actually changing the value is a (minor) schema change.


class _BaseColumnSpec(pydantic.BaseModel, ABC):
"""Base class for descriptions of table columns."""

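Downstream validation code can now share this constant instead of hard-coding 64; a minimal sketch (the helper below is hypothetical, not part of this PR):

from lsst.daf.butler.column_spec import COLLECTION_NAME_MAX_LENGTH

def check_collection_name(name: str) -> str:
    # Illustrative guard only; the registry performs its own validation.
    if len(name) > COLLECTION_NAME_MAX_LENGTH:
        raise ValueError(
            f"Collection name {name!r} is longer than "
            f"{COLLECTION_NAME_MAX_LENGTH} characters."
        )
    return name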
78 changes: 24 additions & 54 deletions python/lsst/daf/butler/queries/_base.py
@@ -27,7 +27,7 @@

from __future__ import annotations

__all__ = ("QueryBase", "HomogeneousQueryBase", "CountableQueryBase", "QueryResultsBase")
__all__ = ("QueryBase", "QueryResultsBase")

from abc import ABC, abstractmethod
from collections.abc import Iterable, Mapping, Set
@@ -45,9 +45,24 @@ class QueryBase(ABC):

This class should rarely be referenced directly; it is public only because
it provides public methods to its subclasses.

Parameters
----------
driver : `QueryDriver`
Implementation object that knows how to actually execute queries.
tree : `QueryTree`
Description of the query as a tree of joins and column expressions.
"""

@abstractmethod
def __init__(self, driver: QueryDriver, tree: QueryTree):
self._driver = driver
self._tree = tree

@property
def dimensions(self) -> DimensionGroup:
"""All dimensions included in the query's columns."""
return self._tree.dimensions

def any(self, *, execute: bool = True, exact: bool = True) -> bool:
"""Test whether the query would return any rows.

@@ -69,9 +84,8 @@ def any(self, *, execute: bool = True, exact: bool = True) -> bool:
`True` if the query would (or might, depending on arguments) yield
result rows. `False` if it definitely would not.
"""
raise NotImplementedError()
return self._driver.any(self._tree, execute=execute, exact=exact)

@abstractmethod
def explain_no_results(self, execute: bool = True) -> Iterable[str]:
"""Return human-readable messages that may help explain why the query
yields no results.
@@ -89,14 +103,14 @@ def explain_no_results(self, execute: bool = True) -> Iterable[str]:
String messages that describe reasons the query might not yield any
results.
"""
raise NotImplementedError()
return self._driver.explain_no_results(self._tree, execute=execute)

@abstractmethod
def where(
self,
*args: str | Predicate | DataId,
bind: Mapping[str, Any] | None = None,
**kwargs: Any,
**kwargs: int | str,
) -> Self:
"""Return a query with a boolean-expression filter on its rows.

@@ -137,46 +151,8 @@ def where(
raise NotImplementedError()
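With `any`, `explain_no_results`, and `where` now all living on `QueryBase`, every query and result object shares one diagnostic pattern; a sketch under the assumption of a `butler.query()` context manager and HSC-style dimension names (neither is part of this diff):

# Illustrative only; ``query.data_ids`` and the dimension and bind names
# are assumptions about the surrounding query API.
with butler.query() as query:
    data_ids = query.data_ids(["visit", "detector"]).where(
        "instrument = instr AND visit > visit_cut",
        bind={"instr": "HSC", "visit_cut": 9000},
    )
    if not data_ids.any(execute=True, exact=False):
        for message in data_ids.explain_no_results():
            print(message)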


class HomogeneousQueryBase(QueryBase):
"""Common base class for `Query` and query result classes that are
iterables with consistent dimensions throughout.

This class should rarely be referenced directly; it is public only because
it provides public methods to its subclasses.

Parameters
----------
driver : `QueryDriver`
Implementation object that knows how to actually execute queries.
tree : `QueryTree`
Description of the query as a tree of joins and column expressions.
"""

def __init__(self, driver: QueryDriver, tree: QueryTree):
self._driver = driver
self._tree = tree

@property
def dimensions(self) -> DimensionGroup:
"""All dimensions included in the query's columns."""
return self._tree.dimensions

def any(self, *, execute: bool = True, exact: bool = True) -> bool:
# Docstring inherited.
return self._driver.any(self._tree, execute=execute, exact=exact)

def explain_no_results(self, execute: bool = True) -> Iterable[str]:
# Docstring inherited.
return self._driver.explain_no_results(self._tree, execute=execute)


class CountableQueryBase(QueryBase):
"""Common base class for query result objects for which the number of
result rows is a well-defined concept.

This class should rarely be referenced directly; it is public only because
it provides public methods to its subclasses.
"""
class QueryResultsBase(QueryBase):
"""Common base class for query result objects with countable rows."""

@abstractmethod
def count(self, *, exact: bool = True, discard: bool = False) -> int:
@@ -204,12 +180,6 @@ def count(self, *, exact: bool = True, discard: bool = False) -> int:
"""
raise NotImplementedError()


class QueryResultsBase(HomogeneousQueryBase, CountableQueryBase):
"""Common base class for query result objects with homogeneous dimensions
and countable rows.
"""

def order_by(self, *args: str | OrderExpression | ExpressionProxy) -> Self:
"""Return a new query that yields ordered results.

@@ -262,7 +232,7 @@ def where(
self,
*args: str | Predicate | DataId,
bind: Mapping[str, Any] | None = None,
**kwargs: Any,
**kwargs: int | str,
) -> Self:
# Docstring inherited.
return self._copy(
@@ -281,7 +251,7 @@ def _get_datasets(self) -> Set[str]:
def _copy(self, tree: QueryTree, **kwargs: Any) -> Self:
"""Return a modified copy of ``self``.

Implementations should validate odifications, not assume they are
Implementations should validate modifications, not assume they are
correct.
"""
raise NotImplementedError()
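A sketch of the consolidated `QueryResultsBase` surface in use; the `query.datasets` entry point and the collection name are assumptions taken from the result class's docstring, not from this diff:

# Illustrative chaining of the QueryResultsBase methods.
with butler.query() as query:
    refs = query.datasets("calexp", collections=["HSC/runs/RC2"])
    refs = refs.where(instrument="HSC").order_by("visit")
    n_rows = refs.count(exact=True, discard=True)  # exact count; rows discarded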
140 changes: 13 additions & 127 deletions python/lsst/daf/butler/queries/_dataset_query_results.py
@@ -27,60 +27,24 @@

from __future__ import annotations

__all__ = (
"DatasetQueryResults",
"ChainedDatasetQueryResults",
"SingleTypeDatasetQueryResults",
)

import itertools
from abc import abstractmethod
from collections.abc import Iterable, Iterator, Mapping
__all__ = ("DatasetRefQueryResults",)

from collections.abc import Iterator
from typing import TYPE_CHECKING, Any, final

from .._dataset_ref import DatasetRef
from .._dataset_type import DatasetType
from ..dimensions import DataId
from ._base import CountableQueryBase, QueryResultsBase
from ._base import QueryResultsBase
from .driver import QueryDriver
from .result_specs import DataCoordinateResultSpec, DatasetRefResultSpec
from .tree import Predicate, QueryTree
from .tree import QueryTree

if TYPE_CHECKING:
from ._data_coordinate_query_results import DataCoordinateQueryResults


class DatasetQueryResults(CountableQueryBase, Iterable[DatasetRef]):
"""A query for `DatasetRef` results."""

@abstractmethod
def by_dataset_type(self) -> Iterator[SingleTypeDatasetQueryResults]:
"""Group results by dataset type.

Returns
-------
iter : `~collections.abc.Iterator` [ `SingleTypeDatasetQueryResults` ]
An iterator over `DatasetQueryResults` instances that are each
responsible for a single dataset type.
"""
raise NotImplementedError()

@property
@abstractmethod
def has_dimension_records(self) -> bool:
"""Whether all data IDs in this iterable contain dimension records."""
raise NotImplementedError()

@abstractmethod
def with_dimension_records(self) -> DatasetQueryResults:
"""Return a results object for which `has_dimension_records` is
`True`.
"""
raise NotImplementedError()


@final
class SingleTypeDatasetQueryResults(DatasetQueryResults, QueryResultsBase):
class DatasetRefQueryResults(QueryResultsBase):
"""A query for `DatasetRef` results with a single dataset type.

Parameters
@@ -134,103 +98,25 @@ def data_ids(self) -> DataCoordinateQueryResults:

@property
def has_dimension_records(self) -> bool:
# Docstring inherited.
"""Whether all data IDs in this iterable contain dimension records."""
return self._spec.include_dimension_records

def with_dimension_records(self) -> SingleTypeDatasetQueryResults:
# Docstring inherited.
def with_dimension_records(self) -> DatasetRefQueryResults:
"""Return a results object for which `has_dimension_records` is
`True`.
"""
if self.has_dimension_records:
return self
return self._copy(tree=self._tree, include_dimension_records=True)

def by_dataset_type(self) -> Iterator[SingleTypeDatasetQueryResults]:
# Docstring inherited.
return iter((self,))

def count(self, *, exact: bool = True, discard: bool = False) -> int:
# Docstring inherited.
return self._driver.count(self._tree, self._spec, exact=exact, discard=discard)

def _copy(self, tree: QueryTree, **kwargs: Any) -> SingleTypeDatasetQueryResults:
def _copy(self, tree: QueryTree, **kwargs: Any) -> DatasetRefQueryResults:
# Docstring inherited.
return SingleTypeDatasetQueryResults(self._driver, tree, self._spec.model_copy(update=kwargs))
return DatasetRefQueryResults(self._driver, tree, self._spec.model_copy(update=kwargs))

def _get_datasets(self) -> frozenset[str]:
# Docstring inherited.
return frozenset({self.dataset_type.name})
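A sketch of the single-type result object in use, including the dimension-record upgrade path (the repository details and the `records` access are assumptions):

with butler.query() as query:
    refs = query.datasets("calexp", collections=["HSC/runs/RC2"])
    if not refs.has_dimension_records:
        refs = refs.with_dimension_records()  # returns a new results object
    for ref in refs:
        # Each data ID should now carry expanded dimension records.
        print(ref.dataId.records["visit"])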


@final
class ChainedDatasetQueryResults(DatasetQueryResults):
"""A query for `DatasetRef` results with multiple dataset types.

Parameters
----------
by_dataset_type : `tuple` [ `SingleTypeDatasetQueryResults` ]
Tuple of single-dataset-type query result objects to combine.

Notes
-----
This class should never be constructed directly by users; use
`Query.datasets` instead.
"""

def __init__(self, by_dataset_type: tuple[SingleTypeDatasetQueryResults, ...]):
self._by_dataset_type = by_dataset_type

def __iter__(self) -> Iterator[DatasetRef]:
return itertools.chain.from_iterable(self._by_dataset_type)

def by_dataset_type(self) -> Iterator[SingleTypeDatasetQueryResults]:
# Docstring inherited.
return iter(self._by_dataset_type)

@property
def has_dimension_records(self) -> bool:
# Docstring inherited.
return all(single_type_results.has_dimension_records for single_type_results in self._by_dataset_type)

def with_dimension_records(self) -> ChainedDatasetQueryResults:
# Docstring inherited.
return ChainedDatasetQueryResults(
tuple(
[
single_type_results.with_dimension_records()
for single_type_results in self._by_dataset_type
]
)
)

def any(self, *, execute: bool = True, exact: bool = True) -> bool:
# Docstring inherited.
return any(
single_type_results.any(execute=execute, exact=exact)
for single_type_results in self._by_dataset_type
)

def explain_no_results(self, execute: bool = True) -> Iterable[str]:
# Docstring inherited.
messages: list[str] = []
for single_type_results in self._by_dataset_type:
messages.extend(single_type_results.explain_no_results(execute=execute))
return messages

def count(self, *, exact: bool = True, discard: bool = False) -> int:
# Docstring inherited.
return sum(
single_type_results.count(exact=exact, discard=discard)
for single_type_results in self._by_dataset_type
)

def where(
self, *args: DataId | str | Predicate, bind: Mapping[str, Any] | None = None, **kwargs: Any
) -> ChainedDatasetQueryResults:
# Docstring inherited.
return ChainedDatasetQueryResults(
tuple(
[
single_type_results.where(*args, bind=bind, **kwargs)
for single_type_results in self._by_dataset_type
]
)
)
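Because `ChainedDatasetQueryResults` is removed, callers that need several dataset types now issue one single-type query per type and concatenate the results themselves; a hedged sketch of the replacement pattern (the helper is hypothetical):

import itertools

# Illustrative stand-in for the removed chaining behavior: one
# DatasetRefQueryResults per dataset type, chained by the caller.
def query_many_types(query, dataset_types, collections):
    per_type = [query.datasets(dt, collections=collections) for dt in dataset_types]
    return itertools.chain.from_iterable(per_type)

for ref in query_many_types(query, ["calexp", "src"], ["HSC/runs/RC2"]):
    print(ref)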