DM-43526: Implement registry collection methods for RemoteButler #983

Merged (6 commits) on Mar 28, 2024
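This PR implements several Registry collection methods for RemoteButler: getCollectionType, getCollectionChain, getCollectionParentChains, getCollectionDocumentation, getCollectionSummary, and queryCollections (glob patterns only; regex is not supported). A minimal usage sketch follows; the repository label and collection names are hypothetical.

from lsst.daf.butler import Butler, CollectionType

butler = Butler("dp02-remote")  # hypothetical label assumed to resolve to a RemoteButler server
registry = butler.registry

# Glob-based collection search (RemoteButler does not accept regex patterns).
print(list(registry.queryCollections("HSC/runs/*", collectionTypes=CollectionType.RUN)))

# Per-collection metadata now available through the client.
print(registry.getCollectionType("HSC/defaults"))         # e.g. CollectionType.CHAINED
print(registry.getCollectionChain("HSC/defaults"))        # ordered child collections
print(registry.getCollectionParentChains("HSC/raw/all"))  # chains containing this collection
print(registry.getCollectionDocumentation("HSC/defaults"))
print(registry.getCollectionSummary("HSC/raw/all"))       # dataset types + governor values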
12 changes: 11 additions & 1 deletion python/lsst/daf/butler/_exceptions.py
@@ -34,10 +34,11 @@
"DatasetTypeNotSupportedError",
"EmptyQueryResultError",
"MissingDatasetTypeError",
"MissingCollectionError",
"ValidationError",
)

from ._exceptions_legacy import DataIdError, DatasetTypeError
from ._exceptions_legacy import CollectionError, DataIdError, DatasetTypeError


class ButlerUserError(Exception):
@@ -92,6 +93,14 @@ class DimensionNameError(KeyError, DataIdError, ButlerUserError):
error_type = "dimension_name"


class MissingCollectionError(CollectionError, ButlerUserError):
"""Exception raised when an operation attempts to use a collection that
does not exist.
"""

error_type = "missing_collection"


class MissingDatasetTypeError(DatasetTypeError, KeyError, ButlerUserError):
"""Exception raised when a dataset type does not exist."""

@@ -145,6 +154,7 @@ class UnknownButlerUserError(ButlerUserError):
CalibrationLookupError,
DimensionNameError,
DatasetNotFoundError,
MissingCollectionError,
MissingDatasetTypeError,
UnknownButlerUserError,
)
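MissingCollectionError now lives with the other ButlerUserError subclasses so the server can relay it to remote clients, while its CollectionError base stays behind in _exceptions_legacy; both remain importable from lsst.daf.butler.registry. A small compatibility sketch, assuming an existing repository at a made-up path:

from lsst.daf.butler import Butler
from lsst.daf.butler.registry import CollectionError, MissingCollectionError

butler = Butler("/path/to/repo")  # hypothetical existing repository
try:
    butler.registry.getCollectionType("no/such/collection")
except CollectionError as err:
    # Handlers written against the old registry exception keep working,
    # because the relocated class still derives from CollectionError.
    assert isinstance(err, MissingCollectionError)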
4 changes: 4 additions & 0 deletions python/lsst/daf/butler/_exceptions_legacy.py
@@ -55,3 +55,7 @@ class DataIdError(RegistryError):

class DatasetTypeError(RegistryError):
"""Exception raised for problems with dataset types."""


class CollectionError(RegistryError):
"""Exception raised for collection-related errors."""
4 changes: 2 additions & 2 deletions python/lsst/daf/butler/registry/__init__.py
@@ -27,8 +27,8 @@

# Re-export some top-level exception types for backwards compatibility -- these
# used to be part of registry.
from .._exceptions import DimensionNameError, MissingDatasetTypeError
from .._exceptions_legacy import DataIdError, DatasetTypeError, RegistryError
from .._exceptions import DimensionNameError, MissingCollectionError, MissingDatasetTypeError
from .._exceptions_legacy import CollectionError, DataIdError, DatasetTypeError, RegistryError

# Registry imports.
from . import interfaces, managers, queries, wildcards
38 changes: 33 additions & 5 deletions python/lsst/daf/butler/registry/_collection_summary.py
@@ -26,16 +26,18 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("CollectionSummary",)
__all__ = ("CollectionSummary", "SerializedCollectionSummary")

import dataclasses
from collections.abc import Generator, Iterable, Mapping, Set
from typing import cast

import pydantic

from .._dataset_ref import DatasetRef
from .._dataset_type import DatasetType
from .._dataset_type import DatasetType, SerializedDatasetType
from .._named import NamedValueSet
from ..dimensions import DataCoordinate
from ..dimensions import DataCoordinate, DimensionUniverse


@dataclasses.dataclass
@@ -52,8 +54,7 @@ def copy(self) -> CollectionSummary:
at all.
"""
return CollectionSummary(
dataset_types=self.dataset_types.copy(),
governors={k: v.copy() for k, v in self.governors.items()},
dataset_types=self.dataset_types.copy(), governors=_copy_governors(self.governors)
)

def add_datasets_generator(self, refs: Iterable[DatasetRef]) -> Generator[DatasetRef, None, None]:
@@ -223,6 +224,21 @@ def is_compatible_with(
return False
return True

def to_simple(self) -> SerializedCollectionSummary:
return SerializedCollectionSummary(
dataset_types=[x.to_simple() for x in self.dataset_types],
governors=_copy_governors(self.governors),
)

@staticmethod
def from_simple(simple: SerializedCollectionSummary, universe: DimensionUniverse) -> CollectionSummary:
summary = CollectionSummary()
summary.dataset_types = NamedValueSet(
[DatasetType.from_simple(x, universe) for x in simple.dataset_types]
)
summary.governors = _copy_governors(simple.governors)
return summary

dataset_types: NamedValueSet[DatasetType] = dataclasses.field(default_factory=NamedValueSet)
"""Dataset types that may be present in the collection
(`NamedValueSet` [ `DatasetType` ]).
@@ -241,3 +257,15 @@ def is_compatible_with(
IDs, and hence the values of those data IDs are unconstrained by this
collection in the query.
"""


def _copy_governors(governors: dict[str, set[str]]) -> dict[str, set[str]]:
"""Make an independent copy of the 'governors' data structure."""
return {k: v.copy() for k, v in governors.items()}


class SerializedCollectionSummary(pydantic.BaseModel):
"""Serialized version of CollectionSummary."""

dataset_types: list[SerializedDatasetType]
governors: dict[str, set[str]]
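The new to_simple()/from_simple() hooks and the SerializedCollectionSummary model let a collection summary travel over the Butler server API. A minimal round-trip sketch, where the "bias" dataset type and the governor value are placeholders invented for illustration:

from lsst.daf.butler import DatasetType, DimensionUniverse
from lsst.daf.butler.registry import CollectionSummary

universe = DimensionUniverse()
bias = DatasetType(
    "bias",
    dimensions=["instrument", "detector"],
    storageClass="ExposureF",
    universe=universe,
)

summary = CollectionSummary()
summary.dataset_types.add(bias)            # NamedValueSet of DatasetType
summary.governors["instrument"] = {"HSC"}  # governor dimension values seen

serialized = summary.to_simple()           # pydantic SerializedCollectionSummary
restored = CollectionSummary.from_simple(serialized, universe)

assert restored.dataset_types.names == summary.dataset_types.names
assert restored.governors == summary.governors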
2 changes: 1 addition & 1 deletion python/lsst/daf/butler/registry/_defaults.py
@@ -35,9 +35,9 @@

from lsst.utils.classes import immutable

from .._exceptions import MissingCollectionError
from ..dimensions import DataCoordinate
from ._collection_summary import CollectionSummary
from ._exceptions import MissingCollectionError
from .wildcards import CollectionWildcard

if TYPE_CHECKING:
14 changes: 1 addition & 13 deletions python/lsst/daf/butler/registry/_exceptions.py
@@ -28,14 +28,12 @@

__all__ = (
"ArgumentError",
"CollectionError",
"CollectionExpressionError",
"CollectionTypeError",
"ConflictingDefinitionError",
"DataIdValueError",
"DatasetTypeExpressionError",
"InconsistentDataIdError",
"MissingCollectionError",
"MissingSpatialOverlapError",
"NoDefaultCollectionError",
"OrphanedRecordError",
@@ -45,7 +43,7 @@
"UserExpressionSyntaxError",
)

from .._exceptions_legacy import DataIdError, RegistryError
from .._exceptions_legacy import CollectionError, DataIdError, RegistryError


class ArgumentError(RegistryError):
@@ -66,10 +64,6 @@ class InconsistentDataIdError(DataIdError):
"""


class CollectionError(RegistryError):
"""Exception raised for collection-related errors."""


class CollectionTypeError(CollectionError):
"""Exception raised when type of a collection is incorrect."""

@@ -78,12 +72,6 @@ class CollectionExpressionError(CollectionError):
"""Exception raised for an incorrect collection expression."""


class MissingCollectionError(CollectionError):
"""Exception raised when an operation attempts to use a collection that
does not exist.
"""


class NoDefaultCollectionError(CollectionError):
"""Exception raised when a collection is needed, but collection argument
is not provided and default collection is not defined in registry.
2 changes: 1 addition & 1 deletion python/lsst/daf/butler/registry/collections/_base.py
@@ -38,9 +38,9 @@

import sqlalchemy

from ..._exceptions import MissingCollectionError
from ...timespan_database_representation import TimespanDatabaseRepresentation
from .._collection_type import CollectionType
from .._exceptions import MissingCollectionError
from ..interfaces import ChainedCollectionRecord, CollectionManager, CollectionRecord, RunRecord, VersionTuple
from ..wildcards import CollectionWildcard

2 changes: 1 addition & 1 deletion python/lsst/daf/butler/registry/sql_registry.py
@@ -494,7 +494,7 @@ def getCollectionType(self, name: str) -> CollectionType:
"""
return self._managers.collections.find(name).type

def _get_collection_record(self, name: str) -> CollectionRecord:
def get_collection_record(self, name: str) -> CollectionRecord:
"""Return the record for this collection.

Parameters
27 changes: 22 additions & 5 deletions python/lsst/daf/butler/registry/tests/_registry.py
@@ -56,7 +56,7 @@
from ..._dataset_association import DatasetAssociation
from ..._dataset_ref import DatasetIdFactory, DatasetIdGenEnum, DatasetRef
from ..._dataset_type import DatasetType
from ..._exceptions import MissingDatasetTypeError
from ..._exceptions import MissingCollectionError, MissingDatasetTypeError
from ..._exceptions_legacy import DatasetTypeError
from ..._storage_class import StorageClass
from ..._timespan import Timespan
@@ -72,7 +72,6 @@
DataIdValueError,
DatasetTypeExpressionError,
InconsistentDataIdError,
MissingCollectionError,
NoDefaultCollectionError,
OrphanedRecordError,
)
@@ -100,6 +99,10 @@ class RegistryTests(ABC):
in default configuration (`str` or `dict`).
"""

supportsCollectionRegex: bool = True
"""True if the registry class being tested supports regex searches for
collections."""

@classmethod
@abstractmethod
def getDataDir(cls) -> str:
@@ -770,16 +773,30 @@ def testCollections(self):
registry.setCollectionChain(chain2, [run2, chain1])
self.assertEqual(registry.getCollectionParentChains(chain1), {chain2})
self.assertEqual(registry.getCollectionParentChains(run2), {chain1, chain2})
# Query for collections matching a regex.

if self.supportsCollectionRegex:
# Query for collections matching a regex.
self.assertCountEqual(
list(registry.queryCollections(re.compile("imported_."), flattenChains=False)),
["imported_r", "imported_g"],
)
# Query for collections matching a regex or an explicit str.
self.assertCountEqual(
list(registry.queryCollections([re.compile("imported_."), "chain1"], flattenChains=False)),
["imported_r", "imported_g", "chain1"],
)
# Same queries as the regex ones above, but using globs instead of
# regex.
self.assertCountEqual(
list(registry.queryCollections(re.compile("imported_."), flattenChains=False)),
list(registry.queryCollections("imported_*", flattenChains=False)),
["imported_r", "imported_g"],
)
# Query for collections matching a regex or an explicit str.
self.assertCountEqual(
list(registry.queryCollections([re.compile("imported_."), "chain1"], flattenChains=False)),
list(registry.queryCollections(["imported_*", "chain1"], flattenChains=False)),
["imported_r", "imported_g", "chain1"],
)

# Search for bias with dataId1 should find it via tag1 in chain2,
# recursing, because it is not in run1.
self.assertIsNone(registry.findDataset(datasetType, dataId1, collections=run2))
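The supportsCollectionRegex flag added above lets concrete test cases opt out of the regex assertions in testCollections(). A hypothetical subclass for a registry whose server API only accepts glob patterns might look like this (the class name and omitted setup are made up):

import unittest

from lsst.daf.butler.registry.tests import RegistryTests


class RemoteRegistryTestCase(RegistryTests, unittest.TestCase):
    """Hypothetical test case for a glob-only registry backend."""

    supportsCollectionRegex = False

    # The RegistryTests abstract hooks (such as getDataDir) would still need
    # real implementations here; they are omitted in this sketch.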
30 changes: 24 additions & 6 deletions python/lsst/daf/butler/remote_butler/_registry.py
@@ -31,6 +31,8 @@
from collections.abc import Iterable, Iterator, Mapping, Sequence
from typing import Any

from lsst.utils.iteration import ensure_iterable

from .._dataset_association import DatasetAssociation
from .._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
from .._dataset_type import DatasetType
@@ -51,13 +53,15 @@
CollectionArgType,
CollectionSummary,
CollectionType,
CollectionTypeError,
DatasetTypeError,
Registry,
RegistryDefaults,
)
from ..registry.queries import DataCoordinateQueryResults, DatasetQueryResults, DimensionRecordQueryResults
from ..remote_butler import RemoteButler
from ._collection_args import convert_collection_arg_to_glob_string_list
from .server_models import QueryCollectionsRequestModel


class RemoteButlerRegistry(Registry):
@@ -106,7 +110,7 @@
raise NotImplementedError()

def getCollectionType(self, name: str) -> CollectionType:
raise NotImplementedError()
return self._butler._get_collection_info(name).type

def registerRun(self, name: str, doc: str | None = None) -> bool:
raise NotImplementedError()
@@ -115,22 +119,28 @@
raise NotImplementedError()

def getCollectionChain(self, parent: str) -> Sequence[str]:
raise NotImplementedError()
info = self._butler._get_collection_info(parent)
if info.type is not CollectionType.CHAINED:
raise CollectionTypeError(f"Collection '{parent}' has type {info.type.name}, not CHAINED.")

return info.children
(Codecov warning: added line 124 was not covered by tests.)

def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = False) -> None:
raise NotImplementedError()

def getCollectionParentChains(self, collection: str) -> set[str]:
raise NotImplementedError()
info = self._butler._get_collection_info(collection, include_parents=True)
assert info.parents is not None, "Requested list of parents from server, but it did not send them."
return info.parents

def getCollectionDocumentation(self, collection: str) -> str | None:
raise NotImplementedError()
info = self._butler._get_collection_info(collection, include_doc=True)
return info.doc

def setCollectionDocumentation(self, collection: str, doc: str | None) -> None:
raise NotImplementedError()

def getCollectionSummary(self, collection: str) -> CollectionSummary:
raise NotImplementedError()
return self._butler._get_collection_summary(collection)

def registerDatasetType(self, datasetType: DatasetType) -> bool:
raise NotImplementedError()
@@ -271,7 +281,15 @@
flattenChains: bool = False,
includeChains: bool | None = None,
) -> Sequence[str]:
raise NotImplementedError()
if includeChains is None:
includeChains = not flattenChains
query = QueryCollectionsRequestModel(
search=convert_collection_arg_to_glob_string_list(expression),
collection_types=list(ensure_iterable(collectionTypes)),
flatten_chains=flattenChains,
include_chains=includeChains,
)
return self._butler._query_collections(query).collections

Review discussion on queryCollections:

Member: @TallJimbo what's the plan for the datasetType parameter? It's not in this implementation and is ignored in sql registry, but should the client code forward it to the server on the assumption that at some point the direct butler might use it? Or are we okay acting like it doesn't exist? (The documentation doesn't seem to say that it's ignored, and sql_registry mentions DM-24939, which is Done but hasn't fixed the problem.)

Contributor (author): If it gets implemented in DirectButler someday it can get added here at the same time. It sounds like we're leaning towards re-designing this interface entirely, so we can drop it from the new interface or make it actually work there. I don't like having unused variables/features hanging around, because their existence implies to future readers that they are used for something, which can end up wasting a lot of time.

Member: I think we drop it here, and RFC dropping it from DirectButler (and the CLI) when we RFC dropping regex support there.
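Following the implementation above, a short usage sketch of the includeChains default; the repository label and collection name are hypothetical:

from lsst.daf.butler import Butler

registry = Butler("dp02-remote").registry  # hypothetical server-backed repository

# flattenChains=False (the default) -> includeChains defaults to True, so the
# CHAINED collection itself appears in the results.
print(list(registry.queryCollections("HSC/defaults")))

# flattenChains=True -> includeChains defaults to False, so the chain is
# expanded into its child collections instead.
print(list(registry.queryCollections("HSC/defaults", flattenChains=True)))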

def queryDatasets(
self,