Skip to content

Commit

Permalink
Merge pull request #454 from lsst/tickets/DM-27153
Browse files Browse the repository at this point in the history
DM-27153: default instrument (and skymap) in butler (and registry!), when possible
  • Loading branch information
TallJimbo committed Jan 12, 2021
2 parents 2fe403e + 7b67175 commit 22f64db
Show file tree
Hide file tree
Showing 27 changed files with 1,895 additions and 1,027 deletions.
2 changes: 1 addition & 1 deletion python/lsst/daf/butler/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from .core import *
# Import the registry subpackage directly for other symbols.
from .registry import Registry, RegistryConfig, CollectionType, CollectionSearch, DatasetTypeRestriction
from .registry import Registry, RegistryConfig, CollectionType, CollectionSearch
from ._butlerConfig import *
from ._deferredDatasetHandle import *
from ._butler import *
Expand Down
268 changes: 85 additions & 183 deletions python/lsst/daf/butler/_butler.py

Large diffs are not rendered by default.

120 changes: 113 additions & 7 deletions python/lsst/daf/butler/core/dimensions/_coordinate.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
)

from lsst.sphgeom import Region
from ..named import NamedKeyMapping, NameLookupMapping, NamedValueAbstractSet
from ..named import NamedKeyDict, NamedKeyMapping, NameLookupMapping, NamedValueAbstractSet
from ..timespan import Timespan
from ._elements import Dimension, DimensionElement
from ._graph import DimensionGraph
Expand Down Expand Up @@ -133,6 +133,7 @@ def standardize(
*,
graph: Optional[DimensionGraph] = None,
universe: Optional[DimensionUniverse] = None,
defaults: Optional[DataCoordinate] = None,
**kwargs: Any
) -> DataCoordinate:
"""Adapt an arbitrary mapping and/or additional arguments into a true
Expand All @@ -145,12 +146,16 @@ def standardize(
their primary key values (may also be a true `DataCoordinate`).
graph : `DimensionGraph`
The dimensions to be identified by the new `DataCoordinate`.
If not provided, will be inferred from the keys of ``mapping``,
and ``universe`` must be provided unless ``mapping`` is already a
`DataCoordinate`.
If not provided, will be inferred from the keys of ``mapping`` and
``**kwargs``, and ``universe`` must be provided unless ``mapping``
is already a `DataCoordinate`.
universe : `DimensionUniverse`
All known dimensions and their relationships; used to expand
and validate dependencies when ``graph`` is not provided.
defaults : `DataCoordinate`, optional
Default dimension key-value pairs to use when needed. These are
never used to infer ``graph``, and are ignored if a different value
is provided for the same key in ``mapping`` or `**kwargs``.
**kwargs
Additional keyword arguments are treated like additional key-value
pairs in ``mapping``.
Expand Down Expand Up @@ -191,11 +196,20 @@ def standardize(
d.update(mapping)
d.update(kwargs)
if graph is None:
if universe is None:
if defaults is not None:
universe = defaults.universe
elif universe is None:
raise TypeError("universe must be provided if graph is not.")
graph = DimensionGraph(universe, names=d.keys())
if not graph.dimensions:
return DataCoordinate.makeEmpty(graph.universe)
if defaults is not None:
if defaults.hasFull():
for k, v in defaults.full.items():
d.setdefault(k.name, v)
else:
for k, v in defaults.items():
d.setdefault(k.name, v)
if d.keys() >= graph.dimensions.names:
values = tuple(d[name] for name in graph._dataCoordinateIndices.keys())
else:
Expand Down Expand Up @@ -301,7 +315,7 @@ def __repr__(self) -> str:
# quote its keys: that's both more compact and something that can't
# be mistaken for an actual dict or something that could be exec'd.
terms = [f"{d}: {self[d]!r}" for d in self.graph.required.names]
if self.hasFull():
if self.hasFull() and self.graph.required != self.graph.dimensions:
terms.append("...")
return "{{{}}}".format(', '.join(terms))

Expand Down Expand Up @@ -370,6 +384,31 @@ def subset(self, graph: DimensionGraph) -> DataCoordinate:
"""
raise NotImplementedError()

@abstractmethod
def union(self, other: DataCoordinate) -> DataCoordinate:
    """Combine two data IDs, yielding a new one that identifies all
    dimensions that either of them identify.

    Parameters
    ----------
    other : `DataCoordinate`
        Data ID to combine with ``self``.

    Returns
    -------
    unioned : `DataCoordinate`
        A `DataCoordinate` instance that satisfies
        ``unioned.graph == self.graph.union(other.graph)``.  Will preserve
        ``hasFull`` and ``hasRecords`` whenever possible.

    Notes
    -----
    No checking for consistency is performed on values for keys that
    ``self`` and ``other`` have in common, and which value is included in
    the returned data ID is not specified.
    """
    raise NotImplementedError()

@abstractmethod
def expanded(self, records: NameLookupMapping[DimensionElement, Optional[DimensionRecord]]
) -> DataCoordinate:
Expand Down Expand Up @@ -691,7 +730,7 @@ def __getitem__(self, key: DataIdKey) -> DataIdValue:
except IndexError:
# Caller asked for an implied dimension, but this object only has
# values for the required ones.
raise KeyError(key)
raise KeyError(key) from None

def subset(self, graph: DimensionGraph) -> DataCoordinate:
# Docstring inherited from DataCoordinate.
Expand All @@ -705,6 +744,36 @@ def subset(self, graph: DimensionGraph) -> DataCoordinate:
else:
return _BasicTupleDataCoordinate(graph, tuple(self[k] for k in graph.required.names))

def union(self, other: DataCoordinate) -> DataCoordinate:
    # Docstring inherited from DataCoordinate.
    graph = self.graph.union(other.graph)
    # See if one or both input data IDs is already what we want to return;
    # if so, return the most complete one we have.
    if other.graph == graph:
        if self.graph == graph:
            # Input data IDs have the same graph (which is also the result
            # graph), but may not have the same content.
            # other might have records; self does not, so try other first.
            # If it at least has full values, it's no worse than self.
            if other.hasFull():
                return other
            else:
                return self
        elif other.hasFull():
            return other
        # There's some chance that neither self nor other has full values,
        # but that together they provide enough for the union.  Let the
        # general case below handle that.
    elif self.graph == graph:
        # No chance at returning records.  If self has full values, it's
        # the best we can do.
        if self.hasFull():
            return self
    # General case with actual merging of dictionaries.  Note that values
    # from ``other`` win for any keys the two data IDs share.
    values = self.full.byName() if self.hasFull() else self.byName()
    values.update(other.full.byName() if other.hasFull() else other.byName())
    return DataCoordinate.standardize(values, graph=graph)

def expanded(self, records: NameLookupMapping[DimensionElement, Optional[DimensionRecord]]
) -> DataCoordinate:
# Docstring inherited from DataCoordinate
Expand Down Expand Up @@ -775,6 +844,43 @@ def expanded(self, records: NameLookupMapping[DimensionElement, Optional[Dimensi
# Docstring inherited from DataCoordinate.
return self

def union(self, other: DataCoordinate) -> DataCoordinate:
    # Docstring inherited from DataCoordinate.
    graph = self.graph.union(other.graph)
    # See if one or both input data IDs is already what we want to return;
    # if so, return the most complete one we have.
    if self.graph == graph:
        # self has records, so even if other is also a valid result, it's
        # no better.
        return self
    if other.graph == graph:
        # If other has full values, and self does not identify some of
        # those, it's the best we can do.  It may have records, too.
        if other.hasFull():
            return other
        # If other does not have full values, there's a chance self may
        # provide the values needed to complete it.  For example, self
        # could be {band} while other could be
        # {instrument, physical_filter, band}, with band unknown.
    # General case with actual merging of dictionaries.  Values from
    # ``other`` win for any keys the two data IDs share.
    values = self.full.byName()
    values.update(other.full.byName() if other.hasFull() else other.byName())
    basic = DataCoordinate.standardize(values, graph=graph)
    # See if we can add records.
    if self.hasRecords() and other.hasRecords():
        # Sometimes the elements of a union of graphs can contain elements
        # that weren't in either input graph (because graph unions are only
        # on dimensions).  e.g. {visit} | {detector} brings along
        # visit_detector_region.  In that case we have no records for the
        # new element, so we cannot return an expanded data ID.
        elements = set(graph.elements.names)
        elements -= self.graph.elements.names
        elements -= other.graph.elements.names
        if not elements:
            records = NamedKeyDict[DimensionElement, Optional[DimensionRecord]](self.records)
            records.update(other.records)
            return basic.expanded(records.freeze())
    return basic

def hasFull(self) -> bool:
    # Docstring inherited from DataCoordinate.
    # This class is only constructed with values for all dimensions
    # (required and implied), so this is unconditionally true.
    return True
Expand Down
6 changes: 4 additions & 2 deletions python/lsst/daf/butler/registry/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,18 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from ._config import *
from ._defaults import *
from ._exceptions import *
from ._registry import *
from ._dbAuth import *
from ._collectionType import *

from . import wildcards
from .wildcards import CollectionSearch, DatasetTypeRestriction
from .wildcards import CollectionSearch
from . import interfaces
from .interfaces import MissingCollectionError
from . import managers
from . import queries
from . import summaries

# Some modules intentionally not imported, either because they are purely
# internal (e.g. nameShrinker.py) or they contain implementations that are
Expand Down
145 changes: 145 additions & 0 deletions python/lsst/daf/butler/registry/_defaults.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("RegistryDefaults",)

from typing import Any, Optional, TYPE_CHECKING

from ..core import DataCoordinate
from ..core.utils import immutable
from ._exceptions import MissingCollectionError
from .summaries import CollectionSummary
from .wildcards import CollectionSearch

if TYPE_CHECKING:
from ._registry import Registry


@immutable
class RegistryDefaults:
    """A struct used to provide the default collections searched or written to
    by a `Registry` or `Butler` instance.

    Parameters
    ----------
    collections : `str` or `Iterable` [ `str` ], optional
        An expression specifying the collections to be searched (in order)
        when reading datasets.  If a default value for a governor dimension
        is not given via ``**kwargs``, and exactly one value for that
        dimension appears in the datasets in ``collections``, that value is
        also used as the default for that dimension.

        This may be a `str` collection name or an iterable thereof.
        See :ref:`daf_butler_collection_expressions` for more information.

        These collections are not registered automatically and must be
        manually registered before they are used by any `Registry` or
        `Butler` method, but they may be manually registered after a
        `Registry` or `Butler` is initialized with this struct.
    run : `str`, optional
        Name of the `~CollectionType.RUN` collection new datasets should be
        inserted into.  If ``collections`` is `None` and ``run`` is not
        `None`, ``collections`` will be set to ``[run]``.  If not `None`,
        this collection will automatically be registered when the default
        struct is attached to a `Registry` instance.
    infer : `bool`, optional
        If `True` (default) infer default data ID values from the values
        present in the datasets in ``collections``: if all collections have
        the same value (or no value) for a governor dimension, that value
        will be the default for that dimension.  Nonexistent collections are
        ignored.  If a default value is provided explicitly for a governor
        dimension via ``**kwargs``, no default will be inferred for that
        dimension.
    **kwargs : `str`
        Default data ID key-value pairs.  These may only identify "governor"
        dimensions like ``instrument`` and ``skymap``, though this is only
        checked when the defaults struct is actually attached to a
        `Registry`.
    """
    def __init__(self, collections: Any = None, run: Optional[str] = None, infer: bool = True, **kwargs: str):
        if collections is None:
            # With no explicit search collections, fall back to searching
            # the output run (if any); otherwise search nothing.
            if run is not None:
                collections = (run,)
            else:
                collections = ()
        self.collections = CollectionSearch.fromExpression(collections)
        self.run = run
        self._infer = infer
        # Explicit default data ID values; validated and possibly augmented
        # with inferred values in `finish`.
        self._kwargs = kwargs

    def finish(self, registry: Registry) -> None:
        """Validate the defaults struct and standardize its data ID.

        This should be called only by a `Registry` instance when the
        defaults struct is first associated with it.

        Parameters
        ----------
        registry : `Registry`
            Registry instance these defaults are being attached to.

        Raises
        ------
        TypeError
            Raised if a non-governor dimension was included in ``**kwargs``
            at construction.
        """
        allGovernorDimensions = registry.dimensions.getGovernorDimensions()
        if not self._kwargs.keys() <= allGovernorDimensions.names:
            raise TypeError(
                "Only governor dimensions may be identified by a default data "
                f"ID, not {self._kwargs.keys() - allGovernorDimensions.names}. "
                "(These may just be unrecognized keyword arguments passed at "
                "Butler construction.)"
            )
        # Only bother inferring defaults for governor dimensions that were
        # not given explicitly.
        if self._infer and not self._kwargs.keys() == allGovernorDimensions.names:
            summaries = []
            for collection in self.collections:
                try:
                    summaries.append(registry.getCollectionSummary(collection))
                except MissingCollectionError:
                    # Nonexistent default collections are ignored here; they
                    # may be registered later.
                    pass
            if summaries:
                summary = CollectionSummary.union(*summaries)
                for dimensionName in (allGovernorDimensions.names - self._kwargs.keys()):
                    values = summary.dimensions[dimensionName]
                    # Infer a default only when the datasets in the default
                    # collections agree on exactly one value.
                    if len(values) == 1:
                        (value,) = values
                        self._kwargs[dimensionName] = value
        self.dataId = registry.expandDataId(self._kwargs, withDefaults=False)

    collections: CollectionSearch
    """The collections to search by default, in order (`CollectionSearch`).
    """

    run: Optional[str]
    """Name of the run this butler writes outputs to by default (`str` or
    `None`).
    """

    dataId: DataCoordinate
    """The default data ID (`DataCoordinate`).

    Dimensions without defaults are simply not included.  Only governor
    dimensions are ever included in defaults.

    This attribute may not be accessed before the defaults struct is
    attached to a `Registry` instance.  It always satisfies both ``hasFull``
    and ``hasRecords``.
    """
13 changes: 12 additions & 1 deletion python/lsst/daf/butler/registry/_exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,12 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.


__all__ = ("InconsistentDataIdError", "ConflictingDefinitionError", "OrphanedRecordError")
# Public exception API of this module, kept alphabetical.
__all__ = (
    "ConflictingDefinitionError",
    "InconsistentDataIdError",
    "MissingCollectionError",
    "OrphanedRecordError",
)


class InconsistentDataIdError(ValueError):
Expand All @@ -39,3 +44,9 @@ class OrphanedRecordError(Exception):
"""Exception raised when trying to remove or modify a database record
that is still being used in some other table.
"""


class MissingCollectionError(Exception):
    """Error signaled when an operation refers to a collection that is not
    present in the registry.
    """

0 comments on commit 22f64db

Please sign in to comment.