Skip to content

Commit

Permalink
Merge pull request #454 from lsst/tickets/DM-27153
Browse files Browse the repository at this point in the history
DM-27153: default instrument (and skymap) in butler (and registry!), when possible
  • Loading branch information
TallJimbo committed Jan 12, 2021
2 parents 2fe403e + 7b67175 commit 22f64db
Show file tree
Hide file tree
Showing 27 changed files with 1,895 additions and 1,027 deletions.
2 changes: 1 addition & 1 deletion python/lsst/daf/butler/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from .core import *
# Import the registry subpackage directly for other symbols.
from .registry import Registry, RegistryConfig, CollectionType, CollectionSearch, DatasetTypeRestriction
from .registry import Registry, RegistryConfig, CollectionType, CollectionSearch
from ._butlerConfig import *
from ._deferredDatasetHandle import *
from ._butler import *
Expand Down
268 changes: 85 additions & 183 deletions python/lsst/daf/butler/_butler.py

Large diffs are not rendered by default.

120 changes: 113 additions & 7 deletions python/lsst/daf/butler/core/dimensions/_coordinate.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
)

from lsst.sphgeom import Region
from ..named import NamedKeyMapping, NameLookupMapping, NamedValueAbstractSet
from ..named import NamedKeyDict, NamedKeyMapping, NameLookupMapping, NamedValueAbstractSet
from ..timespan import Timespan
from ._elements import Dimension, DimensionElement
from ._graph import DimensionGraph
Expand Down Expand Up @@ -133,6 +133,7 @@ def standardize(
*,
graph: Optional[DimensionGraph] = None,
universe: Optional[DimensionUniverse] = None,
defaults: Optional[DataCoordinate] = None,
**kwargs: Any
) -> DataCoordinate:
"""Adapt an arbitrary mapping and/or additional arguments into a true
Expand All @@ -145,12 +146,16 @@ def standardize(
their primary key values (may also be a true `DataCoordinate`).
graph : `DimensionGraph`
The dimensions to be identified by the new `DataCoordinate`.
If not provided, will be inferred from the keys of ``mapping``,
and ``universe`` must be provided unless ``mapping`` is already a
`DataCoordinate`.
If not provided, will be inferred from the keys of ``mapping`` and
``**kwargs``, and ``universe`` must be provided unless ``mapping``
is already a `DataCoordinate`.
universe : `DimensionUniverse`
All known dimensions and their relationships; used to expand
and validate dependencies when ``graph`` is not provided.
defaults : `DataCoordinate`, optional
Default dimension key-value pairs to use when needed. These are
never used to infer ``graph``, and are ignored if a different value
is provided for the same key in ``mapping`` or `**kwargs``.
**kwargs
Additional keyword arguments are treated like additional key-value
pairs in ``mapping``.
Expand Down Expand Up @@ -191,11 +196,20 @@ def standardize(
d.update(mapping)
d.update(kwargs)
if graph is None:
if universe is None:
if defaults is not None:
universe = defaults.universe
elif universe is None:
raise TypeError("universe must be provided if graph is not.")
graph = DimensionGraph(universe, names=d.keys())
if not graph.dimensions:
return DataCoordinate.makeEmpty(graph.universe)
if defaults is not None:
if defaults.hasFull():
for k, v in defaults.full.items():
d.setdefault(k.name, v)
else:
for k, v in defaults.items():
d.setdefault(k.name, v)
if d.keys() >= graph.dimensions.names:
values = tuple(d[name] for name in graph._dataCoordinateIndices.keys())
else:
Expand Down Expand Up @@ -301,7 +315,7 @@ def __repr__(self) -> str:
# quote its keys: that's both more compact and something that can't
# be mistaken for an actual dict or something that could be exec'd.
terms = [f"{d}: {self[d]!r}" for d in self.graph.required.names]
if self.hasFull():
if self.hasFull() and self.graph.required != self.graph.dimensions:
terms.append("...")
return "{{{}}}".format(', '.join(terms))

Expand Down Expand Up @@ -370,6 +384,31 @@ def subset(self, graph: DimensionGraph) -> DataCoordinate:
"""
raise NotImplementedError()

@abstractmethod
def union(self, other: DataCoordinate) -> DataCoordinate:
    """Combine two data IDs, yielding a new one that identifies all
    dimensions that either of them identify.

    Parameters
    ----------
    other : `DataCoordinate`
        Data ID to combine with ``self``.

    Returns
    -------
    unioned : `DataCoordinate`
        A `DataCoordinate` instance that satisfies
        ``unioned.graph == self.graph.union(other.graph)``.  Will preserve
        ``hasFull`` and ``hasRecords`` whenever possible.

    Notes
    -----
    No checking for consistency is performed on values for keys that
    ``self`` and ``other`` have in common, and which value is included in
    the returned data ID is not specified.
    """
    raise NotImplementedError()

@abstractmethod
def expanded(self, records: NameLookupMapping[DimensionElement, Optional[DimensionRecord]]
) -> DataCoordinate:
Expand Down Expand Up @@ -691,7 +730,7 @@ def __getitem__(self, key: DataIdKey) -> DataIdValue:
except IndexError:
# Caller asked for an implied dimension, but this object only has
# values for the required ones.
raise KeyError(key)
raise KeyError(key) from None

def subset(self, graph: DimensionGraph) -> DataCoordinate:
# Docstring inherited from DataCoordinate.
Expand All @@ -705,6 +744,36 @@ def subset(self, graph: DimensionGraph) -> DataCoordinate:
else:
return _BasicTupleDataCoordinate(graph, tuple(self[k] for k in graph.required.names))

def union(self, other: DataCoordinate) -> DataCoordinate:
    # Docstring inherited from DataCoordinate.
    graph = self.graph.union(other.graph)
    # See if one or both input data IDs is already what we want to return;
    # if so, return the most complete one we have.
    if other.graph == graph:
        if self.graph == graph:
            # Input data IDs have the same graph (which is also the result
            # graph), but may not have the same content.
            # other might have records; self does not, so try other first.
            # If it at least has full values, it's no worse than self.
            if other.hasFull():
                return other
            else:
                return self
        elif other.hasFull():
            return other
        # There's some chance that neither self nor other has full values,
        # but that together they provide enough for the union.  Let the
        # general case below handle that.
    elif self.graph == graph:
        # No chance at returning records.  If self has full values, it's
        # the best we can do.
        if self.hasFull():
            return self
    # General case with actual merging of dictionaries.  Note that values
    # from ``other`` win for any keys the two data IDs share.
    values = self.full.byName() if self.hasFull() else self.byName()
    values.update(other.full.byName() if other.hasFull() else other.byName())
    return DataCoordinate.standardize(values, graph=graph)

def expanded(self, records: NameLookupMapping[DimensionElement, Optional[DimensionRecord]]
) -> DataCoordinate:
# Docstring inherited from DataCoordinate
Expand Down Expand Up @@ -775,6 +844,43 @@ def expanded(self, records: NameLookupMapping[DimensionElement, Optional[Dimensi
# Docstring inherited from DataCoordinate.
return self

def union(self, other: DataCoordinate) -> DataCoordinate:
    # Docstring inherited from DataCoordinate.
    graph = self.graph.union(other.graph)
    # See if one or both input data IDs is already what we want to return;
    # if so, return the most complete one we have.
    if self.graph == graph:
        # self has records, so even if other is also a valid result, it's
        # no better.
        return self
    if other.graph == graph:
        # If other has full values, and self does not identify some of
        # those, it's the best we can do.  It may have records, too.
        if other.hasFull():
            return other
        # If other does not have full values, there's a chance self may
        # provide the values needed to complete it.  For example, self
        # could be {band} while other could be
        # {instrument, physical_filter, band}, with band unknown.
    # General case with actual merging of dictionaries.  Values from
    # ``other`` win for any keys the two data IDs share.
    values = self.full.byName()
    values.update(other.full.byName() if other.hasFull() else other.byName())
    basic = DataCoordinate.standardize(values, graph=graph)
    # See if we can add records.
    if self.hasRecords() and other.hasRecords():
        # Sometimes the elements of a union of graphs can contain elements
        # that weren't in either input graph (because graph unions are only
        # on dimensions).  e.g. {visit} | {detector} brings along
        # visit_detector_region.  In that case we have no records for the
        # new element, so we cannot return an expanded data ID.
        elements = set(graph.elements.names)
        elements -= self.graph.elements.names
        elements -= other.graph.elements.names
        if not elements:
            records = NamedKeyDict[DimensionElement, Optional[DimensionRecord]](self.records)
            records.update(other.records)
            return basic.expanded(records.freeze())
    return basic

def hasFull(self) -> bool:
    # Docstring inherited from DataCoordinate.
    # This class is only constructed with values for all dimensions
    # (required and implied), so this is unconditionally true.
    return True
Expand Down
6 changes: 4 additions & 2 deletions python/lsst/daf/butler/registry/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,18 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from ._config import *
from ._defaults import *
from ._exceptions import *
from ._registry import *
from ._dbAuth import *
from ._collectionType import *

from . import wildcards
from .wildcards import CollectionSearch, DatasetTypeRestriction
from .wildcards import CollectionSearch
from . import interfaces
from .interfaces import MissingCollectionError
from . import managers
from . import queries
from . import summaries

# Some modules intentionally not imported, either because they are purely
# internal (e.g. nameShrinker.py) or they contain implementations that are
Expand Down
145 changes: 145 additions & 0 deletions python/lsst/daf/butler/registry/_defaults.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("RegistryDefaults",)

from typing import Any, Optional, TYPE_CHECKING

from ..core import DataCoordinate
from ..core.utils import immutable
from ._exceptions import MissingCollectionError
from .summaries import CollectionSummary
from .wildcards import CollectionSearch

if TYPE_CHECKING:
from ._registry import Registry


@immutable
class RegistryDefaults:
    """A struct used to provide the default collections searched or written to
    by a `Registry` or `Butler` instance.

    Parameters
    ----------
    collections : `str` or `Iterable` [ `str` ], optional
        An expression specifying the collections to be searched (in order)
        when reading datasets.  If a default value for a governor dimension
        is not given via ``**kwargs``, and exactly one value for that
        dimension appears in the datasets in ``collections``, that value is
        also used as the default for that dimension.

        This may be a `str` collection name or an iterable thereof.
        See :ref:`daf_butler_collection_expressions` for more information.

        These collections are not registered automatically and must be
        manually registered before they are used by any `Registry` or
        `Butler` method, but they may be manually registered after a
        `Registry` or `Butler` is initialized with this struct.
    run : `str`, optional
        Name of the `~CollectionType.RUN` collection new datasets should be
        inserted into.  If ``collections`` is `None` and ``run`` is not
        `None`, ``collections`` will be set to ``[run]``.  If not `None`,
        this collection will automatically be registered when the default
        struct is attached to a `Registry` instance.
    infer : `bool`, optional
        If `True` (default) infer default data ID values from the values
        present in the datasets in ``collections``: if all collections have
        the same value (or no value) for a governor dimension, that value
        will be the default for that dimension.  Nonexistent collections are
        ignored.  If a default value is provided explicitly for a governor
        dimension via ``**kwargs``, no default will be inferred for that
        dimension.
    **kwargs : `str`
        Default data ID key-value pairs.  These may only identify "governor"
        dimensions like ``instrument`` and ``skymap``, though this is only
        checked when the defaults struct is actually attached to a
        `Registry`.
    """
    def __init__(self, collections: Any = None, run: Optional[str] = None, infer: bool = True, **kwargs: str):
        if collections is None:
            # With no explicit search collections, fall back to searching
            # the output run (if any); otherwise search nothing.
            if run is not None:
                collections = (run,)
            else:
                collections = ()
        self.collections = CollectionSearch.fromExpression(collections)
        self.run = run
        self._infer = infer
        # Explicit default data ID values; validated and possibly augmented
        # with inferred values in `finish`.
        self._kwargs = kwargs

    def finish(self, registry: Registry) -> None:
        """Validate the defaults struct and standardize its data ID.

        This should be called only by a `Registry` instance when the
        defaults struct is first associated with it.

        Parameters
        ----------
        registry : `Registry`
            Registry instance these defaults are being attached to.

        Raises
        ------
        TypeError
            Raised if a non-governor dimension was included in ``**kwargs``
            at construction.
        """
        allGovernorDimensions = registry.dimensions.getGovernorDimensions()
        if not self._kwargs.keys() <= allGovernorDimensions.names:
            raise TypeError(
                "Only governor dimensions may be identified by a default data "
                f"ID, not {self._kwargs.keys() - allGovernorDimensions.names}. "
                "(These may just be unrecognized keyword arguments passed at "
                "Butler construction.)"
            )
        # Only bother inferring defaults for governor dimensions that were
        # not given explicitly.
        if self._infer and not self._kwargs.keys() == allGovernorDimensions.names:
            summaries = []
            for collection in self.collections:
                try:
                    summaries.append(registry.getCollectionSummary(collection))
                except MissingCollectionError:
                    # Nonexistent default collections are ignored here; they
                    # may be registered later.
                    pass
            if summaries:
                summary = CollectionSummary.union(*summaries)
                for dimensionName in (allGovernorDimensions.names - self._kwargs.keys()):
                    values = summary.dimensions[dimensionName]
                    # Infer a default only when the datasets in the default
                    # collections agree on exactly one value.
                    if len(values) == 1:
                        (value,) = values
                        self._kwargs[dimensionName] = value
        self.dataId = registry.expandDataId(self._kwargs, withDefaults=False)

    collections: CollectionSearch
    """The collections to search by default, in order (`CollectionSearch`).
    """

    run: Optional[str]
    """Name of the run this butler writes outputs to by default (`str` or
    `None`).
    """

    dataId: DataCoordinate
    """The default data ID (`DataCoordinate`).

    Dimensions without defaults are simply not included.  Only governor
    dimensions are ever included in defaults.

    This attribute may not be accessed before the defaults struct is
    attached to a `Registry` instance.  It always satisfies both ``hasFull``
    and ``hasRecords``.
    """
13 changes: 12 additions & 1 deletion python/lsst/daf/butler/registry/_exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,12 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.


__all__ = ("InconsistentDataIdError", "ConflictingDefinitionError", "OrphanedRecordError")
# Public exception API of this module, kept alphabetical.
__all__ = (
    "ConflictingDefinitionError",
    "InconsistentDataIdError",
    "MissingCollectionError",
    "OrphanedRecordError",
)


class InconsistentDataIdError(ValueError):
Expand All @@ -39,3 +44,9 @@ class OrphanedRecordError(Exception):
"""Exception raised when trying to remove or modify a database record
that is still being used in some other table.
"""


class MissingCollectionError(Exception):
    """Error signaled when an operation refers to a collection that is not
    present in the registry.
    """

0 comments on commit 22f64db

Please sign in to comment.