DM-31725: rewrite queries subpackage, via new dependency on daf_relation #759

Merged: 25 commits, Jan 6, 2023

Commits
e4bccc4
Add column tag classes and implement daf_relation interfaces.
TallJimbo Jul 22, 2022
c6635b5
Add DimensionElementFields.columns property.
TallJimbo Aug 28, 2022
3b4b04e
Add ColumnTypeInfo.make_relation_table_spec.
TallJimbo Aug 18, 2022
6bb0266
Add QueryContext and QueryBackend objects.
TallJimbo Aug 3, 2022
3ba1ed3
Add row readers for data IDs, DatasetRefs, and dimension records.
TallJimbo Jul 23, 2022
6458a73
Add factories for common predicates.
TallJimbo Jul 23, 2022
8a6c2df
Add require_ordered kwarg to CollectionWildcard.from_expression.
TallJimbo Jan 5, 2023
a5f4836
Integrate relations into dataset subqueries and storage managers.
TallJimbo Aug 18, 2022
0af81fb
Rewrite query WHERE-clause handling via relation predicates.
TallJimbo Aug 18, 2022
797b8c4
Temporarily drop exception message tests for record order-by.
TallJimbo Aug 27, 2022
6dbb19e
Integrate relations with QueryBuilder and dimension managers.
TallJimbo Apr 25, 2022
b2f7b94
Pass view target storage instance to QueryDimensionRecordStorage.
TallJimbo Aug 25, 2022
26acfaa
Use relations to rewrite Query and QueryResults.
TallJimbo Aug 19, 2022
9f5f590
Revert "Temporarily drop exception message tests for record order-by."
TallJimbo Sep 2, 2022
b735ede
Drop DimensionRecordStorage.fetch.
TallJimbo Sep 22, 2022
f85a194
Add make_data_id_relation to QueryContext.
TallJimbo Sep 23, 2022
5c7cb3e
Use relation in certify/decertify implementations.
TallJimbo Sep 23, 2022
52bbc90
Drop DataCoordinateIterable.constrain.
TallJimbo Sep 22, 2022
5778424
Drop SimpleQuery.
TallJimbo Sep 23, 2022
81ffe43
Drop QueryColumns and DatasetQueryColumns.
TallJimbo Oct 10, 2022
c828c42
Prohibit caches of views of dimension records.
TallJimbo Dec 2, 2022
db801e8
Note that findDataset now respects a given storage class.
TallJimbo Dec 3, 2022
9f5372c
Add changelog entry.
TallJimbo Dec 6, 2022
432b91c
Add daf_relation dependency to pyproject.toml.
TallJimbo Dec 6, 2022
100a12c
Fix type annotation for digestTables.
TallJimbo Jan 5, 2023
9 changes: 9 additions & 0 deletions doc/changes/DM-31725.misc.md
@@ -0,0 +1,9 @@
Rewrite registry query system, using the new ``daf_relation`` package.

This change should be mostly invisible to users, but there are some subtle behavior changes:

- `Registry.findDatasets` now respects the given storage class when passed a full `DatasetType` instance, instead of replacing it with the storage class registered with that dataset type. This also causes storage class overrides in `PipelineTask` input connections to be respected in more contexts; in at least some cases these were previously being incorrectly ignored.
- `Registry.findDatasets` now utilizes cached summaries of which dataset types and governor dimension values are present in each collection. This should result in fewer and simpler database calls, but it does make the results vulnerable to stale caches (which, like `Registry` methods more generally, must be addressed manually via calls to `Registry.refresh`).
- The diagnostics provided by the `explain_no_results` methods on query result objects (used prominently in the reporting on empty quantum graph builds) have been significantly improved, though they now use ``daf_relation`` terminology that may be unfamiliar to users.
- `Registry` is now more consistent about raising `DataIdValueError` when given invalid governor dimension values, while not raising (but providing `explain_no_results` diagnostics) for all other invalid dimension values, as per RFC-878.
- `Registry` methods that take a `where` argument are now typed to expect a `str` that is not `None`, with the default no-op value now an empty string (previously either an empty `str` or `None` could be passed, and they meant the same thing). This should only affect downstream type checking, as the runtime code still just checks whether the argument evaluates as `False` in a boolean context.
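The last bullet can be illustrated with a minimal sketch, assuming a hypothetical helper `needs_where_clause` (the real `Registry` methods are not reproduced here): the default is now an empty string rather than `None`, and the runtime check is plain truthiness.

```python
def needs_where_clause(where: str = "") -> bool:
    # Hypothetical stand-in for the check the changelog describes: any
    # falsy value (now just the empty string) means no user expression
    # was supplied.
    return bool(where)
```

Callers that previously passed `where=None` should now pass nothing or an empty string; after this change, type checkers will flag `None`.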
4 changes: 4 additions & 0 deletions mypy.ini
@@ -58,6 +58,10 @@ ignore_errors = True
ignore_missing_imports = False
ignore_errors = True

[mypy-lsst.daf_relation.*]
ignore_missing_imports = False
ignore_errors = True

# Check all of daf.butler...

[mypy-lsst.daf.butler.*]
1 change: 1 addition & 0 deletions pyproject.toml
@@ -27,6 +27,7 @@ dependencies = [
"lsst-sphgeom",
"lsst-utils",
"lsst-resources",
"lsst-daf-relation",
"deprecated >= 1.2",
"pydantic",
]
4 changes: 3 additions & 1 deletion python/lsst/daf/butler/cli/opt/options.py
@@ -256,7 +256,9 @@ def _config_split(*args: Any) -> dict[str, str]:
verbose_option = MWOptionDecorator("-v", "--verbose", help="Increase verbosity.", is_flag=True)


where_option = MWOptionDecorator("--where", help="A string expression similar to a SQL WHERE clause.")
where_option = MWOptionDecorator(
    "--where", default="", help="A string expression similar to a SQL WHERE clause."
)


order_by_option = MWOptionDecorator(
3 changes: 2 additions & 1 deletion python/lsst/daf/butler/core/__init__.py
@@ -5,6 +5,8 @@
from . import progress # most symbols are only used by handler implementors
from . import ddl, time_utils
from ._butlerUri import *
from ._column_categorization import *
from ._column_tags import *
from ._column_type_info import *
from ._topology import *
from .composites import *
@@ -30,7 +32,6 @@
from .named import *
from .progress import Progress
from .quantum import *
from .simpleQuery import *
from .storageClass import *
from .storageClassDelegate import *
from .storedFileInfo import *
77 changes: 77 additions & 0 deletions python/lsst/daf/butler/core/_column_categorization.py
@@ -0,0 +1,77 @@
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("ColumnCategorization",)

import dataclasses
from collections import defaultdict
from collections.abc import Iterable, Iterator
from typing import Any

from ._column_tags import DatasetColumnTag, DimensionKeyColumnTag, DimensionRecordColumnTag
from .dimensions import DimensionUniverse, GovernorDimension, SkyPixDimension


@dataclasses.dataclass
class ColumnCategorization:
    dimension_keys: set[str] = dataclasses.field(default_factory=set)
    dimension_records: defaultdict[str, set[str]] = dataclasses.field(
        default_factory=lambda: defaultdict(set)
    )
    datasets: defaultdict[str, set[str]] = dataclasses.field(default_factory=lambda: defaultdict(set))

    @classmethod
    def from_iterable(cls, iterable: Iterable[Any]) -> ColumnCategorization:
        result = cls()
        for tag in iterable:
            match tag:
                case DimensionKeyColumnTag(dimension=dimension):
                    result.dimension_keys.add(dimension)
                case DimensionRecordColumnTag(element=element, column=column):
                    result.dimension_records[element].add(column)
                case DatasetColumnTag(dataset_type=dataset_type, column=column):
                    result.datasets[dataset_type].add(column)
        return result

    def filter_skypix(self, universe: DimensionUniverse) -> Iterator[SkyPixDimension]:
        return (
            dimension
            for name in self.dimension_keys
            if isinstance(dimension := universe[name], SkyPixDimension)
        )

    def filter_governors(self, universe: DimensionUniverse) -> Iterator[GovernorDimension]:
        return (
            dimension
            for name in self.dimension_keys
            if isinstance(dimension := universe[name], GovernorDimension)
        )

    def filter_timespan_dataset_types(self) -> Iterator[str]:
        return (dataset_type for dataset_type, columns in self.datasets.items() if "timespan" in columns)

    def filter_timespan_dimension_elements(self) -> Iterator[str]:
        return (element for element, columns in self.dimension_records.items() if "timespan" in columns)

    def filter_spatial_region_dimension_elements(self) -> Iterator[str]:
        return (element for element, columns in self.dimension_records.items() if "region" in columns)
205 changes: 205 additions & 0 deletions python/lsst/daf/butler/core/_column_tags.py
@@ -0,0 +1,205 @@
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = (
"DatasetColumnTag",
"DimensionKeyColumnTag",
"DimensionRecordColumnTag",
"is_timespan_column",
)

import dataclasses
from collections.abc import Iterable
from typing import TYPE_CHECKING, Any, TypeVar, final

_S = TypeVar("_S")

if TYPE_CHECKING:
from lsst.daf.relation import ColumnTag


class _BaseColumnTag:

    __slots__ = ()

    @classmethod
    def filter_from(cls: type[_S], tags: Iterable[Any]) -> set[_S]:
        return {tag for tag in tags if type(tag) is cls}


@final
@dataclasses.dataclass(frozen=True, slots=True)
class DimensionKeyColumnTag(_BaseColumnTag):
    """An identifier for `~lsst.daf.relation.Relation` columns that represent
    a dimension primary key value.
    """

    dimension: str
    """Name of the dimension (`str`)."""

    def __str__(self) -> str:
        return self.dimension

    @property
    def qualified_name(self) -> str:
        return self.dimension

    @property
    def is_key(self) -> bool:
        return True

    @classmethod
    def generate(cls, dimensions: Iterable[str]) -> list[DimensionKeyColumnTag]:
        """Return a list of column tags from an iterable of dimension
        names.

        Parameters
        ----------
        dimensions : `Iterable` [ `str` ]
            Dimension names.

        Returns
        -------
        tags : `list` [ `DimensionKeyColumnTag` ]
            List of column tags.
        """
        return [cls(d) for d in dimensions]


@final
@dataclasses.dataclass(frozen=True, slots=True)
class DimensionRecordColumnTag(_BaseColumnTag):
    """An identifier for `~lsst.daf.relation.Relation` columns that represent
    non-key columns in a dimension or dimension element record.
    """

    element: str
    """Name of the dimension element (`str`)."""

    column: str
    """Name of the column (`str`)."""

    def __str__(self) -> str:
        return f"{self.element}.{self.column}"

    @property
    def qualified_name(self) -> str:
        return f"n!{self.element}:{self.column}"

    @property
    def is_key(self) -> bool:
        return False

    @classmethod
    def generate(cls, element: str, columns: Iterable[str]) -> list[DimensionRecordColumnTag]:
        """Return a list of column tags from an iterable of column names
        for a single dimension element.

        Parameters
        ----------
        element : `str`
            Name of the dimension element.
        columns : `Iterable` [ `str` ]
            Column names.

        Returns
        -------
        tags : `list` [ `DimensionRecordColumnTag` ]
            List of column tags.
        """
        return [cls(element, column) for column in columns]


@final
@dataclasses.dataclass(frozen=True, slots=True)
class DatasetColumnTag(_BaseColumnTag):
    """An identifier for `~lsst.daf.relation.Relation` columns that represent
    columns from a dataset query or subquery.
    """

    dataset_type: str
    """Name of the dataset type (`str`)."""

    column: str
    """Name of the column (`str`).

    Allowed values are:

    - "dataset_id" (autoincrement or UUID primary key)
    - "run" (collection primary key, not collection name)
    - "ingest_date"
    - "timespan" (validity range, or NULL for non-calibration collections)
    - "rank" (collection position in ordered search)
    """

    def __str__(self) -> str:
        return f"{self.dataset_type}.{self.column}"

    @property
    def qualified_name(self) -> str:
        return f"t!{self.dataset_type}:{self.column}"

    @property
    def is_key(self) -> bool:
        return self.column == "dataset_id" or self.column == "run"

    @classmethod
    def generate(cls, dataset_type: str, columns: Iterable[str]) -> list[DatasetColumnTag]:
        """Return a list of column tags from an iterable of column names
        for a single dataset type.

        Parameters
        ----------
        dataset_type : `str`
            Name of the dataset type.
        columns : `Iterable` [ `str` ]
            Column names.

        Returns
        -------
        tags : `list` [ `DatasetColumnTag` ]
            List of column tags.
        """
        return [cls(dataset_type, column) for column in columns]


def is_timespan_column(tag: ColumnTag) -> bool:
    """Test whether a column tag is a timespan.

    Parameters
    ----------
    tag : `ColumnTag`
        Column tag to test.

    Returns
    -------
    is_timespan : `bool`
        Whether the given column is a timespan.
    """
    match tag:
        case DimensionRecordColumnTag(column="timespan"):
            return True
        case DatasetColumnTag(column="timespan"):
            return True
    return False