Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DM-33049: Allow duplicate values in dataId #622

Merged
merged 6 commits into from
Jan 4, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
61 changes: 48 additions & 13 deletions python/lsst/daf/butler/_butler.py
Original file line number Diff line number Diff line change
Expand Up @@ -670,7 +670,7 @@ def _rewrite_data_id(
no keyword arguments, the original dataId will be returned
unchanged.
**kwargs : `dict`
Any unused keyword arguments.
Any unused keyword arguments (would normally be empty dict).
"""
# Do nothing if we have a standalone DataCoordinate.
if isinstance(dataId, DataCoordinate) and not kwargs:
Expand Down Expand Up @@ -744,6 +744,18 @@ def _rewrite_data_id(
dimension.primaryKey.getPythonType(),
)

# By this point kwargs and newDataId should only include valid
# dimensions. Merge kwargs into the new dataId and log if there
# are dimensions in both (rather than calling update).
for k, v in kwargs.items():
if k in newDataId and newDataId[k] != v:
log.debug(
"Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v
)
newDataId[k] = v
# No need to retain any values in kwargs now.
kwargs = {}

# If we have some unrecognized dimensions we have to try to connect
# them to records in other dimensions. This is made more complicated
# by some dimensions having records with clashing names. A mitigation
Expand All @@ -752,9 +764,14 @@ def _rewrite_data_id(
# where additional dimensions can be used to constrain the temporal
# axis.
if not_dimensions:
# Calculate missing dimensions
provided = set(newDataId) | set(kwargs) | set(byRecord)
missingDimensions = datasetType.dimensions.names - provided
# Search for all dimensions even if we have been given a value
# explicitly. In some cases records are given as well as the
# actual dimension and this should not be an error if they
# match.
mandatoryDimensions = datasetType.dimensions.names # - provided

candidateDimensions: Set[str] = set()
candidateDimensions.update(mandatoryDimensions)

# For calibrations we may well be needing temporal dimensions
# so rather than always including all dimensions in the scan
Expand All @@ -763,8 +780,6 @@ def _rewrite_data_id(
# If we are not searching calibration collections things may
# fail but they are going to fail anyway because of the
# ambiguousness of the dataId...
candidateDimensions: Set[str] = set()
candidateDimensions.update(missingDimensions)
if datasetType.isCalibration():
for dim in self.registry.dimensions.getStaticDimensions():
if dim.temporal:
Expand Down Expand Up @@ -807,7 +822,7 @@ def _rewrite_data_id(
for fieldName, assignedDimensions in assigned.items():
if len(assignedDimensions) > 1:
# Pick the most popular (preferring mandatory dimensions)
requiredButMissing = assignedDimensions.intersection(missingDimensions)
requiredButMissing = assignedDimensions.intersection(mandatoryDimensions)
if requiredButMissing:
candidateDimensions = requiredButMissing
else:
Expand Down Expand Up @@ -854,6 +869,29 @@ def _rewrite_data_id(
newDataId[dimensionName],
str(values),
)
# Get the actual record and compare with these values.
try:
recs = list(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId))
except LookupError:
raise ValueError(
f"Could not find dimension '{dimensionName}'"
f" with dataId {newDataId} as part of comparing with"
f" record values {byRecord[dimensionName]}"
) from None
if len(recs) == 1:
errmsg: List[str] = []
for k, v in values.items():
if (recval := getattr(recs[0], k)) != v:
errmsg.append(f"{k}({recval} != {v})")
if errmsg:
raise ValueError(
f"Dimension {dimensionName} in dataId has explicit value"
" inconsistent with records: " + ", ".join(errmsg)
)
else:
# Multiple matches for an explicit dimension
# should never happen but let downstream complain.
pass
continue

# Build up a WHERE expression
Expand All @@ -872,12 +910,12 @@ def _rewrite_data_id(
log.debug("Received %d records from constraints of %s", len(records), str(values))
for r in records:
log.debug("- %s", str(r))
raise RuntimeError(
raise ValueError(
f"DataId specification for dimension {dimensionName} is not"
f" uniquely constrained to a single dataset by {values}."
f" Got {len(records)} results."
)
raise RuntimeError(
raise ValueError(
f"DataId specification for dimension {dimensionName} matched no"
f" records when constrained by {values}"
)
Expand All @@ -890,10 +928,7 @@ def _rewrite_data_id(
)
newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)

# We have modified the dataId so need to switch to it
dataId = newDataId

return dataId, kwargs
return newDataId, kwargs

def _findDatasetRef(
self,
Expand Down
50 changes: 50 additions & 0 deletions tests/test_simpleButler.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,11 @@ def testButlerGet(self):
},
{},
),
# Duplicate (but valid) information.
(None, {"instrument": "Cam1", "detector": 2, "raft": "A", "physical_filter": "Cam1-G"}),
({"detector": 2}, {"instrument": "Cam1", "raft": "A", "physical_filter": "Cam1-G"}),
({"raft": "A"}, {"instrument": "Cam1", "detector": 2, "physical_filter": "Cam1-G"}),
({"raft": "A"}, {"instrument": "Cam1", "detector": "Ab", "physical_filter": "Cam1-G"}),
)

for dataId, kwds in variants:
Expand All @@ -377,6 +382,24 @@ def testButlerGet(self):
raise type(e)(f"{str(e)}: dataId={dataId}, kwds={kwds}") from e
self.assertEqual(flat_id, flat2g.id, msg=f"DataId: {dataId}, kwds: {kwds}")

# Check that bad combinations raise.
variants = (
# Inconsistent detector information.
(None, {"instrument": "Cam1", "detector": 2, "raft": "B", "physical_filter": "Cam1-G"}),
({"detector": 2}, {"instrument": "Cam1", "raft": "B", "physical_filter": "Cam1-G"}),
({"detector": 12}, {"instrument": "Cam1", "raft": "B", "physical_filter": "Cam1-G"}),
({"raft": "B"}, {"instrument": "Cam1", "detector": 2, "physical_filter": "Cam1-G"}),
({"raft": "B"}, {"instrument": "Cam1", "detector": "Ab", "physical_filter": "Cam1-G"}),
# Under-specified.
({"raft": "B"}, {"instrument": "Cam1", "physical_filter": "Cam1-G"}),
# Spurious kwargs.
(None, {"instrument": "Cam1", "detector": 2, "physical_filter": "Cam1-G", "x": "y"}),
({"x": "y"}, {"instrument": "Cam1", "detector": 2, "physical_filter": "Cam1-G"}),
)
for dataId, kwds in variants:
with self.assertRaises(ValueError):
butler.get("flat", dataId=dataId, collections=coll, **kwds)

def testGetCalibration(self):
"""Test that `Butler.get` can be used to fetch from
`~CollectionType.CALIBRATION` collections if the data ID includes
Expand Down Expand Up @@ -476,6 +499,33 @@ def testGetCalibration(self):
)
self.assertEqual(bias3b_id, bias3b.id)

# Allow a fully-specified dataId and unnecessary extra information
# that comes from the record.
bias3b_id, _ = butler.get(
"bias",
dataId=dict(
exposure=4,
day_obs=20211114,
seq_num=42,
detector=3,
instrument="Cam1",
),
collections="calibs",
)
self.assertEqual(bias3b_id, bias3b.id)

# Extra but inconsistent record values are a problem.
with self.assertRaises(ValueError):
bias3b_id, _ = butler.get(
"bias",
exposure=3,
day_obs=20211114,
seq_num=42,
detector=3,
collections="calibs",
instrument="Cam1",
)

# Ensure that spurious kwargs cause an exception.
with self.assertRaises(ValueError):
butler.get(
Expand Down