Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DM-33049: Allow duplicate values in dataId #622

Merged
merged 6 commits into from
Jan 4, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
61 changes: 48 additions & 13 deletions python/lsst/daf/butler/_butler.py
Original file line number Diff line number Diff line change
Expand Up @@ -670,7 +670,7 @@ def _rewrite_data_id(
no keyword arguments, the original dataId will be returned
unchanged.
**kwargs : `dict`
Any unused keyword arguments.
Any unused keyword arguments (would normally be empty dict).
"""
# Do nothing if we have a standalone DataCoordinate.
if isinstance(dataId, DataCoordinate) and not kwargs:
Expand Down Expand Up @@ -744,6 +744,18 @@ def _rewrite_data_id(
dimension.primaryKey.getPythonType(),
)

# By this point kwargs and newDataId should only include valid
# dimensions. Merge kwargs into the new dataId and log if there
# are dimensions in both (rather than calling update).
for k, v in kwargs.items():
if k in newDataId and newDataId[k] != v:
log.debug(
"Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v
)
newDataId[k] = v
# No need to retain any values in kwargs now.
kwargs = {}

# If we have some unrecognized dimensions we have to try to connect
# them to records in other dimensions. This is made more complicated
# by some dimensions having records with clashing names. A mitigation
Expand All @@ -752,9 +764,14 @@ def _rewrite_data_id(
# where additional dimensions can be used to constrain the temporal
# axis.
if not_dimensions:
# Calculate missing dimensions
provided = set(newDataId) | set(kwargs) | set(byRecord)
missingDimensions = datasetType.dimensions.names - provided
# Search for all dimensions even if we have been given a value
# explicitly. In some cases records are given as well as the
# actual dimension and this should not be an error if they
# match.
mandatoryDimensions = datasetType.dimensions.names # - provided

candidateDimensions: Set[str] = set()
candidateDimensions.update(mandatoryDimensions)

# For calibrations we may well be needing temporal dimensions
# so rather than always including all dimensions in the scan
Expand All @@ -763,8 +780,6 @@ def _rewrite_data_id(
# If we are not searching calibration collections things may
# fail but they are going to fail anyway because of the
# ambiguousness of the dataId...
candidateDimensions: Set[str] = set()
candidateDimensions.update(missingDimensions)
if datasetType.isCalibration():
for dim in self.registry.dimensions.getStaticDimensions():
if dim.temporal:
Expand Down Expand Up @@ -807,7 +822,7 @@ def _rewrite_data_id(
for fieldName, assignedDimensions in assigned.items():
if len(assignedDimensions) > 1:
# Pick the most popular (preferring mandatory dimensions)
requiredButMissing = assignedDimensions.intersection(missingDimensions)
requiredButMissing = assignedDimensions.intersection(mandatoryDimensions)
if requiredButMissing:
candidateDimensions = requiredButMissing
else:
Expand Down Expand Up @@ -854,6 +869,29 @@ def _rewrite_data_id(
newDataId[dimensionName],
str(values),
)
# Get the actual record and compare with these values.
try:
recs = list(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId))
except LookupError:
raise ValueError(
f"Could not find dimension '{dimensionName}'"
f" with dataId {newDataId} as part of comparing with"
f" record values {byRecord[dimensionName]}"
) from None
if len(recs) == 1:
errmsg: List[str] = []
for k, v in values.items():
if (recval := getattr(recs[0], k)) != v:
errmsg.append(f"{k}({recval} != {v})")
if errmsg:
raise ValueError(
f"Dimension {dimensionName} in dataId has explicit value"
" inconsistent with records: " + ", ".join(errmsg)
)
else:
# Multiple matches for an explicit dimension
# should never happen but let downstream complain.
pass
continue

# Build up a WHERE expression
Expand All @@ -872,12 +910,12 @@ def _rewrite_data_id(
log.debug("Received %d records from constraints of %s", len(records), str(values))
for r in records:
log.debug("- %s", str(r))
raise RuntimeError(
raise ValueError(
f"DataId specification for dimension {dimensionName} is not"
f" uniquely constrained to a single dataset by {values}."
f" Got {len(records)} results."
)
raise RuntimeError(
raise ValueError(
f"DataId specification for dimension {dimensionName} matched no"
f" records when constrained by {values}"
)
Expand All @@ -890,10 +928,7 @@ def _rewrite_data_id(
)
newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)

# We have modified the dataId so need to switch to it
dataId = newDataId

return dataId, kwargs
return newDataId, kwargs

def _findDatasetRef(
self,
Expand Down
50 changes: 50 additions & 0 deletions tests/test_simpleButler.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,11 @@ def testButlerGet(self):
},
{},
),
# Duplicate (but valid) information.
(None, {"instrument": "Cam1", "detector": 2, "raft": "A", "physical_filter": "Cam1-G"}),
({"detector": 2}, {"instrument": "Cam1", "raft": "A", "physical_filter": "Cam1-G"}),
({"raft": "A"}, {"instrument": "Cam1", "detector": 2, "physical_filter": "Cam1-G"}),
({"raft": "A"}, {"instrument": "Cam1", "detector": "Ab", "physical_filter": "Cam1-G"}),
)

for dataId, kwds in variants:
Expand All @@ -377,6 +382,24 @@ def testButlerGet(self):
raise type(e)(f"{str(e)}: dataId={dataId}, kwds={kwds}") from e
self.assertEqual(flat_id, flat2g.id, msg=f"DataId: {dataId}, kwds: {kwds}")

# Check that bad combinations raise.
variants = (
# Inconsistent detector information.
(None, {"instrument": "Cam1", "detector": 2, "raft": "B", "physical_filter": "Cam1-G"}),
({"detector": 2}, {"instrument": "Cam1", "raft": "B", "physical_filter": "Cam1-G"}),
({"detector": 12}, {"instrument": "Cam1", "raft": "B", "physical_filter": "Cam1-G"}),
({"raft": "B"}, {"instrument": "Cam1", "detector": 2, "physical_filter": "Cam1-G"}),
({"raft": "B"}, {"instrument": "Cam1", "detector": "Ab", "physical_filter": "Cam1-G"}),
# Under-specified.
({"raft": "B"}, {"instrument": "Cam1", "physical_filter": "Cam1-G"}),
# Spurious kwargs.
(None, {"instrument": "Cam1", "detector": 2, "physical_filter": "Cam1-G", "x": "y"}),
({"x": "y"}, {"instrument": "Cam1", "detector": 2, "physical_filter": "Cam1-G"}),
)
for dataId, kwds in variants:
with self.assertRaises(ValueError):
butler.get("flat", dataId=dataId, collections=coll, **kwds)

def testGetCalibration(self):
"""Test that `Butler.get` can be used to fetch from
`~CollectionType.CALIBRATION` collections if the data ID includes
Expand Down Expand Up @@ -476,6 +499,33 @@ def testGetCalibration(self):
)
self.assertEqual(bias3b_id, bias3b.id)

# Allow a fully-specified dataId and unnecessary extra information
# that comes from the record.
bias3b_id, _ = butler.get(
"bias",
dataId=dict(
exposure=4,
day_obs=20211114,
seq_num=42,
detector=3,
instrument="Cam1",
),
collections="calibs",
)
self.assertEqual(bias3b_id, bias3b.id)

# Extra but inconsistent record values are a problem.
with self.assertRaises(ValueError):
bias3b_id, _ = butler.get(
"bias",
exposure=3,
day_obs=20211114,
seq_num=42,
detector=3,
collections="calibs",
instrument="Cam1",
)

# Ensure that spurious kwargs cause an exception.
with self.assertRaises(ValueError):
butler.get(
Expand Down