Merge pull request #440 from lsst/tickets/DM-27152
DM-27152: Allow alternate values in dimensions and also unadorned dimension record columns
timj committed Nov 24, 2020
2 parents 62110f6 + 5f7e702 commit 3f99bad
Showing 2 changed files with 201 additions and 1 deletion.
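
For orientation, here is a minimal sketch (not part of the diff) of what the change enables, modelled on the new tests below; it assumes a butler for a repository populated like tests/test_simpleButler.py, with instrument "Cam1" and flats in an "imported_g" collection:

    coll = "imported_g"

    # Primary key: a detector is identified by its integer id.
    flat = butler.get("flat", instrument="Cam1", detector=2,
                      physical_filter="Cam1-G", collections=coll)

    # Alternate key, fully qualified as dimension.record.
    flat = butler.get("flat", {"detector.full_name": "Ab"},
                      instrument="Cam1", physical_filter="Cam1-G",
                      collections=coll)

    # Unadorned record column: "full_name" is associated with the detector
    # dimension automatically.
    flat = butler.get("flat", full_name="Ab", instrument="Cam1",
                      physical_filter="Cam1-G", collections=coll)
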
128 changes: 127 additions & 1 deletion python/lsst/daf/butler/_butler.py
@@ -34,9 +34,10 @@
)


from collections import defaultdict
from collections import defaultdict, Counter
import contextlib
import logging
import numbers
import os
from typing import (
    Any,
@@ -616,6 +617,131 @@ def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
            else:
                newDataId[k] = v

        # Go through the updated dataId and check the type in case someone is
        # using an alternate key.  We have already filtered out the compound
        # keys of the form dimension.record.
        not_dimensions = {}

        # We need to look in both the dataId and the keyword arguments,
        # removing entries that need to be fixed or are unrecognized.
        for dataIdDict in (newDataId, kwds):
            # Use a list so we can adjust the dict safely in the loop.
            for dimensionName in list(dataIdDict):
                value = dataIdDict[dimensionName]
                try:
                    dimension = self.registry.dimensions[dimensionName]
                except KeyError:
                    # This is not a real dimension.
                    not_dimensions[dimensionName] = value
                    del dataIdDict[dimensionName]
                    continue

                # Convert an integral type to an explicit int to simplify
                # comparisons here.
                if isinstance(value, numbers.Integral):
                    value = int(value)

                if not isinstance(value, dimension.primaryKey.getPythonType()):
                    for alternate in dimension.alternateKeys:
                        if isinstance(value, alternate.getPythonType()):
                            byRecord[dimensionName][alternate.name] = value
                            del dataIdDict[dimensionName]
                            log.debug("Converting dimension %s to %s.%s=%s",
                                      dimensionName, dimensionName, alternate.name, value)
                            break
                    else:
                        log.warning("Type mismatch found for value '%r' provided for dimension %s. "
                                    "Could not find a matching alternative (primary key has type %s) "
                                    "so attempting to use it as-is.",
                                    value, dimensionName, dimension.primaryKey.getPythonType())

        # If we have some unrecognized dimensions we have to try to connect
        # them to records in other dimensions.  This is made more complicated
        # by some dimensions having records with clashing names.  One
        # mitigation is that by this point we can tell which dimensions are
        # missing for the DatasetType, but this does not work for
        # calibrations, where additional dimensions can be used to constrain
        # the temporal axis.
        if not_dimensions:
            # Calculate the missing dimensions.
            provided = set(newDataId) | set(kwds) | set(byRecord)
            missingDimensions = datasetType.dimensions.names - provided

            # For calibrations we may well need temporal dimensions, so
            # rather than always including every dimension in the scan,
            # restrict things a little.  It is still possible for there to
            # be confusion over day_obs in visit vs exposure, for example.
            # If we are not searching calibration collections things may
            # fail, but they are going to fail anyway because of the
            # ambiguity of the dataId...
            candidateDimensions = set()
            candidateDimensions.update(missingDimensions)
            if datasetType.isCalibration():
                for dim in self.registry.dimensions.getStaticDimensions():
                    if dim.temporal:
                        candidateDimensions.add(str(dim))

            # Look-up table for the first association with a dimension.
            guessedAssociation: Dict[Any, Dict[str, Any]] = defaultdict(dict)

            # Keep track of whether an item is associated with multiple
            # dimensions.
            counter = Counter()
            assigned: Dict[Any, Set[str]] = defaultdict(set)

            # Go through the candidate dimensions and associate the
            # given names with record fields within those dimensions.
            for dimensionName in candidateDimensions:
                dimension = self.registry.dimensions[dimensionName]
                fields = dimension.metadata | dimension.uniqueKeys
                for field in not_dimensions:
                    if field in fields:
                        guessedAssociation[dimensionName][field] = not_dimensions[field]
                        counter[dimensionName] += 1
                        assigned[field].add(dimensionName)

            # There is a chance we have allocated a single dataId item
            # to multiple dimensions and need to decide which should be
            # retained.  For now assume that the most popular alternative
            # wins.  This means that day_obs with seq_num will result in
            # exposure.day_obs and not visit.day_obs.  Also prefer an
            # explicitly missing dimension over an inferred temporal
            # dimension.
            for fieldName, assignedDimensions in assigned.items():
                if len(assignedDimensions) > 1:
                    # Pick the most popular (preferring mandatory dimensions).
                    requiredButMissing = assignedDimensions.intersection(missingDimensions)
                    if requiredButMissing:
                        candidateDimensions = requiredButMissing
                    else:
                        candidateDimensions = assignedDimensions

                    # Select the relevant items and get a new restricted
                    # counter.
                    theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
                    duplicatesCounter = Counter()
                    duplicatesCounter.update(theseCounts)

                    # Choose the most common.  If they are equally common
                    # we will pick the one that was found first.
                    # most_common returns a list of (key, count) tuples.
                    selected = duplicatesCounter.most_common(1)[0][0]

                    log.debug("Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
                              " Removed ambiguity by choosing dimension %s.",
                              fieldName, ", ".join(assignedDimensions), selected)

                    for candidateDimension in assignedDimensions:
                        if candidateDimension != selected:
                            del guessedAssociation[candidateDimension][fieldName]

            # Update the record look-up dict with the new associations.
            for dimensionName, values in guessedAssociation.items():
                if values:  # A dict might now be empty.
                    log.debug("Assigned non-dimension dataId keys to dimension %s: %s",
                              dimensionName, values)
                    byRecord[dimensionName].update(values)

        if byRecord:
            # Some record specifiers were found so we need to convert
            # them to the Id form.
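
As an aside, the disambiguation above leans on Counter.most_common. A minimal, self-contained sketch of the "most popular dimension wins" rule, with hypothetical inputs (the real code additionally prefers dimensions the DatasetType actually requires):

    from collections import Counter, defaultdict

    # day_obs matched both exposure and visit; seq_num matched only exposure.
    assigned = {"day_obs": {"exposure", "visit"}, "seq_num": {"exposure"}}
    counter = Counter({"exposure": 2, "visit": 1})
    guessed = defaultdict(dict)
    guessed["exposure"] = {"day_obs": 20201114, "seq_num": 55}
    guessed["visit"] = {"day_obs": 20201114}

    for field, dims in assigned.items():
        if len(dims) > 1:
            these = Counter({d: counter[d] for d in dims})
            # most_common returns (key, count) tuples; ties keep first-seen order.
            selected = these.most_common(1)[0][0]
            for d in dims - {selected}:
                del guessed[d][field]

    print(dict(guessed))  # day_obs stays on exposure, not visit
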
74 changes: 74 additions & 0 deletions tests/test_simpleButler.py
@@ -27,6 +27,11 @@
from typing import Any
import unittest

try:
    import numpy as np
except ImportError:
    np = None

import astropy.time

from lsst.daf.butler import (
@@ -219,6 +224,51 @@ def testCollectionTransfers(self):
             for assoc in registry2.queryDatasetAssociations("bias", collections="calibration1")],
        )

    def testButlerGet(self):
        """Test that butler.get can work with different variants."""

        # Import data to play with.
        butler = self.makeButler(writeable=True)
        butler.import_(filename=os.path.join(TESTDIR, "data", "registry", "base.yaml"))
        butler.import_(filename=os.path.join(TESTDIR, "data", "registry", "datasets.yaml"))

        # Find the DatasetRef for a flat.
        coll = "imported_g"
        flat2g = butler.registry.findDataset("flat", instrument="Cam1", detector=2,
                                             physical_filter="Cam1-G", collections=coll)

        # Create a numpy integer to check that it also works
        # (fall back to a plain int if numpy is unavailable).
        detector_np = np.int64(2) if np else 2

        # Try to get it using different variations of dataId + keyword
        # arguments.
        # Note that instrument.class_name does not work.
        variants = (
            (None, {"instrument": "Cam1", "detector": 2, "physical_filter": "Cam1-G"}),
            (None, {"instrument": "Cam1", "detector": detector_np, "physical_filter": "Cam1-G"}),
            ({"instrument": "Cam1", "detector": 2, "physical_filter": "Cam1-G"}, {}),
            ({"instrument": "Cam1", "detector": detector_np, "physical_filter": "Cam1-G"}, {}),
            ({"instrument": "Cam1", "detector": 2}, {"physical_filter": "Cam1-G"}),
            ({"detector.full_name": "Ab"}, {"instrument": "Cam1", "physical_filter": "Cam1-G"}),
            ({"full_name": "Ab"}, {"instrument": "Cam1", "physical_filter": "Cam1-G"}),
            (None, {"full_name": "Ab", "instrument": "Cam1", "physical_filter": "Cam1-G"}),
            ({"name_in_raft": "b", "raft": "A"}, {"instrument": "Cam1", "physical_filter": "Cam1-G"}),
            ({"name_in_raft": "b"}, {"raft": "A", "instrument": "Cam1", "physical_filter": "Cam1-G"}),
            (None, {"name_in_raft": "b", "raft": "A", "instrument": "Cam1",
                    "physical_filter": "Cam1-G"}),
            ({"detector.name_in_raft": "b", "detector.raft": "A"},
             {"instrument": "Cam1", "physical_filter": "Cam1-G"}),
            ({"detector.name_in_raft": "b", "detector.raft": "A",
              "instrument": "Cam1", "physical_filter": "Cam1-G"}, {}),
        )

        for dataId, kwds in variants:
            try:
                flat_id, _ = butler.get("flat", dataId=dataId, collections=coll, **kwds)
            except Exception as e:
                raise type(e)(f"{str(e)}: dataId={dataId}, kwds={kwds}") from e
            self.assertEqual(flat_id, flat2g.id, msg=f"DataId: {dataId}, kwds: {kwds}")

    def testGetCalibration(self):
        """Test that `Butler.get` can be used to fetch from
        `~CollectionType.CALIBRATION` collections if the data ID includes
@@ -250,13 +300,17 @@ def testGetCalibration(self):
"obs_id": "three",
"timespan": Timespan(t1, t2),
"physical_filter": "Cam1-G",
"day_obs": 20201114,
"seq_num": 55,
},
{
"instrument": "Cam1",
"id": 4,
"obs_id": "four",
"timespan": Timespan(t2, t3),
"physical_filter": "Cam1-G",
"day_obs": 20211114,
"seq_num": 42,
},
)
# Get some biases from raw-like data IDs.
@@ -283,6 +337,26 @@ def testGetCalibration(self):
collections="calibs", instrument="Cam1")
self.assertEqual(bias3b_id, bias3b.id)

# And again but this time using the alternate value rather than
# the primary.
bias3b_id, _ = butler.get("bias", {"exposure": "four",
"detector": "Ba"},
collections="calibs", instrument="Cam1")
self.assertEqual(bias3b_id, bias3b.id)

# And again but this time using the alternate value rather than
# the primary and do it in the keyword arguments.
bias3b_id, _ = butler.get("bias",
exposure="four", detector="Ba",
collections="calibs", instrument="Cam1")
self.assertEqual(bias3b_id, bias3b.id)

# Now with implied record columns
bias3b_id, _ = butler.get("bias", day_obs=20211114, seq_num=42,
raft="B", name_in_raft="a",
collections="calibs", instrument="Cam1")
self.assertEqual(bias3b_id, bias3b.id)


if __name__ == "__main__":
    unittest.main()
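
One detail the numpy variant above exercises: numpy integers are not Python ints, so the dataId-checking code normalizes anything satisfying numbers.Integral before comparing against a dimension's primary-key type. A quick illustration, assuming numpy is installed:

    import numbers

    import numpy as np

    value = np.int64(2)
    print(isinstance(value, int))               # False: numpy ints are not Python ints
    print(isinstance(value, numbers.Integral))  # True: np.int64 is registered as Integral
    value = int(value)                          # the normalization applied in _findDatasetRef
    print(isinstance(value, int))               # True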
