Skip to content

Commit

Permalink
Merge pull request #635 from lsst/tickets/DM-33303
Browse files Browse the repository at this point in the history
DM-33303: Allow DatasetType to differ from the registry on get/put
  • Loading branch information
timj committed Jan 21, 2022
2 parents 580e572 + b31a2de commit 9aeb7c1
Show file tree
Hide file tree
Showing 7 changed files with 93 additions and 20 deletions.
1 change: 1 addition & 0 deletions doc/changes/DM-33303.misc.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
If a `DatasetType` has been constructed that differs from the registry definition, but in a way that is compatible through `StorageClass` conversion, then using it in a `Butler.get()` call will return a Python type that matches the user-specified `StorageClass` instead of the internal Python type.
39 changes: 34 additions & 5 deletions python/lsst/daf/butler/_butler.py
Original file line number Diff line number Diff line change
Expand Up @@ -558,6 +558,7 @@ def _standardizeArgs(
self,
datasetRefOrType: Union[DatasetRef, DatasetType, str],
dataId: Optional[DataId] = None,
for_put: bool = True,
**kwargs: Any,
) -> Tuple[DatasetType, Optional[DataId]]:
"""Standardize the arguments passed to several Butler APIs.
Expand All @@ -571,6 +572,11 @@ def _standardizeArgs(
A `dict` of `Dimension` link name, value pairs that label the
`DatasetRef` within a Collection. When `None`, a `DatasetRef`
should be provided as the second argument.
for_put : `bool`, optional
If `True` this call is invoked as part of a `Butler.put()`.
Otherwise it is assumed to be part of a `Butler.get()`. This
parameter is only relevant if there is dataset type
inconsistency.
**kwargs
Additional keyword arguments used to augment or construct a
`DataCoordinate`. See `DataCoordinate.standardize`
Expand Down Expand Up @@ -615,10 +621,25 @@ def _standardizeArgs(
if externalDatasetType is not None:
internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
if externalDatasetType != internalDatasetType:
raise ValueError(
f"Supplied dataset type ({externalDatasetType}) inconsistent with "
f"registry definition ({internalDatasetType})"
)
# We can allow differences if they are compatible, depending
# on whether this is a get or a put. A get requires that
# the python type associated with the datastore can be
# converted to the user type. A put requires that the user
# supplied python type can be converted to the internal
# type expected by registry.
relevantDatasetType = internalDatasetType
if for_put:
is_compatible = internalDatasetType.is_compatible_with(externalDatasetType)
else:
is_compatible = externalDatasetType.is_compatible_with(internalDatasetType)
relevantDatasetType = externalDatasetType
if not is_compatible:
raise ValueError(
f"Supplied dataset type ({externalDatasetType}) inconsistent with "
f"registry definition ({internalDatasetType})"
)
# Override the internal definition.
internalDatasetType = relevantDatasetType

assert internalDatasetType is not None
return internalDatasetType, dataId
Expand Down Expand Up @@ -973,7 +994,7 @@ def _findDatasetRef(
TypeError
Raised if no collections were provided.
"""
datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs)
if isinstance(datasetRefOrType, DatasetRef):
idNumber = datasetRefOrType.id
else:
Expand Down Expand Up @@ -1021,6 +1042,14 @@ def _findDatasetRef(
f"DatasetRef.id provided ({idNumber}) does not match "
f"id ({ref.id}) in registry in collections {collections}."
)
if datasetType != ref.datasetType:
# If they differ it is because the user explicitly specified
# a compatible dataset type to this call rather than using the
# registry definition. The DatasetRef must therefore be recreated
# using the user definition such that the expected type is
# returned.
ref = DatasetRef(datasetType, ref.dataId, run=ref.run, id=ref.id)

return ref

@transactional
Expand Down
2 changes: 2 additions & 0 deletions python/lsst/daf/butler/configs/storageClasses.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ storageClasses:
pytype: int
StructuredDataDict:
pytype: dict
converters:
lsst.pipe.base.TaskMetadata: lsst.pipe.base.TaskMetadata.to_dict
StructuredDataList:
pytype: list
TablePersistable:
Expand Down
18 changes: 16 additions & 2 deletions python/lsst/daf/butler/core/storageClass.py
Original file line number Diff line number Diff line change
Expand Up @@ -443,12 +443,26 @@ def can_convert(self, other: StorageClass) -> bool:
# Identical storage classes are compatible.
return True

if issubclass(other.pytype, self.pytype):
# It may be that the storage class being compared is not
# available because the python type can't be imported. In that
# case conversion must be impossible.
try:
other_pytype = other.pytype
except Exception:
return False

# Or even this storage class itself can not have the type imported.
try:
self_pytype = self.pytype
except Exception:
return False

if issubclass(other_pytype, self_pytype):
# Storage classes have different names but the same python type.
return True

for candidate_type in self.converters_by_type:
if issubclass(other.pytype, candidate_type):
if issubclass(other_pytype, candidate_type):
return True
return False

Expand Down
5 changes: 3 additions & 2 deletions python/lsst/daf/butler/formatters/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,9 +112,10 @@ def _assembleDataset(self, data: Any, component: Optional[str] = None) -> Any:
# for this pytype.
if not readStorageClass.can_convert(fileDescriptor.storageClass):
raise ValueError(
f"Storage class inconsistency ({readStorageClass.name,} vs"
f"Storage class inconsistency ({readStorageClass.name} vs"
f" {fileDescriptor.storageClass.name}) but no"
" component requested or converter registered"
" component requested or converter registered for"
f" python type {type(data)}"
)
else:
# Concrete composite written as a single file (we hope)
Expand Down
2 changes: 2 additions & 0 deletions tests/config/basic/storageClasses.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ storageClasses:
- slice
StructuredDataDictJson:
pytype: dict
converters:
lsst.daf.butler.tests.MetricsExample: lsst.daf.butler.tests.MetricsExample.exportAsDict
StructuredDataListJson:
pytype: list
StructuredDataDictPickle:
Expand Down
46 changes: 35 additions & 11 deletions tests/test_butler.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,17 +188,18 @@ def create_butler(self, run, storageClass, datasetTypeName):
},
)

# Add a second visit for some later tests
butler.registry.insertDimensionData(
"visit",
{
"instrument": "DummyCamComp",
"id": 424,
"name": "fourtwentyfour",
"physical_filter": "d-r",
"visit_system": 1,
},
)
# Add more visits for some later tests
for visit_id in (424, 425):
butler.registry.insertDimensionData(
"visit",
{
"instrument": "DummyCamComp",
"id": visit_id,
"name": f"fourtwentyfour_{visit_id}",
"physical_filter": "d-r",
"visit_system": 1,
},
)
return butler, datasetType

def runPutGetTest(self, storageClass, datasetTypeName):
Expand Down Expand Up @@ -1387,6 +1388,29 @@ def testPytypePutCoercion(self):
self.assertEqual(test_metric.summary, test_dict["summary"])
self.assertEqual(test_metric.output, test_dict["output"])

# Check that the put still works if a DatasetType is given with
# a definition matching this python type.
registry_type = butler.registry.getDatasetType(datasetTypeName)
this_type = DatasetType(datasetTypeName, registry_type.dimensions, "StructuredDataDictJson")
metric2_ref = butler.put(test_dict, this_type, dataId=dataId, visit=425)
self.assertEqual(metric2_ref.datasetType, registry_type)

# The get will return the type expected by registry.
test_metric2 = butler.getDirect(metric2_ref)
self.assertEqual(get_full_type_name(test_metric2), "lsst.daf.butler.tests.MetricsExample")

# Make a new DatasetRef with the compatible but different DatasetType.
# This should now return a dict.
new_ref = DatasetRef(this_type, metric2_ref.dataId, id=metric2_ref.id, run=metric2_ref.run)
test_dict2 = butler.getDirect(new_ref)
self.assertEqual(get_full_type_name(test_dict2), "dict")

# Get it again with the wrong dataset type definition using get()
# rather than getDirect(). This should be consistent with getDirect()
# behavior and return the type of the DatasetType.
test_dict3 = butler.get(this_type, dataId=dataId, visit=425)
self.assertEqual(get_full_type_name(test_dict3), "dict")

def testPytypeCoercion(self):
"""Test python type coercion on Butler.get and put."""

Expand Down

0 comments on commit 9aeb7c1

Please sign in to comment.