Skip to content

Commit

Permalink
Merge pull request #635 from lsst/tickets/DM-33303
Browse files Browse the repository at this point in the history
DM-33303: Allow DatasetType to differ from the registry on get/put
  • Loading branch information
timj committed Jan 21, 2022
2 parents 580e572 + b31a2de commit 9aeb7c1
Show file tree
Hide file tree
Showing 7 changed files with 93 additions and 20 deletions.
1 change: 1 addition & 0 deletions doc/changes/DM-33303.misc.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
If a `DatasetType` has been constructed that differs from the registry definition, but in a way that is compatible through `StorageClass` conversion, then using it in a `Butler.get()` call will return a Python type that matches the user-specified `StorageClass` instead of the internal Python type.
39 changes: 34 additions & 5 deletions python/lsst/daf/butler/_butler.py
Original file line number Diff line number Diff line change
Expand Up @@ -558,6 +558,7 @@ def _standardizeArgs(
self,
datasetRefOrType: Union[DatasetRef, DatasetType, str],
dataId: Optional[DataId] = None,
for_put: bool = True,
**kwargs: Any,
) -> Tuple[DatasetType, Optional[DataId]]:
"""Standardize the arguments passed to several Butler APIs.
Expand All @@ -571,6 +572,11 @@ def _standardizeArgs(
A `dict` of `Dimension` link name, value pairs that label the
`DatasetRef` within a Collection. When `None`, a `DatasetRef`
should be provided as the second argument.
for_put : `bool`, optional
If `True` this call is invoked as part of a `Butler.put()`.
Otherwise it is assumed to be part of a `Butler.get()`. This
parameter is only relevant if there is dataset type
inconsistency.
**kwargs
Additional keyword arguments used to augment or construct a
`DataCoordinate`. See `DataCoordinate.standardize`
Expand Down Expand Up @@ -615,10 +621,25 @@ def _standardizeArgs(
if externalDatasetType is not None:
internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
if externalDatasetType != internalDatasetType:
raise ValueError(
f"Supplied dataset type ({externalDatasetType}) inconsistent with "
f"registry definition ({internalDatasetType})"
)
# We can allow differences if they are compatible, depending
# on whether this is a get or a put. A get requires that
# the python type associated with the datastore can be
# converted to the user type. A put requires that the user
# supplied python type can be converted to the internal
# type expected by registry.
relevantDatasetType = internalDatasetType
if for_put:
is_compatible = internalDatasetType.is_compatible_with(externalDatasetType)
else:
is_compatible = externalDatasetType.is_compatible_with(internalDatasetType)
relevantDatasetType = externalDatasetType
if not is_compatible:
raise ValueError(
f"Supplied dataset type ({externalDatasetType}) inconsistent with "
f"registry definition ({internalDatasetType})"
)
# Override the internal definition.
internalDatasetType = relevantDatasetType

assert internalDatasetType is not None
return internalDatasetType, dataId
Expand Down Expand Up @@ -973,7 +994,7 @@ def _findDatasetRef(
TypeError
Raised if no collections were provided.
"""
datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs)
if isinstance(datasetRefOrType, DatasetRef):
idNumber = datasetRefOrType.id
else:
Expand Down Expand Up @@ -1021,6 +1042,14 @@ def _findDatasetRef(
f"DatasetRef.id provided ({idNumber}) does not match "
f"id ({ref.id}) in registry in collections {collections}."
)
if datasetType != ref.datasetType:
# If they differ it is because the user explicitly specified
# a compatible dataset type to this call rather than using the
# registry definition. The DatasetRef must therefore be recreated
# using the user definition such that the expected type is
# returned.
ref = DatasetRef(datasetType, ref.dataId, run=ref.run, id=ref.id)

return ref

@transactional
Expand Down
2 changes: 2 additions & 0 deletions python/lsst/daf/butler/configs/storageClasses.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ storageClasses:
pytype: int
StructuredDataDict:
pytype: dict
converters:
lsst.pipe.base.TaskMetadata: lsst.pipe.base.TaskMetadata.to_dict
StructuredDataList:
pytype: list
TablePersistable:
Expand Down
18 changes: 16 additions & 2 deletions python/lsst/daf/butler/core/storageClass.py
Original file line number Diff line number Diff line change
Expand Up @@ -443,12 +443,26 @@ def can_convert(self, other: StorageClass) -> bool:
# Identical storage classes are compatible.
return True

if issubclass(other.pytype, self.pytype):
# It may be that the storage class being compared is not
# available because the python type can't be imported. In that
# case conversion must be impossible.
try:
other_pytype = other.pytype
except Exception:
return False

# Or even this storage class itself can not have the type imported.
try:
self_pytype = self.pytype
except Exception:
return False

if issubclass(other_pytype, self_pytype):
# Storage classes have different names but the same python type.
return True

for candidate_type in self.converters_by_type:
if issubclass(other.pytype, candidate_type):
if issubclass(other_pytype, candidate_type):
return True
return False

Expand Down
5 changes: 3 additions & 2 deletions python/lsst/daf/butler/formatters/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,9 +112,10 @@ def _assembleDataset(self, data: Any, component: Optional[str] = None) -> Any:
# for this pytype.
if not readStorageClass.can_convert(fileDescriptor.storageClass):
raise ValueError(
f"Storage class inconsistency ({readStorageClass.name,} vs"
f"Storage class inconsistency ({readStorageClass.name} vs"
f" {fileDescriptor.storageClass.name}) but no"
" component requested or converter registered"
" component requested or converter registered for"
f" python type {type(data)}"
)
else:
# Concrete composite written as a single file (we hope)
Expand Down
2 changes: 2 additions & 0 deletions tests/config/basic/storageClasses.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ storageClasses:
- slice
StructuredDataDictJson:
pytype: dict
converters:
lsst.daf.butler.tests.MetricsExample: lsst.daf.butler.tests.MetricsExample.exportAsDict
StructuredDataListJson:
pytype: list
StructuredDataDictPickle:
Expand Down
46 changes: 35 additions & 11 deletions tests/test_butler.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,17 +188,18 @@ def create_butler(self, run, storageClass, datasetTypeName):
},
)

# Add a second visit for some later tests
butler.registry.insertDimensionData(
"visit",
{
"instrument": "DummyCamComp",
"id": 424,
"name": "fourtwentyfour",
"physical_filter": "d-r",
"visit_system": 1,
},
)
# Add more visits for some later tests
for visit_id in (424, 425):
butler.registry.insertDimensionData(
"visit",
{
"instrument": "DummyCamComp",
"id": visit_id,
"name": f"fourtwentyfour_{visit_id}",
"physical_filter": "d-r",
"visit_system": 1,
},
)
return butler, datasetType

def runPutGetTest(self, storageClass, datasetTypeName):
Expand Down Expand Up @@ -1387,6 +1388,29 @@ def testPytypePutCoercion(self):
self.assertEqual(test_metric.summary, test_dict["summary"])
self.assertEqual(test_metric.output, test_dict["output"])

# Check that the put still works if a DatasetType is given with
# a definition matching this python type.
registry_type = butler.registry.getDatasetType(datasetTypeName)
this_type = DatasetType(datasetTypeName, registry_type.dimensions, "StructuredDataDictJson")
metric2_ref = butler.put(test_dict, this_type, dataId=dataId, visit=425)
self.assertEqual(metric2_ref.datasetType, registry_type)

# The get will return the type expected by registry.
test_metric2 = butler.getDirect(metric2_ref)
self.assertEqual(get_full_type_name(test_metric2), "lsst.daf.butler.tests.MetricsExample")

# Make a new DatasetRef with the compatible but different DatasetType.
# This should now return a dict.
new_ref = DatasetRef(this_type, metric2_ref.dataId, id=metric2_ref.id, run=metric2_ref.run)
test_dict2 = butler.getDirect(new_ref)
self.assertEqual(get_full_type_name(test_dict2), "dict")

# Get it again with the wrong dataset type definition using get()
# rather than getDirect(). This should be consistent with getDirect()
# behavior and return the type of the DatasetType.
test_dict3 = butler.get(this_type, dataId=dataId, visit=425)
self.assertEqual(get_full_type_name(test_dict3), "dict")

def testPytypeCoercion(self):
"""Test python type coercion on Butler.get and put."""

Expand Down

0 comments on commit 9aeb7c1

Please sign in to comment.