DM-37214: Fix problem with JSON formatter when Pydantic is used but Dict requested #761

Merged · 6 commits · Dec 9, 2022
Changes from 2 commits
103 changes: 77 additions & 26 deletions python/lsst/daf/butler/formatters/file.py
@@ -25,11 +25,12 @@

__all__ = ("FileFormatter",)

import builtins
import dataclasses
from abc import abstractmethod
from typing import TYPE_CHECKING, Any, Optional, Type

from lsst.daf.butler import Formatter
from lsst.utils.introspection import get_full_type_name

if TYPE_CHECKING:
from lsst.daf.butler import StorageClass
@@ -104,35 +105,84 @@ def _assembleDataset(self, data: Any, component: Optional[str] = None) -> Any:
"""
fileDescriptor = self.fileDescriptor

# if read and write storage classes differ, more work is required
# Get the read and write storage classes.
readStorageClass = fileDescriptor.readStorageClass
if readStorageClass != fileDescriptor.storageClass:
if component is None:
# This likely means that type conversion is required but
# it will be an error if no valid converter is available
# for this pytype.
if not readStorageClass.can_convert(fileDescriptor.storageClass):
raise ValueError(
f"Storage class inconsistency ({readStorageClass.name} vs"
f" {fileDescriptor.storageClass.name}) but no"
" component requested or converter registered for"
f" converting type {get_full_type_name(fileDescriptor.storageClass.pytype)}"
f" to {get_full_type_name(readStorageClass.pytype)}."
)
else:
# Concrete composite written as a single file (we hope)
try:
data = fileDescriptor.storageClass.delegate().getComponent(data, component)
except AttributeError:
# Defer the complaint
data = None

# Coerce to the requested type (not necessarily the type that was
# written)
data = self._coerceType(data, fileDescriptor.storageClass, readStorageClass)
writeStorageClass = fileDescriptor.storageClass

if component is not None:
# Requesting a component implies that we need to first ensure
# that the composite is the correct python type. Lie to the
# coercion routine since the read StorageClass is not relevant
# if we want the original.
data = self._coerceType(data, writeStorageClass, writeStorageClass)

# Concrete composite written as a single file (we hope)
# so try to get the component.
try:
data = fileDescriptor.storageClass.delegate().getComponent(data, component)
except AttributeError:
# Defer the complaint
data = None

# Update the write storage class to match that of the component.
# It should be safe to use the component storage class directly
# since that should match what was returned from getComponent
# (else we could create a temporary storage class guaranteed to
# match the python type we have).
writeStorageClass = writeStorageClass.allComponents()[component]

# Coerce to the requested type.
data = self._coerceType(data, writeStorageClass, readStorageClass)

return data

def _coerceBuiltinType(self, inMemoryDataset: Any, writeStorageClass: StorageClass) -> Any:
"""Coerce the supplied inMemoryDataset to the written python type if it
is currently a built-in type.

Parameters
----------
inMemoryDataset : `object`
Object to coerce to expected type.
writeStorageClass : `StorageClass`
Storage class used to serialize this data.

Returns
-------
inMemoryDataset : `object`
Object of expected type ``writeStorageClass.pytype``.
"""
# Only do anything if this is currently a built-in type but it was
# not originally stored as a built-in type.
if (
inMemoryDataset is not None
and not hasattr(builtins, writeStorageClass.pytype.__name__)
and hasattr(builtins, type(inMemoryDataset).__name__)
):
if not isinstance(inMemoryDataset, writeStorageClass.pytype):
# Try different ways of converting to the required type.
if hasattr(writeStorageClass.pytype, "parse_obj"):
# This is for a Pydantic model.
inMemoryDataset = writeStorageClass.pytype.parse_obj(inMemoryDataset)
elif dataclasses.is_dataclass(writeStorageClass.pytype):
# Dataclasses accept key/value parameters.
inMemoryDataset = writeStorageClass.pytype(**inMemoryDataset)
elif isinstance(inMemoryDataset, dict):
if writeStorageClass.isComposite():
# Assume that this type can be constructed
# using the registered assembler from a dict.
inMemoryDataset = writeStorageClass.delegate().assemble(
inMemoryDataset, pytype=writeStorageClass.pytype
)
else:
# Unpack the dict and hope that works.
inMemoryDataset = writeStorageClass.pytype(**inMemoryDataset)
else:
# Hope that we can pass the arguments in directly.
inMemoryDataset = writeStorageClass.pytype(inMemoryDataset)

return inMemoryDataset

def _coerceType(
self, inMemoryDataset: Any, writeStorageClass: StorageClass, readStorageClass: StorageClass
) -> Any:
@@ -152,6 +202,7 @@ def _coerceType(
inMemoryDataset : `object`
Object of expected type ``readStorageClass.pytype``.
"""
inMemoryDataset = self._coerceBuiltinType(inMemoryDataset, writeStorageClass)
return readStorageClass.coerce_type(inMemoryDataset)

def read(self, component: Optional[str] = None) -> Any:
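For intuition, the decision tree in _coerceBuiltinType can be sketched outside the butler. Below is a minimal, self-contained approximation, assuming the pydantic v1 parse_obj API that the new code calls; the Metrics model and coerce_builtin helper are illustrative only, and the composite branch is omitted because it needs a StorageClass delegate.

import builtins
import dataclasses

from pydantic import BaseModel


class Metrics(BaseModel):
    counts: dict[str, int] = {}


def coerce_builtin(obj, pytype):
    # Mirror of _coerceBuiltinType: only act when the value is currently a
    # built-in type but the written type was not itself a built-in type.
    if obj is None or hasattr(builtins, pytype.__name__):
        return obj
    if not hasattr(builtins, type(obj).__name__) or isinstance(obj, pytype):
        return obj
    if hasattr(pytype, "parse_obj"):
        # Pydantic model.
        return pytype.parse_obj(obj)
    if dataclasses.is_dataclass(pytype):
        # Dataclasses accept key/value parameters.
        return pytype(**obj)
    if isinstance(obj, dict):
        # Unpack the dict and hope that works (composite branch omitted).
        return pytype(**obj)
    # Hope that we can pass the argument in directly.
    return pytype(obj)


# A dict read back from JSON is rebuilt into the model before any
# read-storage-class conversion happens.
assert coerce_builtin({"counts": {"x": 1}}, Metrics) == Metrics(counts={"x": 1})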
49 changes: 1 addition & 48 deletions python/lsst/daf/butler/formatters/json.py
@@ -23,16 +23,11 @@

__all__ = ("JsonFormatter",)

import builtins
import dataclasses
import json
from typing import TYPE_CHECKING, Any, Optional, Type
from typing import Any, Optional, Type

from .file import FileFormatter

if TYPE_CHECKING:
from lsst.daf.butler import StorageClass


class JsonFormatter(FileFormatter):
"""Formatter implementation for JSON files."""
@@ -130,45 +125,3 @@ def _toBytes(self, inMemoryDataset: Any) -> bytes:
if hasattr(inMemoryDataset, "_asdict"):
inMemoryDataset = inMemoryDataset._asdict()
return json.dumps(inMemoryDataset, ensure_ascii=False).encode()

def _coerceType(
self, inMemoryDataset: Any, writeStorageClass: StorageClass, readStorageClass: StorageClass
) -> Any:
"""Coerce the supplied inMemoryDataset to the correct python type.

Parameters
----------
inMemoryDataset : `object`
Object to coerce to expected type.
writeStorageClass : `StorageClass`
Storage class used to serialize this data.
readStorageClass : `StorageClass`
Storage class requested as the outcome.

Returns
-------
inMemoryDataset : `object`
Object of expected type ``readStorageClass.pytype``.
"""
if inMemoryDataset is not None and not hasattr(builtins, readStorageClass.pytype.__name__):
if writeStorageClass.isComposite():
# We know we must be able to assemble the written
# storage class. Coerce later to the read type.
inMemoryDataset = writeStorageClass.delegate().assemble(
inMemoryDataset, pytype=writeStorageClass.pytype
)
elif not isinstance(inMemoryDataset, readStorageClass.pytype):
# JSON data are returned as simple python types.
# The content will match the written storage class.
# Pydantic models have their own scheme.
try:
inMemoryDataset = writeStorageClass.pytype.parse_obj(inMemoryDataset)
except AttributeError:
if dataclasses.is_dataclass(writeStorageClass.pytype):
# dataclasses accept key/value parameters.
inMemoryDataset = writeStorageClass.pytype(**inMemoryDataset)
else:
# Hope that we can pass the arguments in directly
inMemoryDataset = writeStorageClass.pytype(inMemoryDataset)
# Coerce to the read storage class if necessary.
return readStorageClass.coerce_type(inMemoryDataset)
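The override deleted above existed because a JSON round trip discards the original python type: everything comes back as built-in containers. A small sketch of that effect (the Event model is hypothetical, pydantic v1 API assumed), which the shared _coerceBuiltinType path in file.py now handles for all formatters:

import json

from pydantic import BaseModel


class Event(BaseModel):
    name: str
    count: int


written = Event(name="exposure", count=3)
restored = json.loads(json.dumps(written.dict()))
assert restored == {"name": "exposure", "count": 3}
assert type(restored) is dict  # the model type is gone after the round trip
assert Event.parse_obj(restored) == written  # what the coercion step restores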
46 changes: 1 addition & 45 deletions python/lsst/daf/butler/formatters/yaml.py
@@ -23,17 +23,13 @@

__all__ = ("YamlFormatter",)

import builtins
import dataclasses
from typing import TYPE_CHECKING, Any, Optional, Type
from typing import Any, Optional, Type

import yaml

from .file import FileFormatter

if TYPE_CHECKING:
from lsst.daf.butler import StorageClass


class YamlFormatter(FileFormatter):
"""Formatter implementation for YAML files."""
@@ -166,43 +162,3 @@ def _toBytes(self, inMemoryDataset: Any) -> bytes:
else:
serialized = yaml.safe_dump(inMemoryDataset)
return serialized.encode()

def _coerceType(
self, inMemoryDataset: Any, writeStorageClass: StorageClass, readStorageClass: StorageClass
) -> Any:
"""Coerce the supplied inMemoryDataset to the correct python type.

Parameters
----------
inMemoryDataset : `object`
Object to coerce to expected type.
writeStorageClass : `StorageClass`
Storage class used to serialize this data.
readStorageClass : `StorageClass`
Storage class requested as the outcome.

Returns
-------
inMemoryDataset : `object`
Object of expected type ``readStorageClass.pytype``.
"""
if inMemoryDataset is not None and not hasattr(builtins, readStorageClass.pytype.__name__):
if writeStorageClass.isComposite():
# We know that the write storage class should work,
# then we convert to read storage class.
inMemoryDataset = writeStorageClass.delegate().assemble(
inMemoryDataset, pytype=writeStorageClass.pytype
)
elif not isinstance(inMemoryDataset, readStorageClass.pytype):
if not isinstance(inMemoryDataset, writeStorageClass.pytype):
# This does not look like the written type or the required
# type. Try to cast it to the written type and then coerce
# to requested type.
if dataclasses.is_dataclass(writeStorageClass.pytype):
# dataclasses accept key/value parameters.
inMemoryDataset = writeStorageClass.pytype(**inMemoryDataset)
else:
# Hope that we can pass the arguments in directly.
inMemoryDataset = writeStorageClass.pytype(inMemoryDataset)
# Coerce to the read storage class if necessary.
return readStorageClass.coerce_type(inMemoryDataset)
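YAML has the same property: yaml.safe_load returns built-in containers, which is why YamlFormatter can also drop its override in favour of the shared base-class path. A sketch with a hypothetical dataclass (the dataclass branch in file.py unpacks key/value parameters):

import dataclasses

import yaml


@dataclasses.dataclass
class Exposure:
    day_obs: int
    instrument: str


original = Exposure(day_obs=20221209, instrument="LATISS")
serialized = yaml.safe_dump(dataclasses.asdict(original))
loaded = yaml.safe_load(serialized)
assert type(loaded) is dict  # built-in type; the original class is forgotten
assert Exposure(**loaded) == original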
71 changes: 71 additions & 0 deletions python/lsst/daf/butler/tests/dict_convertible_model.py
@@ -0,0 +1,71 @@
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ()

from collections.abc import Mapping

from pydantic import BaseModel, Field


class DictConvertibleModel(BaseModel):
"""A pydantic model to/from dict conversion in which the dict
representation is intentionally different from pydantics' own dict
conversions.
"""

content: dict[str, str] = Field(default_factory=dict)
"""Content of the logical dict that this object converts to (`dict`).
"""

extra: str = Field(default="")
"""Extra content that is not included in the dict representation (`str`).
"""

@classmethod
def from_dict(cls, content: Mapping[str, str], extra: str = "from_dict") -> "DictConvertibleModel":
"""Construct an instance from a `dict`.

Parameters
----------
content : `~collections.abc.Mapping`
Content of the logical dict that this object converts to.
extra : `str`, optional
Extra content that is not included in the dict representation.

Returns
-------
model : `DictConvertibleModel`
New model.
"""
return cls(content=dict(content), extra=extra)

def to_dict(self) -> dict[str, str]:
"""Convert the model to a dictionary.

Returns
-------
content : `dict`
Copy of ``self.content``.
"""
return self.content.copy()
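The point of the asymmetry: to_dict/from_dict deliberately disagree with pydantic's own serialization, so a test can tell which path produced a given dict. A short usage sketch (pydantic v1 .dict() assumed):

model = DictConvertibleModel.from_dict({"a": "one"}, extra="kept on the model")
assert model.to_dict() == {"a": "one"}
assert model.dict() == {"content": {"a": "one"}, "extra": "kept on the model"}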
1 change: 1 addition & 0 deletions tests/config/basic/formatters.yaml
@@ -37,3 +37,4 @@ StructuredCompositeReadComp: lsst.daf.butler.tests.testFormatters.MetricsExample
StructuredCompositeReadCompNoDisassembly: lsst.daf.butler.tests.testFormatters.MetricsExampleFormatter
StructuredDataDataTest: lsst.daf.butler.tests.testFormatters.MetricsExampleDataFormatter
StructuredDataNoComponentsModel: lsst.daf.butler.formatters.json.JsonFormatter
DictConvertibleModel: lsst.daf.butler.formatters.json.JsonFormatter
12 changes: 12 additions & 0 deletions tests/config/basic/storageClasses.yaml
@@ -113,3 +113,15 @@ storageClasses:
data: StructuredDataDataTestTuple
converters:
lsst.daf.butler.tests.MetricsExample: lsst.daf.butler.tests.MetricsExampleModel.from_metrics
DictConvertibleModel:
# Special storage class to test Pydantic model conversions to/from dict,
# when that's also what the formatter returns natively.
pytype: lsst.daf.butler.tests.dict_convertible_model.DictConvertibleModel
converters:
dict: lsst.daf.butler.tests.dict_convertible_model.DictConvertibleModel.from_dict
NativeDictForConvertibleModel:
# Variant of StructuredDataDict for tests with DictConvertibleModel, to
# avoid disruption from unexpected conversions in other storage classes.
pytype: dict
converters:
lsst.daf.butler.tests.dict_convertible_model.DictConvertibleModel: lsst.daf.butler.tests.dict_convertible_model.DictConvertibleModel.to_dict
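In spirit, each converters entry maps a source python type to a callable that produces the storage class's pytype. A rough sketch of the lookup (illustrative only; not the real StorageClass API):

from lsst.daf.butler.tests.dict_convertible_model import DictConvertibleModel

# Mirrors the DictConvertibleModel entry above: "to get a
# DictConvertibleModel from a dict, call from_dict".
converters = {dict: DictConvertibleModel.from_dict}

stored = {"a": "one"}
model = converters[type(stored)](stored)
assert isinstance(model, DictConvertibleModel)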
20 changes: 20 additions & 0 deletions tests/test_datastore.py
@@ -35,6 +35,7 @@
import yaml
from lsst.daf.butler import (
Config,
DataCoordinate,
DatasetRef,
DatasetRefURIs,
DatasetTypeNotSupportedError,
@@ -59,6 +60,7 @@
DummyRegistry,
MetricsExample,
)
from lsst.daf.butler.tests.dict_convertible_model import DictConvertibleModel
from lsst.resources import ResourcePath
from lsst.utils import doImport

@@ -902,6 +904,24 @@ def testExport(self):
with self.assertRaises(FileNotFoundError):
list(datastore.export(refs + [ref], transfer=None))

def test_pydantic_dict_storage_class_conversions(self):
"""Test converting a dataset stored as a pydantic model into a dict on
read.
"""
datastore = self.makeDatastore()
store_as_model = self.makeDatasetRef(
"store_as_model",
dimensions=self.universe.empty,
storageClass="DictConvertibleModel",
dataId=DataCoordinate.makeEmpty(self.universe),
)
content = {"a": "one", "b": "two"}
model = DictConvertibleModel.from_dict(content, extra="original content")
datastore.put(model, store_as_model)
loaded = datastore.get(store_as_model.overrideStorageClass("NativeDictForConvertibleModel"))
self.assertEqual(type(loaded), dict)
self.assertEqual(loaded, content)
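
A hypothetical companion check (not part of this PR, and assuming a formatter is also registered for NativeDictForConvertibleModel) would exercise the reverse direction, storing the plain dict and reading it back as the model via the from_dict converter registered above; sketched in the same style as the test:

store_as_dict = self.makeDatasetRef(
    "store_as_dict",
    dimensions=self.universe.empty,
    storageClass="NativeDictForConvertibleModel",
    dataId=DataCoordinate.makeEmpty(self.universe),
)
datastore.put(content, store_as_dict)
loaded_model = datastore.get(store_as_dict.overrideStorageClass("DictConvertibleModel"))
self.assertEqual(loaded_model.to_dict(), content)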


class PosixDatastoreTestCase(DatastoreTests, unittest.TestCase):
"""PosixDatastore specialization"""