DM-37214: Fix problem with JSON formatter when Pydantic is used but Dict requested #761

Merged · 6 commits · Dec 9, 2022
Changes from 2 commits
103 changes: 77 additions & 26 deletions python/lsst/daf/butler/formatters/file.py
@@ -25,11 +25,12 @@

__all__ = ("FileFormatter",)

import builtins
import dataclasses
from abc import abstractmethod
from typing import TYPE_CHECKING, Any, Optional, Type

from lsst.daf.butler import Formatter
from lsst.utils.introspection import get_full_type_name

if TYPE_CHECKING:
from lsst.daf.butler import StorageClass
@@ -104,35 +105,84 @@ def _assembleDataset(self, data: Any, component: Optional[str] = None) -> Any:
"""
fileDescriptor = self.fileDescriptor

# if read and write storage classes differ, more work is required
# Get the read and write storage classes.
readStorageClass = fileDescriptor.readStorageClass
if readStorageClass != fileDescriptor.storageClass:
if component is None:
# This likely means that type conversion is required but
# it will be an error if no valid converter is available
# for this pytype.
if not readStorageClass.can_convert(fileDescriptor.storageClass):
raise ValueError(
f"Storage class inconsistency ({readStorageClass.name} vs"
f" {fileDescriptor.storageClass.name}) but no"
" component requested or converter registered for"
f" converting type {get_full_type_name(fileDescriptor.storageClass.pytype)}"
f" to {get_full_type_name(readStorageClass.pytype)}."
)
else:
# Concrete composite written as a single file (we hope)
try:
data = fileDescriptor.storageClass.delegate().getComponent(data, component)
except AttributeError:
# Defer the complaint
data = None

# Coerce to the requested type (not necessarily the type that was
# written)
data = self._coerceType(data, fileDescriptor.storageClass, readStorageClass)
writeStorageClass = fileDescriptor.storageClass

if component is not None:
# Requesting a component implies that we need to first ensure
# that the composite is the correct python type. Lie to the
# coercion routine since the read StorageClass is not relevant
# if we want the original.
data = self._coerceType(data, writeStorageClass, writeStorageClass)

# Concrete composite written as a single file (we hope)
# so try to get the component.
try:
data = fileDescriptor.storageClass.delegate().getComponent(data, component)
except AttributeError:
# Defer the complaint
data = None

# Update the write storage class to match that of the component.
# It should be safe to use the component storage class directly
# since that should match what was returned from getComponent
# (else we could create a temporary storage class guaranteed to
# match the python type we have).
writeStorageClass = writeStorageClass.allComponents()[component]

# Coerce to the requested type.
data = self._coerceType(data, writeStorageClass, readStorageClass)

return data

def _coerceBuiltinType(self, inMemoryDataset: Any, writeStorageClass: StorageClass) -> Any:
"""Coerce the supplied inMemoryDataset to the written python type if it
is currently a built-in type.

Parameters
----------
inMemoryDataset : `object`
Object to coerce to expected type.
writeStorageClass : `StorageClass`
Storage class used to serialize this data.

Returns
-------
inMemoryDataset : `object`
Object of expected type ``writeStorageClass.pytype``.
"""
# Only do anything if this is currently a built-in type but it was
# not originally stored as a built-in type.
if (
inMemoryDataset is not None
and not hasattr(builtins, writeStorageClass.pytype.__name__)
and hasattr(builtins, type(inMemoryDataset).__name__)
):
if not isinstance(inMemoryDataset, writeStorageClass.pytype):
# Try different ways of converting to the required type.
if hasattr(writeStorageClass.pytype, "parse_obj"):
# This is for a Pydantic model.
inMemoryDataset = writeStorageClass.pytype.parse_obj(inMemoryDataset)
elif dataclasses.is_dataclass(writeStorageClass.pytype):
# Dataclasses accept key/value parameters.
inMemoryDataset = writeStorageClass.pytype(**inMemoryDataset)
elif isinstance(inMemoryDataset, dict):
if writeStorageClass.isComposite():
# Assume that this type can be constructed
# using the registered assembler from a dict.
inMemoryDataset = writeStorageClass.delegate().assemble(
inMemoryDataset, pytype=writeStorageClass.pytype
)
else:
# Unpack the dict and hope that works.
inMemoryDataset = writeStorageClass.pytype(**inMemoryDataset)
else:
# Hope that we can pass the arguments in directly.
inMemoryDataset = writeStorageClass.pytype(inMemoryDataset)

return inMemoryDataset

def _coerceType(
self, inMemoryDataset: Any, writeStorageClass: StorageClass, readStorageClass: StorageClass
) -> Any:
@@ -152,6 +202,7 @@ def _coerceType(
inMemoryDataset : `object`
Object of expected type ``readStorageClass.pytype``.
"""
inMemoryDataset = self._coerceBuiltinType(inMemoryDataset, writeStorageClass)
return readStorageClass.coerce_type(inMemoryDataset)

def read(self, component: Optional[str] = None) -> Any:
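For intuition, the decision tree in _coerceBuiltinType can be sketched outside the butler. Below is a minimal, self-contained approximation, assuming the pydantic v1 parse_obj API that the new code calls; the Metrics model and coerce_builtin helper are illustrative only, and the composite branch is omitted because it needs a StorageClass delegate.

import builtins
import dataclasses

from pydantic import BaseModel


class Metrics(BaseModel):
    counts: dict[str, int] = {}


def coerce_builtin(obj, pytype):
    # Mirror of _coerceBuiltinType: only act when the value is currently a
    # built-in type but the written type was not itself a built-in type.
    if obj is None or hasattr(builtins, pytype.__name__):
        return obj
    if not hasattr(builtins, type(obj).__name__) or isinstance(obj, pytype):
        return obj
    if hasattr(pytype, "parse_obj"):
        # Pydantic model.
        return pytype.parse_obj(obj)
    if dataclasses.is_dataclass(pytype):
        # Dataclasses accept key/value parameters.
        return pytype(**obj)
    if isinstance(obj, dict):
        # Unpack the dict and hope that works (composite branch omitted).
        return pytype(**obj)
    # Hope that we can pass the argument in directly.
    return pytype(obj)


# A dict read back from JSON is rebuilt into the model before any
# read-storage-class conversion happens.
assert coerce_builtin({"counts": {"x": 1}}, Metrics) == Metrics(counts={"x": 1})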
49 changes: 1 addition & 48 deletions python/lsst/daf/butler/formatters/json.py
@@ -23,16 +23,11 @@

__all__ = ("JsonFormatter",)

import builtins
import dataclasses
import json
from typing import TYPE_CHECKING, Any, Optional, Type
from typing import Any, Optional, Type

from .file import FileFormatter

if TYPE_CHECKING:
from lsst.daf.butler import StorageClass


class JsonFormatter(FileFormatter):
"""Formatter implementation for JSON files."""
@@ -130,45 +125,3 @@ def _toBytes(self, inMemoryDataset: Any) -> bytes:
if hasattr(inMemoryDataset, "_asdict"):
inMemoryDataset = inMemoryDataset._asdict()
return json.dumps(inMemoryDataset, ensure_ascii=False).encode()

def _coerceType(
self, inMemoryDataset: Any, writeStorageClass: StorageClass, readStorageClass: StorageClass
) -> Any:
"""Coerce the supplied inMemoryDataset to the correct python type.

Parameters
----------
inMemoryDataset : `object`
Object to coerce to expected type.
writeStorageClass : `StorageClass`
Storage class used to serialize this data.
readStorageClass : `StorageClass`
Storage class requested as the outcome.

Returns
-------
inMemoryDataset : `object`
Object of expected type ``readStorageClass.pytype``.
"""
if inMemoryDataset is not None and not hasattr(builtins, readStorageClass.pytype.__name__):
if writeStorageClass.isComposite():
# We know we must be able to assemble the written
# storage class. Coerce later to the read type.
inMemoryDataset = writeStorageClass.delegate().assemble(
inMemoryDataset, pytype=writeStorageClass.pytype
)
elif not isinstance(inMemoryDataset, readStorageClass.pytype):
# JSON data are returned as simple python types.
# The content will match the written storage class.
# Pydantic models have their own scheme.
try:
inMemoryDataset = writeStorageClass.pytype.parse_obj(inMemoryDataset)
except AttributeError:
if dataclasses.is_dataclass(writeStorageClass.pytype):
# dataclasses accept key/value parameters.
inMemoryDataset = writeStorageClass.pytype(**inMemoryDataset)
else:
# Hope that we can pass the arguments in directly
inMemoryDataset = writeStorageClass.pytype(inMemoryDataset)
# Coerce to the read storage class if necessary.
return readStorageClass.coerce_type(inMemoryDataset)
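The override deleted above existed because a JSON round trip discards the original python type: everything comes back as built-in containers. A small sketch of that effect (the Event model is hypothetical, pydantic v1 API assumed), which the shared _coerceBuiltinType path in file.py now handles for all formatters:

import json

from pydantic import BaseModel


class Event(BaseModel):
    name: str
    count: int


written = Event(name="exposure", count=3)
restored = json.loads(json.dumps(written.dict()))
assert restored == {"name": "exposure", "count": 3}
assert type(restored) is dict  # the model type is gone after the round trip
assert Event.parse_obj(restored) == written  # what the coercion step restores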
46 changes: 1 addition & 45 deletions python/lsst/daf/butler/formatters/yaml.py
@@ -23,17 +23,13 @@

__all__ = ("YamlFormatter",)

import builtins
import dataclasses
from typing import TYPE_CHECKING, Any, Optional, Type
from typing import Any, Optional, Type

import yaml

from .file import FileFormatter

if TYPE_CHECKING:
from lsst.daf.butler import StorageClass


class YamlFormatter(FileFormatter):
"""Formatter implementation for YAML files."""
@@ -166,43 +162,3 @@ def _toBytes(self, inMemoryDataset: Any) -> bytes:
else:
serialized = yaml.safe_dump(inMemoryDataset)
return serialized.encode()

def _coerceType(
self, inMemoryDataset: Any, writeStorageClass: StorageClass, readStorageClass: StorageClass
) -> Any:
"""Coerce the supplied inMemoryDataset to the correct python type.

Parameters
----------
inMemoryDataset : `object`
Object to coerce to expected type.
writeStorageClass : `StorageClass`
Storage class used to serialize this data.
readStorageClass : `StorageClass`
Storage class requested as the outcome.

Returns
-------
inMemoryDataset : `object`
Object of expected type ``readStorageClass.pytype``.
"""
if inMemoryDataset is not None and not hasattr(builtins, readStorageClass.pytype.__name__):
if writeStorageClass.isComposite():
# We know that the write storage class should work,
# then we convert to read storage class.
inMemoryDataset = writeStorageClass.delegate().assemble(
inMemoryDataset, pytype=writeStorageClass.pytype
)
elif not isinstance(inMemoryDataset, readStorageClass.pytype):
if not isinstance(inMemoryDataset, writeStorageClass.pytype):
# This does not look like the written type or the required
# type. Try to cast it to the written type and then coerce
# to requested type.
if dataclasses.is_dataclass(writeStorageClass.pytype):
# dataclasses accept key/value parameters.
inMemoryDataset = writeStorageClass.pytype(**inMemoryDataset)
else:
# Hope that we can pass the arguments in directly.
inMemoryDataset = writeStorageClass.pytype(inMemoryDataset)
# Coerce to the read storage class if necessary.
return readStorageClass.coerce_type(inMemoryDataset)
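YAML has the same property: yaml.safe_load returns built-in containers, which is why YamlFormatter can also drop its override in favour of the shared base-class path. A sketch with a hypothetical dataclass (the dataclass branch in file.py unpacks key/value parameters):

import dataclasses

import yaml


@dataclasses.dataclass
class Exposure:
    day_obs: int
    instrument: str


original = Exposure(day_obs=20221209, instrument="LATISS")
serialized = yaml.safe_dump(dataclasses.asdict(original))
loaded = yaml.safe_load(serialized)
assert type(loaded) is dict  # built-in type; the original class is forgotten
assert Exposure(**loaded) == original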
71 changes: 71 additions & 0 deletions python/lsst/daf/butler/tests/dict_convertible_model.py
@@ -0,0 +1,71 @@
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ()

from collections.abc import Mapping

from pydantic import BaseModel, Field


class DictConvertibleModel(BaseModel):
"""A pydantic model to/from dict conversion in which the dict
representation is intentionally different from pydantics' own dict
conversions.
"""

content: dict[str, str] = Field(default_factory=dict)
"""Content of the logical dict that this object converts to (`dict`).
"""

extra: str = Field(default="")
"""Extra content that is not included in the dict representation (`str`).
"""

@classmethod
def from_dict(cls, content: Mapping[str, str], extra: str = "from_dict") -> "DictConvertibleModel":
"""Construct an instance from a `dict`.

Parameters
----------
content : `~collections.abc.Mapping`
Content of the logical dict that this object converts to.
extra : `str`, optional
Extra content that is not included in the dict representation.

Returns
-------
model : `DictConvertibleModel`
New model.
"""
return cls(content=dict(content), extra=extra)

def to_dict(self) -> dict[str, str]:
"""Convert the model to a dictionary.

Returns
-------
content : `dict`
Copy of ``self.content``.
"""
return self.content.copy()
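The point of the asymmetry: to_dict/from_dict deliberately disagree with pydantic's own serialization, so a test can tell which path produced a given dict. A short usage sketch (pydantic v1 .dict() assumed):

model = DictConvertibleModel.from_dict({"a": "one"}, extra="kept on the model")
assert model.to_dict() == {"a": "one"}
assert model.dict() == {"content": {"a": "one"}, "extra": "kept on the model"}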
1 change: 1 addition & 0 deletions tests/config/basic/formatters.yaml
@@ -37,3 +37,4 @@ StructuredCompositeReadComp: lsst.daf.butler.tests.testFormatters.MetricsExample
StructuredCompositeReadCompNoDisassembly: lsst.daf.butler.tests.testFormatters.MetricsExampleFormatter
StructuredDataDataTest: lsst.daf.butler.tests.testFormatters.MetricsExampleDataFormatter
StructuredDataNoComponentsModel: lsst.daf.butler.formatters.json.JsonFormatter
DictConvertibleModel: lsst.daf.butler.formatters.json.JsonFormatter
12 changes: 12 additions & 0 deletions tests/config/basic/storageClasses.yaml
@@ -113,3 +113,15 @@ storageClasses:
data: StructuredDataDataTestTuple
converters:
lsst.daf.butler.tests.MetricsExample: lsst.daf.butler.tests.MetricsExampleModel.from_metrics
DictConvertibleModel:
# Special storage class to test Pydantic model conversions to/from dict,
# when that's also what the formatter returns natively.
pytype: lsst.daf.butler.tests.dict_convertible_model.DictConvertibleModel
converters:
dict: lsst.daf.butler.tests.dict_convertible_model.DictConvertibleModel.from_dict
NativeDictForConvertibleModel:
# Variant of StructuredDataDict for tests with DictConvertibleModel, to
# avoid disruption from unexpected conversions in other storage classes.
pytype: dict
converters:
lsst.daf.butler.tests.dict_convertible_model.DictConvertibleModel: lsst.daf.butler.tests.dict_convertible_model.DictConvertibleModel.to_dict
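In spirit, each converters entry maps a source python type to a callable that produces the storage class's pytype. A rough sketch of the lookup (illustrative only; not the real StorageClass API):

from lsst.daf.butler.tests.dict_convertible_model import DictConvertibleModel

# Mirrors the DictConvertibleModel entry above: "to get a
# DictConvertibleModel from a dict, call from_dict".
converters = {dict: DictConvertibleModel.from_dict}

stored = {"a": "one"}
model = converters[type(stored)](stored)
assert isinstance(model, DictConvertibleModel)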
20 changes: 20 additions & 0 deletions tests/test_datastore.py
@@ -35,6 +35,7 @@
import yaml
from lsst.daf.butler import (
Config,
DataCoordinate,
DatasetRef,
DatasetRefURIs,
DatasetTypeNotSupportedError,
@@ -59,6 +60,7 @@
DummyRegistry,
MetricsExample,
)
from lsst.daf.butler.tests.dict_convertible_model import DictConvertibleModel
from lsst.resources import ResourcePath
from lsst.utils import doImport

@@ -902,6 +904,24 @@ def testExport(self):
with self.assertRaises(FileNotFoundError):
list(datastore.export(refs + [ref], transfer=None))

def test_pydantic_dict_storage_class_conversions(self):
"""Test converting a dataset stored as a pydantic model into a dict on
read.
"""
datastore = self.makeDatastore()
store_as_model = self.makeDatasetRef(
"store_as_model",
dimensions=self.universe.empty,
storageClass="DictConvertibleModel",
dataId=DataCoordinate.makeEmpty(self.universe),
)
content = {"a": "one", "b": "two"}
model = DictConvertibleModel.from_dict(content, extra="original content")
datastore.put(model, store_as_model)
loaded = datastore.get(store_as_model.overrideStorageClass("NativeDictForConvertibleModel"))
self.assertEqual(type(loaded), dict)
self.assertEqual(loaded, content)
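
A hypothetical companion check (not part of this PR, and assuming a formatter is also registered for NativeDictForConvertibleModel) would exercise the reverse direction, storing the plain dict and reading it back as the model via the from_dict converter registered above; sketched in the same style as the test:

store_as_dict = self.makeDatasetRef(
    "store_as_dict",
    dimensions=self.universe.empty,
    storageClass="NativeDictForConvertibleModel",
    dataId=DataCoordinate.makeEmpty(self.universe),
)
datastore.put(content, store_as_dict)
loaded_model = datastore.get(store_as_dict.overrideStorageClass("DictConvertibleModel"))
self.assertEqual(loaded_model.to_dict(), content)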


class PosixDatastoreTestCase(DatastoreTests, unittest.TestCase):
"""PosixDatastore specialization"""