Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DM-4551: Allow override of storage class on butler.get() #737

Merged
merged 17 commits into from
Oct 4, 2022
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/changes/DM-4551.api.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
A method has been added to ``DatasetRef`` and ``DatasetType``, named ``overrideStorageClass``, to allow a new object to be created that has a different storage class associated with it.
9 changes: 9 additions & 0 deletions doc/changes/DM-4551.feature.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
It is now possible to override the python type returned by ``butler.get()`` (if the types are compatible with each other) by using the new ``storageClass`` parameter.

For example, to return an `astropy.table.Table` from something that usually returns an ``lsst.afw.table.Catalog`` you would do:

.. code-block:: python

    table = butler.getDirect(ref, storageClass="AstropyTable")

Any parameters given to ``get()`` must still refer to the native storage class.
49 changes: 43 additions & 6 deletions python/lsst/daf/butler/_butler.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@
DimensionUniverse,
FileDataset,
Progress,
StorageClass,
StorageClassFactory,
Timespan,
ValidationError,
Expand Down Expand Up @@ -1229,7 +1230,13 @@ def put(

return ref

def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any:
def getDirect(
self,
ref: DatasetRef,
*,
parameters: Optional[Dict[str, Any]] = None,
storageClass: Optional[Union[StorageClass, str]] = None,
) -> Any:
"""Retrieve a stored dataset.

Unlike `Butler.get`, this method allows datasets outside the Butler's
Expand All @@ -1243,16 +1250,26 @@ def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = N
parameters : `dict`
Additional StorageClass-defined options to control reading,
typically used to efficiently read only a subset of the dataset.
storageClass : `StorageClass` or `str`, optional
The storage class to be used to override the Python type
returned by this method. By default the returned type matches
the dataset type definition for this dataset. Specifying a
read `StorageClass` can force a different type to be returned.
This type must be compatible with the original type.

Returns
-------
obj : `object`
The dataset.
"""
return self.datastore.get(ref, parameters=parameters)
return self.datastore.get(ref, parameters=parameters, storageClass=storageClass)

def getDirectDeferred(
timj marked this conversation as resolved.
Show resolved Hide resolved
self, ref: DatasetRef, *, parameters: Union[dict, None] = None
self,
ref: DatasetRef,
*,
parameters: Union[dict, None] = None,
storageClass: str | StorageClass | None = None,
) -> DeferredDatasetHandle:
"""Create a `DeferredDatasetHandle` which can later retrieve a dataset,
from a resolved `DatasetRef`.
Expand All @@ -1264,6 +1281,12 @@ def getDirectDeferred(
parameters : `dict`
Additional StorageClass-defined options to control reading,
typically used to efficiently read only a subset of the dataset.
storageClass : `StorageClass` or `str`, optional
The storage class to be used to override the Python type
returned by this method. By default the returned type matches
the dataset type definition for this dataset. Specifying a
read `StorageClass` can force a different type to be returned.
This type must be compatible with the original type.

Returns
-------
Expand All @@ -1279,7 +1302,7 @@ def getDirectDeferred(
raise AmbiguousDatasetError(
f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved."
)
return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass)

def getDeferred(
self,
Expand All @@ -1288,6 +1311,7 @@ def getDeferred(
*,
parameters: Union[dict, None] = None,
collections: Any = None,
storageClass: str | StorageClass | None = None,
**kwargs: Any,
) -> DeferredDatasetHandle:
"""Create a `DeferredDatasetHandle` which can later retrieve a dataset,
Expand All @@ -1309,6 +1333,12 @@ def getDeferred(
Collections to be searched, overriding ``self.collections``.
Can be any of the types supported by the ``collections`` argument
to butler construction.
storageClass : `StorageClass` or `str`, optional
The storage class to be used to override the Python type
returned by this method. By default the returned type matches
the dataset type definition for this dataset. Specifying a
read `StorageClass` can force a different type to be returned.
This type must be compatible with the original type.
**kwargs
Additional keyword arguments used to augment or construct a
`DataId`. See `DataId` parameters.
Expand All @@ -1330,7 +1360,7 @@ def getDeferred(
Raised if no collections were provided.
"""
ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass)

def get(
self,
Expand All @@ -1339,6 +1369,7 @@ def get(
*,
parameters: Optional[Dict[str, Any]] = None,
collections: Any = None,
storageClass: Optional[Union[StorageClass, str]] = None,
**kwargs: Any,
) -> Any:
"""Retrieve a stored dataset.
Expand All @@ -1359,6 +1390,12 @@ def get(
Collections to be searched, overriding ``self.collections``.
Can be any of the types supported by the ``collections`` argument
to butler construction.
storageClass : `StorageClass` or `str`, optional
The storage class to be used to override the Python type
returned by this method. By default the returned type matches
the dataset type definition for this dataset. Specifying a
read `StorageClass` can force a different type to be returned.
This type must be compatible with the original type.
**kwargs
Additional keyword arguments used to augment or construct a
`DataCoordinate`. See `DataCoordinate.standardize`
Expand Down Expand Up @@ -1391,7 +1428,7 @@ def get(
"""
log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
return self.getDirect(ref, parameters=parameters)
return self.getDirect(ref, parameters=parameters, storageClass=storageClass)

def getURIs(
self,
Expand Down
25 changes: 21 additions & 4 deletions python/lsst/daf/butler/_deferredDatasetHandle.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,19 +27,24 @@
__all__ = ("DeferredDatasetHandle",)

import dataclasses
from typing import TYPE_CHECKING, Any, Optional
from typing import TYPE_CHECKING, Any, Optional, Union

if TYPE_CHECKING:
from ._limited_butler import LimitedButler
from .core import DataCoordinate, DatasetRef
from .core import DataCoordinate, DatasetRef, StorageClass


@dataclasses.dataclass(frozen=True)
class DeferredDatasetHandle:
"""Proxy class that provides deferred loading of datasets from a butler."""

def get(
self, *, component: Optional[str] = None, parameters: Optional[dict] = None, **kwargs: dict
self,
*,
component: Optional[str] = None,
parameters: Optional[dict] = None,
storageClass: str | StorageClass | None = None,
**kwargs: dict,
) -> Any:
"""Retrieves the dataset pointed to by this handle.

Expand All @@ -56,6 +61,13 @@ def get(
It defaults to None. If the value is not None, this dict will
be merged with the parameters dict used to construct the
`DeferredDatasetHandle` class.
storageClass : `StorageClass` or `str`, optional
The storage class to be used to override the Python type
returned by this method. By default the returned type matches
the dataset type definition for this dataset or the storage
class specified when this object was created. Specifying a
read `StorageClass` can force a different type to be returned.
This type must be compatible with the original type.
**kwargs
This argument is deprecated and only exists to support legacy
gen2 butler code during migration. It is completely ignored
Expand All @@ -74,9 +86,11 @@ def get(
mergedParameters = parameters
else:
mergedParameters = {}
if storageClass is None:
storageClass = self.storageClass

ref = self.ref.makeComponentRef(component) if component is not None else self.ref
return self.butler.getDirect(ref, parameters=mergedParameters)
return self.butler.getDirect(ref, parameters=mergedParameters, storageClass=storageClass)

@property
def dataId(self) -> DataCoordinate:
Expand All @@ -99,3 +113,6 @@ def dataId(self) -> DataCoordinate:
"""Optional parameters that may be used to specify a subset of the dataset
to be loaded (`dict` or `None`).
"""

storageClass: Optional[Union[str, StorageClass]] = None
"""Optional storage class override that can be applied on ``get()``."""
39 changes: 34 additions & 5 deletions python/lsst/daf/butler/_limited_butler.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,14 @@
from typing import Any, ClassVar, Dict, Iterable, Optional, Union

from ._deferredDatasetHandle import DeferredDatasetHandle
from .core import AmbiguousDatasetError, DatasetRef, Datastore, DimensionUniverse, StorageClassFactory
from .core import (
AmbiguousDatasetError,
DatasetRef,
Datastore,
DimensionUniverse,
StorageClass,
StorageClassFactory,
)

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -84,7 +91,13 @@ def putDirect(self, obj: Any, ref: DatasetRef) -> DatasetRef:
"""
raise NotImplementedError()

def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any:
def getDirect(
self,
ref: DatasetRef,
*,
parameters: Optional[Dict[str, Any]] = None,
storageClass: str | StorageClass | None = None,
) -> Any:
"""Retrieve a stored dataset.

Unlike `Butler.get`, this method allows datasets outside the Butler's
Expand All @@ -98,6 +111,12 @@ def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = N
parameters : `dict`
Additional StorageClass-defined options to control reading,
typically used to efficiently read only a subset of the dataset.
storageClass : `StorageClass` or `str`, optional
The storage class to be used to override the Python type
returned by this method. By default the returned type matches
the dataset type definition for this dataset. Specifying a
read `StorageClass` can force a different type to be returned.
This type must be compatible with the original type.

Returns
-------
Expand All @@ -109,10 +128,14 @@ def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = N
AmbiguousDatasetError
Raised if ``ref.id is None``, i.e. the reference is unresolved.
"""
return self.datastore.get(ref, parameters=parameters)
return self.datastore.get(ref, parameters=parameters, storageClass=storageClass)

def getDirectDeferred(
self, ref: DatasetRef, *, parameters: Union[dict, None] = None
self,
ref: DatasetRef,
*,
parameters: Union[dict, None] = None,
storageClass: str | StorageClass | None = None,
) -> DeferredDatasetHandle:
"""Create a `DeferredDatasetHandle` which can later retrieve a dataset,
from a resolved `DatasetRef`.
Expand All @@ -124,6 +147,12 @@ def getDirectDeferred(
parameters : `dict`
Additional StorageClass-defined options to control reading,
typically used to efficiently read only a subset of the dataset.
storageClass : `StorageClass` or `str`, optional
The storage class to be used to override the Python type
returned by this method. By default the returned type matches
the dataset type definition for this dataset. Specifying a
read `StorageClass` can force a different type to be returned.
This type must be compatible with the original type.

Returns
-------
Expand All @@ -139,7 +168,7 @@ def getDirectDeferred(
raise AmbiguousDatasetError(
f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved."
)
return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass)

def datasetExistsDirect(self, ref: DatasetRef) -> bool:
"""Return `True` if a dataset is actually present in the Datastore.
Expand Down
19 changes: 15 additions & 4 deletions python/lsst/daf/butler/_quantum_backed.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
DimensionUniverse,
Quantum,
SerializedDatastoreRecordData,
StorageClass,
StorageClassFactory,
ddl,
)
Expand Down Expand Up @@ -235,10 +236,16 @@ def isWriteable(self) -> bool:
# Docstring inherited.
return True

def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any:
def getDirect(
self,
ref: DatasetRef,
*,
parameters: Optional[Dict[str, Any]] = None,
storageClass: str | StorageClass | None = None,
) -> Any:
# Docstring inherited.
try:
obj = super().getDirect(ref, parameters=parameters)
obj = super().getDirect(ref, parameters=parameters, storageClass=storageClass)
except (LookupError, FileNotFoundError, IOError):
self._unavailable_inputs.add(ref.getCheckedId())
raise
Expand All @@ -249,15 +256,19 @@ def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = N
return obj

def getDirectDeferred(
self, ref: DatasetRef, *, parameters: Union[dict, None] = None
self,
ref: DatasetRef,
*,
parameters: Union[dict, None] = None,
storageClass: str | StorageClass | None = None,
) -> DeferredDatasetHandle:
# Docstring inherited.
if ref.id in self._predicted_inputs:
# Unfortunately, we can't do this after the handle succeeds in
# loading, so it's conceivable here that we're marking an input
# as "actual" even when it's not even available.
self._actual_inputs.add(ref.id)
return super().getDirectDeferred(ref, parameters=parameters)
return super().getDirectDeferred(ref, parameters=parameters, storageClass=storageClass)

def datasetExistsDirect(self, ref: DatasetRef) -> bool:
# Docstring inherited.
Expand Down
2 changes: 2 additions & 0 deletions python/lsst/daf/butler/configs/storageClasses.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,8 @@ storageClasses:
- bbox
AstropyTable:
pytype: astropy.table.Table
converters:
lsst.afw.table.Catalog: lsst.afw.table.Catalog.asAstropy
AstropyQTable:
pytype: astropy.table.QTable
ExtendedPsf:
Expand Down
24 changes: 24 additions & 0 deletions python/lsst/daf/butler/core/datasets/ref.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@

if TYPE_CHECKING:
from ...registry import Registry
from ..storageClass import StorageClass


class AmbiguousDatasetError(Exception):
Expand Down Expand Up @@ -567,6 +568,29 @@ def makeComponentRef(self, name: str) -> DatasetRef:
conform=False,
)

def overrideStorageClass(self, storageClass: str | StorageClass) -> DatasetRef:
"""Create a new `DatasetRef` from this one, but with a modified
`DatasetType` that has a different `StorageClass`.

Parameters
----------
storageClass : `str` or `StorageClass`
The new storage class.

Returns
-------
modified : `DatasetRef`
A new dataset reference that is the same as the current one but
with a different storage class in the `DatasetType`.
"""
return DatasetRef(
datasetType=self.datasetType.overrideStorageClass(storageClass),
dataId=self.dataId,
id=self.id,
run=self.run,
conform=False,
)

datasetType: DatasetType
"""The definition of this dataset (`DatasetType`).

Expand Down