Skip to content

Commit

Permalink
Add ability to defer loading datasets
Browse files Browse the repository at this point in the history
Add the ability for the butler to return a handle to a dataset
such that actual loading of the dataset can occur at a later time.
  • Loading branch information
natelust authored and TallJimbo committed Aug 29, 2019
1 parent ecf06f4 commit 03b8715
Show file tree
Hide file tree
Showing 3 changed files with 145 additions and 0 deletions.
30 changes: 30 additions & 0 deletions python/lsst/daf/butler/butler.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import contextlib
import logging
import itertools
import typing

try:
import boto3
Expand All @@ -38,6 +39,7 @@
from lsst.utils import doImport
from .core.utils import transactional
from .core.datasets import DatasetRef, DatasetType
from .core import deferredDatasetHandle as dDH
from .core.datastore import Datastore
from .core.registry import Registry
from .core.run import Run
Expand Down Expand Up @@ -446,6 +448,34 @@ def getDirect(self, ref, parameters=None):
# single entity in datastore
raise FileNotFoundError(f"Unable to locate dataset '{ref}' in datastore {self.datastore.name}")

def getDeferred(self, datasetRefOrType: typing.Union[DatasetRef, DatasetType, str],
dataId: typing.Union[dict, DataId] = None, parameters: typing.Union[dict, None] = None,
**kwds) -> dDH.DeferredDatasetHandle:
"""Create a `DeferredDatasetHandle` which can later retrieve a dataset
Parameters
----------
datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
When `DatasetRef` the `dataId` should be `None`.
Otherwise the `DatasetType` or name thereof.
dataId : `dict` or `DataId`
A `dict` of `Dimension` link name, value pairs that label the
`DatasetRef` within a Collection. When `None`, a `DatasetRef`
should be provided as the first argument.
parameters : `dict`
Additional StorageClass-defined options to control reading,
typically used to efficiently read only a subset of the dataset.
kwds
Additional keyword arguments used to augment or construct a
`DataId`. See `DataId` parameters.
Returns
-------
obj : `DeferredDatasetHandle`
A handle which can be used to retrieve a dataset at a later time
"""
return dDH.DeferredDatasetHandle(self, datasetRefOrType, dataId, parameters, kwds)

This comment has been minimized.

Copy link
@timj

timj Aug 29, 2019

Member

Is it worth having an option for doing a test for existence so that you can fail early if there is no dataset to retrieve?

This comment has been minimized.

Copy link
@TallJimbo

TallJimbo Aug 29, 2019

Member

I was also wondering about doing that and a bit more - doing the Registry-side lookup for the dataset_id up front as well. I'm a little worried about changing that now (I'm hoping to merge this ticket today) as I'm not completely certain we don't rely on that exception being deferred later (i.e. by catching it later), especially because that's what the behavior of the analogous object would be in Gen2, and I know these deferred handles are being used in places that are trying to juggle Gen2 and Gen3 inside the same block of algorithmic code. I think it's best to revisit this later.


def get(self, datasetRefOrType, dataId=None, parameters=None, **kwds):
"""Retrieve a stored dataset.
Expand Down
1 change: 1 addition & 0 deletions python/lsst/daf/butler/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,4 @@
from .dimensions import *
from .databaseDict import *
from .dataIdPacker import *
from .deferredDatasetHandle import *
114 changes: 114 additions & 0 deletions python/lsst/daf/butler/core/deferredDatasetHandle.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""
Module containing classes used with deferring dataset loading
"""

__all__ = ("DeferredDatasetHandle",)

import dataclasses
import typing
import types

from .datasets import DatasetRef, DatasetType
from .dimensions import DataId

if typing.TYPE_CHECKING:
from ..butler import Butler


@dataclasses.dataclass(frozen=True)
class DeferredDatasetHandle:
"""This is a class to support deferred loading of a dataset from a butler.
Parameters
----------
butler : `Butler`
The butler that will be used to fetch the deferred dataset
datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
When `DatasetRef` the `dataId` should be `None`.
Otherwise the `DatasetType` or name thereof.
dataId : `dict` or `DataId`
A `dict` of `Dimension` link name, value pairs that label the
`DatasetRef` within a Collection. When `None`, a `DatasetRef`
should be provided as the first argument.
parameters : `dict`
Additional StorageClass-defined options to control reading,
typically used to efficiently read only a subset of the dataset.
kwds : `dict`
Additional keyword arguments used to augment or construct a
`DataId`. See `DataId` construction parameters.
"""

datasetRefOrType: typing.Union[DatasetRef, DatasetType, str]
dataId: typing.Union[dict, DataId]
parameters: typing.Union[dict, None]
kwds: dict

def __init__(self, butler: 'Butler', datasetRefOrType: typing.Union[DatasetRef, DatasetType, str],
dataId: typing.Union[dict, DataId], parameters: typing.Union[dict, None], kwds: dict):
object.__setattr__(self, 'datasetRefOrType', datasetRefOrType)
object.__setattr__(self, 'dataId', dataId)
object.__setattr__(self, 'parameters', parameters)
object.__setattr__(self, 'kwds', kwds)

# Closure over butler to discourage accessing a raw butler through a
# deferred handle
def _get(self, parameters: typing.Union[None, dict]) -> typing.Any:
return butler.get(self.datasetRefOrType, self.dataId, parameters, **self.kwds)

object.__setattr__(self, '_get', types.MethodType(_get, self))

def get(self, parameters: typing.Union[None, dict] = None, **kwargs: dict) -> typing.Any:
""" Retrieves the dataset pointed to by this handle
This handle may be used multiple times, possibly with different
parameters.
Parameters
----------
parameters : `dict` or None
The parameters argument will be passed to the butler get method.
It defaults to None. If the value is not None, this dict will
be merged with the parameters dict used to construct the
`DeferredDatasetHandle` class.
kwargs : `dict`
This argument is deprecated and only exists to support legacy
gen2 butler code during migration. It is completely ignored
and will be removed in the future.
Returns
-------
return : `Object`
The dataset pointed to by this handle
"""
if self.parameters is not None:
mergedParameters = self.parameters.copy()
if parameters is not None:
mergedParameters.update(parameters)
elif parameters is not None:
mergedParameters = parameters
else:
mergedParameters = {}

return self._get(mergedParameters)

0 comments on commit 03b8715

Please sign in to comment.