Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

DM-13840: Prepare Butler for composite work to begin #21

Merged
merged 26 commits into from
Mar 20, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1,341 changes: 1,046 additions & 295 deletions config/registry/default_schema.yaml

Large diffs are not rendered by default.

116 changes: 39 additions & 77 deletions python/lsst/daf/butler/butler.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,104 +69,66 @@ def __init__(self, config):
if self.run is None:
self.run = self.registry.makeRun(self.config['run'])

def getDirect(self, ref, parameters=None):
"""Load a `Dataset` or a slice thereof from a `DatasetRef`.

Unlike `Butler.get`, this method allows `Datasets` outside the Butler's `Collection` to be read as
long as the `DatasetRef` that identifies them can be obtained separately.
def put(self, obj, datasetType, dataId, producer=None):
"""Store and register a dataset.

Parameters
----------
ref : `DatasetRef`
A pointer to the `Dataset` to load.
parameters : `dict`
`StorageClass`-specific parameters that can be used to obtain a slice of the `Dataset`.
obj : `object`
The dataset.
datasetType : `DatasetType` instance or `str`
The `DatasetType`.
dataId : `dict`
An identifier with `DataUnit` names and values.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you referring to the class here because the keys are names that convert to instances of DataUnit classes?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No. They are names of the tables. (e.g. {"camera" : "HSC", "visit" : 3}).

producer : `Quantum`, optional
The producer.

Returns
-------
inMemoryDataset : `InMemoryDataset`
The requested `Dataset`.
ref : `DatasetRef`
A reference to the stored dataset.
"""
parent = self.datastore.get(ref.uri, ref.datasetType.storageClass, parameters) if ref.uri else None
children = {name: self.datastore.get(childRef, parameters)
for name, childRef in ref.components.items()}
return ref.datasetType.storageClass.assemble(parent, children)
datasetType = self.registry.getDatasetType(datasetType)
ref = self.registry.addDataset(datasetType, dataId, run=self.run, producer=producer)
# self.datastore.put(obj, ref)
return ref

def get(self, ref, parameters=None):
"""Load a `Dataset` or a slice thereof from the Butler's `Collection`.
def getDirect(self, ref):
"""Retrieve a stored dataset.

Unlike `Butler.get`, this method allows datasets outside the Butler's collection to be read as
long as the `DatasetRef` that identifies them can be obtained separately.

Parameters
----------
ref : `DatasetRef`
The `Dataset` to retrieve.
parameters : `dict`
A dictionary of `StorageClass`-specific parameters that can be
used to obtain a slice of the `Dataset`.
Reference to an already stored dataset.

Returns
-------
dataset : `InMemoryDataset`
The requested `Dataset`.
obj : `object`
The dataset.
"""
ref = self.registry.find(self.run.collection, ref)
if ref:
return self.getDirect(ref, parameters)
else:
return None # No Dataset found
# Currently a direct pass-through to `Datastore.get` but this should
# change for composites.
return self.datastore.get(ref)

def put(self, ref, inMemoryDataset, producer=None):
"""Write a `Dataset`.
def get(self, datasetType, dataId):
"""Retrieve a stored dataset.

Parameters
----------
ref : `DatasetRef`
The `Dataset` being stored.
inMemoryDataset : `InMemoryDataset`
The `Dataset` to store.
producer : `Quantum`
The producer of this `Dataset`. May be ``None`` for some
`Registry` instances.
``producer.run`` must match ``self.config['run']``.
datasetType : `DatasetType` instance or `str`
The `DatasetType`.
dataId : `dict`
A `dict` of `DataUnit` name, value pairs that label the `DatasetRef`
within a Collection.

Returns
-------
datasetRef : `DatasetRef`
The registered (and stored) dataset.
"""
ref = self.registry.expand(ref)
run = self.run
assert(producer is None or run == producer.run)
storageHint = ref.makeStorageHint(run)
uri, components = self.datastore.put(inMemoryDataset, ref.datasetType.storageClass,
storageHint, ref.datasetType.name)
return self.registry.addDataset(ref, uri, components, producer=producer, run=run)

def markInputUsed(self, quantum, ref):
"""Mark a `Dataset` as having been "actually" (not just
predicted-to-be) used by a `Quantum`.

Parameters
----------
quantum : `Quantum`
The dependent `Quantum`.
ref : `DatasetRef`
The `Dataset` that is a true dependency of ``quantum``.
"""
ref = self.registry.find(self.run.collection, ref)
self.registry.markInputUsed(ref, quantum)

def unlink(self, *refs):
"""Remove dataset from collection.

Remove the `Dataset`\ s associated with the given `DatasetRef`\ s
from the `Butler`\ 's collection, and signal that they may be deleted
from storage if they are not referenced by any other collection.

Parameters
----------
refs : `list` of `DatasetRef`
List of refs for `Dataset`\ s to unlink.
obj : `object`
The dataset.
"""
refs = [self.registry.find(self.run.collection, ref) for ref in refs]
for ref in self.registry.disassociate(self.run.collection, refs, remove=True):
self.datastore.remove(ref.uri)
datasetType = self.registry.getDatasetType(datasetType)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

getDatasetType only takes a str but this method is documented to provide either a str or a DatasetType.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It should probably be pass-through (or perhaps fill in details) for DatasetType instances.

ref = self.registry.find(datasetType, dataId)
return self.getDirect(ref)
77 changes: 41 additions & 36 deletions python/lsst/daf/butler/core/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,51 +96,50 @@ class DatasetRef(object):
----------
datasetType : `DatasetType`
The `DatasetType` for this `Dataset`.
units : `dict`
dataId : `dict`
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This documentation for a dataId is more explicit than those elsewhere.

Dictionary where the keys are `DataUnit` names and the values are
`DataUnit` instances.
`DataUnit` values.
id : `int`, optional
A unique identifier.
Normally set to `None` and assigned by `Registry`
"""

__slots__ = ("_type", "_producer", "_predictedConsumers", "_actualConsumers")
_currentId = -1
__slots__ = ("_id", "_datasetType", "_dataId", "_producer",
"_predictedConsumers", "_actualConsumers", "_components",
"_assembler")

@classmethod
def getNewId(cls):
    """Generate and return a new Dataset ID number.

    Increments a class-wide counter, so IDs are unique only within this
    process.  NOTE(review): plain class-attribute increment — not safe
    under concurrent use; confirm single-threaded callers.

    .. todo::
        This is a temporary workaround that will probably disappear in
        the future, when a solution is found to the problem of
        autoincrement compound primary keys in SQLite.

    Returns
    -------
    id : `int`
        The next unused integer ID.
    """
    cls._currentId += 1
    return cls._currentId

def __init__(self, datasetType, units):
units = datasetType.units.conform(units)
super().__init__(
datasetType.name,
**{unit.__class__.__name__: unit.value for unit in units}
)
def __init__(self, datasetType, dataId, id=None):
assert isinstance(datasetType, DatasetType)
self._id = id
self._datasetType = datasetType
self._units = units
self._dataId = dataId
self._producer = None
self._predictedConsumers = dict()
self._actualConsumers = dict()
self._components = dict()
self._assembler = None

@property
def id(self):
    """`int` primary key of the dataset, or `None` until one is assigned.

    Normally filled in by `Registry`; treat as read-only.
    """
    return self._id

@property
def datasetType(self):
"""The `DatasetType` associated with the `Dataset` the `DatasetRef`
points to.
"""
return self._type
return self._datasetType

@property
def units(self):
"""A `tuple` of `DataUnit` instances that label the `DatasetRef`
def dataId(self):
"""A `dict` of `DataUnit` name, value pairs that label the `DatasetRef`
within a Collection.
"""
return self._units
return self._dataId

@property
def producer(self):
Expand Down Expand Up @@ -173,14 +172,20 @@ def actualConsumers(self):
"""
return _safeMakeMappingProxyType(self._actualConsumers)

def makeStorageHint(self, run, template=None):
"""Construct a storage hint by filling in template with the Collection
collection and the values in the units tuple.
@property
def components(self):
    """Read-only mapping of component name to component `DatasetRef`.

    Updates go through `Registry.attachComponent()`; the returned proxy
    cannot be mutated directly.
    """
    return _safeMakeMappingProxyType(self._components)

@property
def assembler(self):
"""Fully-qualified name of an importable Assembler object that can be
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remember that assemblers are classes with assemble and disassemble methods. You have to store the assembler class name, then to assemble you create an instance and run the assemble method.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not according to https://confluence.lsstcorp.org/display/DM/Gen3+Butler+Composites+Design
But I'm perfectly happy for them to be so. This is part of the composite work to be done on a different ticket.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When you asked me to combine free functions into classes the code got significantly cleaner. I'll be surprised if we gain by pulling everything apart again. I was really happy with the way assembler/disassembler turned out.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I must admit that I hadn't taken that confluence page as gospel. I thought it was guiding principles so I haven't gone into edit it with my thoughts.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The code on the confluence page is absolutely intended as just pseudocode. You're both very much encouraged to actively rethink all of it (the code parts, that is; I hope the conceptual stuff will actually stick this time around).

used to construct this Dataset from its components.

Although a `Dataset` may belong to multiple Collections, only the one
corresponding to its `Run` is used.
`None` for datasets that are not virtual composites.
Read-only; update via `Registry.setAssembler()`.
"""
if template is None:
template = self.datasetType.template
units = {unit.__class__.__name__: unit.value for unit in self.units}
return template.format(DatasetType=self.datasetType.name, Run=run.collection, **units)
return self._assembler
46 changes: 16 additions & 30 deletions python/lsst/daf/butler/core/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,46 +29,32 @@ class Run(object):

Parameters
----------
runId : `int`
ID to associate with this run.
registryId : `int`
ID associated with this `Registry`.
execution : `int`
A unique identifier for this Run, which is also the associated
Execution record.
collection : `str`
Collection to use for this run.
environment : `str`
Something about the environment.
pipeline : `str`
Something about the pipeline.
A Collection name with which all Datasets in this Run are initially
associated, also used as a human-readable name for this Run.
environment : `DatasetRef`
A reference to a dataset that contains a description of
the software environment (e.g. versions) used for this Run.
pipeline : `DatasetRef`
A reference to a dataset that contains a serialization of
the SuperTask Pipeline used for this Run (if any).
"""
_currentId = 0

@classmethod
def getNewId(cls):
    """Return the next unused Run ID from a class-wide counter.

    IDs are unique only within this process.  NOTE(review): plain
    class-attribute increment — not safe under concurrent use.
    """
    cls._currentId += 1
    return cls._currentId

__slots__ = ("_runId", "_registryId", "_collection", "_environment", "_pipeline")
__slots__ = ("_execution", "_collection", "_environment", "_pipeline")
__eq__ = slotValuesAreEqual
__hash__ = slotValuesToHash

def __init__(self, runId, registryId, collection, environment, pipeline):
self._runId = runId
self._registryId = registryId
def __init__(self, execution, collection, environment, pipeline):
self._execution = execution
self._collection = collection
self._environment = environment
self._pipeline = pipeline

@property
def pkey(self):
    # Composite primary key for this Run: (run ID, registry ID).
    # NOTE(review): mixes direct attribute access (self._runId) with the
    # property (self.registryId); behavior is identical, but one style
    # should be picked for consistency.
    return (self._runId, self.registryId)

@property
def runId(self):
    """`int` ID of this Run (read-only)."""
    return self._runId

@property
def registryId(self):
return self._registryId
def execution(self):
return self._execution

@property
def collection(self):
Expand Down