Skip to content

Commit

Permalink
Add remove method to Butler.
Browse files Browse the repository at this point in the history
  • Loading branch information
TallJimbo committed Feb 6, 2019
1 parent 0173158 commit a554741
Show file tree
Hide file tree
Showing 2 changed files with 123 additions and 36 deletions.
59 changes: 59 additions & 0 deletions python/lsst/daf/butler/butler.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import os
import contextlib
import logging
import itertools

from lsst.utils import doImport
from .core.utils import transactional
Expand Down Expand Up @@ -486,3 +487,61 @@ def datasetExists(self, datasetRefOrType, dataId=None, **kwds):
"{} with {} not found in collection {}".format(datasetType, dataId, self.collection)
)
return self.datastore.exists(ref)

def remove(self, datasetRefOrType, dataId=None, *, delete=True, remember=True, **kwds):
    """Remove a dataset from the collection and possibly the repository.

    The identified dataset is always at least removed from the Butler's
    collection.  By default it is also deleted from the Datastore (e.g.
    files are actually deleted), while its rows in the dataset and
    provenance tables of the registry are retained ("remembered").

    If the dataset is a composite, all of its components are removed
    as well.

    Parameters
    ----------
    datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
        When `DatasetRef` the `dataId` should be `None`.
        Otherwise the `DatasetType` or name thereof.
    dataId : `dict` or `DataId`
        A `dict` of `Dimension` link name, value pairs that label the
        `DatasetRef` within a Collection.  When `None`, a `DatasetRef`
        should be provided as the first argument.
    delete : `bool`
        If `True` (default) actually delete the dataset from the
        Datastore (i.e. actually remove files).
    remember : `bool`
        If `True` (default), retain dataset and provenance records in
        the `Registry` for this dataset.
    kwds
        Additional keyword arguments used to augment or construct a
        `DataId`.  See `DataId` parameters.

    Raises
    ------
    ValueError
        Raised if ``delete`` and ``remember`` are both `False`; a dataset
        cannot remain in a `Datastore` if all of its `Registry` entries
        are removed.
    OrphanedRecordError
        Raised if ``remember`` is `False` but the dataset is still present
        in a `Datastore` not recognized by this `Butler` client.
    """
    datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
    ref = self.registry.find(self.collection, datasetType, dataId, **kwds)
    # Guard against the one invalid flag combination up front: dropping
    # every Registry record while leaving the files in the Datastore
    # would orphan them.
    if not delete and not remember:
        raise ValueError("Cannot retain dataset in Datastore without keeping Registry dataset record.")
    if delete:
        # A composite may have been stored either as a single parent
        # dataset or as its individual components; we cannot tell which
        # from here, so attempt to remove every candidate and ignore the
        # ones that were never actually written.
        for candidate in itertools.chain([ref], ref.components.values()):
            with contextlib.suppress(FileNotFoundError):
                self.datastore.remove(candidate)
    if remember:
        # Keep the Registry records; just drop the dataset from this
        # Butler's collection.
        self.registry.disassociate(self.collection, [ref])
    else:
        # Removing the Registry dataset record also implicitly
        # disassociates it from the collection.
        self.registry.removeDataset(ref)
100 changes: 64 additions & 36 deletions tests/test_butler.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,16 +116,7 @@ def runPutGetTest(self, storageClass, datasetTypeName):
# Create and register a DatasetType
dimensions = ("Instrument", "Visit")

# We can not delete datasets so for now create two so we can do
# two puts.
self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

datasetTypeName2 = datasetTypeName + "2"
self.addDatasetType(datasetTypeName2, dimensions, storageClass, butler.registry)

# Add a third type to test putting with a DataSetType
datasetTypeName3 = datasetTypeName + "3"
self.addDatasetType(datasetTypeName3, dimensions, storageClass, butler.registry)
datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

# Add needed Dimensions
butler.registry.addDimensionEntry("Instrument", {"instrument": "DummyCamComp"})
Expand All @@ -139,38 +130,75 @@ def runPutGetTest(self, storageClass, datasetTypeName):
dataId = {"instrument": "DummyCamComp", "visit": 423}

# Create a DatasetRef for put
datasetType2 = butler.registry.getDatasetType(datasetTypeName2)
ref2 = DatasetRef(datasetType2, dataId, id=None)

datasetType3 = butler.registry.getDatasetType(datasetTypeName3)
refIn = DatasetRef(datasetType, dataId, id=None)

# Put with a preexisting id should fail
with self.assertRaises(ValueError):
butler.put(metric, DatasetRef(datasetType2, dataId, id=100))

# Put the dataset once as a DatasetRef, once as a dataId, and once with a DataSetType
for args in ((ref2,), (datasetTypeName, dataId), (datasetType3, dataId)):
ref = butler.put(metric, *args)
self.assertIsInstance(ref, DatasetRef)

# Test getDirect
metricOut = butler.getDirect(ref)
self.assertEqual(metric, metricOut)
# Test get
metricOut = butler.get(ref.datasetType.name, dataId)
self.assertEqual(metric, metricOut)
# Test get with a datasetRef
metricOut = butler.get(ref)
self.assertEqual(metric, metricOut)

# Check we can get components
if storageClass.isComposite():
self.assertGetComponents(butler, ref,
("summary", "data", "output"), metric)
butler.put(metric, DatasetRef(datasetType, dataId, id=100))

# Put and remove the dataset once as a DatasetRef, once as a dataId, and once with a DatasetType
for args in ((refIn,), (datasetTypeName, dataId), (datasetType, dataId)):
with self.subTest(args=args):
ref = butler.put(metric, *args)
self.assertIsInstance(ref, DatasetRef)

# Test getDirect
metricOut = butler.getDirect(ref)
self.assertEqual(metric, metricOut)
# Test get
metricOut = butler.get(ref.datasetType.name, dataId)
self.assertEqual(metric, metricOut)
# Test get with a datasetRef
metricOut = butler.get(ref)
self.assertEqual(metric, metricOut)

# Check we can get components
if storageClass.isComposite():
self.assertGetComponents(butler, ref,
("summary", "data", "output"), metric)

# Remove from collection only; after that we shouldn't be able
# to find it unless we use the dataset_id.
butler.remove(*args, delete=False)
with self.assertRaises(LookupError):
butler.datasetExists(*args)
# If we use the output ref with the dataset_id, we should still be
# able to load it with getDirect().
self.assertEqual(metric, butler.getDirect(ref))

# Reinsert into collection, then delete from Datastore *and*
# remove from collection.
butler.registry.associate(butler.collection, [ref])
butler.remove(*args)
# Lookup with original args should still fail.
with self.assertRaises(LookupError):
butler.datasetExists(*args)
# Now getDirect() should fail, too.
with self.assertRaises(FileNotFoundError):
butler.getDirect(ref)
# Registry still knows about it, if we use the dataset_id.
self.assertEqual(butler.registry.getDataset(ref.id), ref)

# Put again, then remove completely (this generates a new dataset
# record in registry, with a new ID - the old one still exists,
# but it is not in any collection so we don't care).
ref = butler.put(metric, *args)
butler.remove(*args, remember=False)
# Lookup with original args should still fail.
with self.assertRaises(LookupError):
butler.datasetExists(*args)
# getDirect() should still fail.
with self.assertRaises(FileNotFoundError):
butler.getDirect(ref)
# Registry shouldn't be able to find it by dataset_id anymore.
self.assertIsNone(butler.registry.getDataset(ref.id))

# Put the dataset again, since the last thing we did was remove it.
ref = butler.put(metric, refIn)

# Get with parameters
stop = 4
sliced = butler.get(ref2, parameters={"slice": slice(stop)})
sliced = butler.get(ref, parameters={"slice": slice(stop)})
self.assertNotEqual(metric, sliced)
self.assertEqual(metric.summary, sliced.summary)
self.assertEqual(metric.output, sliced.output)
Expand Down

0 comments on commit a554741

Please sign in to comment.