Skip to content

Commit

Permalink
Add remove method to Butler.
Browse files Browse the repository at this point in the history
  • Loading branch information
TallJimbo committed Feb 6, 2019
1 parent 0173158 commit a554741
Show file tree
Hide file tree
Showing 2 changed files with 123 additions and 36 deletions.
59 changes: 59 additions & 0 deletions python/lsst/daf/butler/butler.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import os
import contextlib
import logging
import itertools

from lsst.utils import doImport
from .core.utils import transactional
Expand Down Expand Up @@ -486,3 +487,61 @@ def datasetExists(self, datasetRefOrType, dataId=None, **kwds):
"{} with {} not found in collection {}".format(datasetType, dataId, self.collection)
)
return self.datastore.exists(ref)

def remove(self, datasetRefOrType, dataId=None, *, delete=True, remember=True, **kwds):
    """Remove a dataset from the collection and possibly the repository.

    The identified dataset is always at least removed from the Butler's
    collection.  By default it is also deleted from the Datastore (e.g.
    files are actually deleted), while its rows in the dataset and
    provenance tables of the registry are retained ("remembered").

    If the dataset is a composite, all of its components are removed
    as well.

    Parameters
    ----------
    datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
        When `DatasetRef` the `dataId` should be `None`.
        Otherwise the `DatasetType` or name thereof.
    dataId : `dict` or `DataId`
        A `dict` of `Dimension` link name, value pairs that label the
        `DatasetRef` within a Collection.  When `None`, a `DatasetRef`
        should be provided as the first argument.
    delete : `bool`
        If `True` (default) actually delete the dataset from the
        Datastore (i.e. actually remove files).
    remember : `bool`
        If `True` (default), retain dataset and provenance records in
        the `Registry` for this dataset.
    kwds
        Additional keyword arguments used to augment or construct a
        `DataId`.  See `DataId` parameters.

    Raises
    ------
    ValueError
        Raised if ``delete`` and ``remember`` are both `False`; a dataset
        cannot remain in a `Datastore` if all of its `Registry` entries
        are removed.
    OrphanedRecordError
        Raised if ``remember`` is `False` but the dataset is still present
        in a `Datastore` not recognized by this `Butler` client.
    """
    datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
    ref = self.registry.find(self.collection, datasetType, dataId, **kwds)
    # Guard against the one invalid flag combination up front: dropping
    # every Registry record while leaving the files in the Datastore
    # would orphan them.
    if not delete and not remember:
        raise ValueError("Cannot retain dataset in Datastore without keeping Registry dataset record.")
    if delete:
        # A composite may have been stored either as a single parent
        # dataset or as its individual components; we cannot tell which
        # from here, so attempt to remove every candidate and ignore the
        # ones that were never actually written.
        for candidate in itertools.chain([ref], ref.components.values()):
            with contextlib.suppress(FileNotFoundError):
                self.datastore.remove(candidate)
    if remember:
        # Keep the Registry records; just drop the dataset from this
        # Butler's collection.
        self.registry.disassociate(self.collection, [ref])
    else:
        # Removing the Registry dataset record also implicitly
        # disassociates it from the collection.
        self.registry.removeDataset(ref)
100 changes: 64 additions & 36 deletions tests/test_butler.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,16 +116,7 @@ def runPutGetTest(self, storageClass, datasetTypeName):
# Create and register a DatasetType
dimensions = ("Instrument", "Visit")

# We can not delete datasets so for now create two so we can do
# two puts.
self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

datasetTypeName2 = datasetTypeName + "2"
self.addDatasetType(datasetTypeName2, dimensions, storageClass, butler.registry)

# Add a third type to test putting with a DataSetType
datasetTypeName3 = datasetTypeName + "3"
self.addDatasetType(datasetTypeName3, dimensions, storageClass, butler.registry)
datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

# Add needed Dimensions
butler.registry.addDimensionEntry("Instrument", {"instrument": "DummyCamComp"})
Expand All @@ -139,38 +130,75 @@ def runPutGetTest(self, storageClass, datasetTypeName):
dataId = {"instrument": "DummyCamComp", "visit": 423}

# Create a DatasetRef for put
datasetType2 = butler.registry.getDatasetType(datasetTypeName2)
ref2 = DatasetRef(datasetType2, dataId, id=None)

datasetType3 = butler.registry.getDatasetType(datasetTypeName3)
refIn = DatasetRef(datasetType, dataId, id=None)

# Put with a preexisting id should fail
with self.assertRaises(ValueError):
butler.put(metric, DatasetRef(datasetType2, dataId, id=100))

# Put the dataset once as a DatasetRef, once as a dataId, and once with a DataSetType
for args in ((ref2,), (datasetTypeName, dataId), (datasetType3, dataId)):
ref = butler.put(metric, *args)
self.assertIsInstance(ref, DatasetRef)

# Test getDirect
metricOut = butler.getDirect(ref)
self.assertEqual(metric, metricOut)
# Test get
metricOut = butler.get(ref.datasetType.name, dataId)
self.assertEqual(metric, metricOut)
# Test get with a datasetRef
metricOut = butler.get(ref)
self.assertEqual(metric, metricOut)

# Check we can get components
if storageClass.isComposite():
self.assertGetComponents(butler, ref,
("summary", "data", "output"), metric)
butler.put(metric, DatasetRef(datasetType, dataId, id=100))

# Put and remove the dataset once as a DatasetRef, once as a dataId, and once with a DatasetType
for args in ((refIn,), (datasetTypeName, dataId), (datasetType, dataId)):
with self.subTest(args=args):
ref = butler.put(metric, *args)
self.assertIsInstance(ref, DatasetRef)

# Test getDirect
metricOut = butler.getDirect(ref)
self.assertEqual(metric, metricOut)
# Test get
metricOut = butler.get(ref.datasetType.name, dataId)
self.assertEqual(metric, metricOut)
# Test get with a datasetRef
metricOut = butler.get(ref)
self.assertEqual(metric, metricOut)

# Check we can get components
if storageClass.isComposite():
self.assertGetComponents(butler, ref,
("summary", "data", "output"), metric)

# Remove from collection only; after that we shouldn't be able
# to find it unless we use the dataset_id.
butler.remove(*args, delete=False)
with self.assertRaises(LookupError):
butler.datasetExists(*args)
# If we use the output ref with the dataset_id, we should still be
# able to load it with getDirect().
self.assertEqual(metric, butler.getDirect(ref))

# Reinsert into collection, then delete from Datastore *and*
# remove from collection.
butler.registry.associate(butler.collection, [ref])
butler.remove(*args)
# Lookup with original args should still fail.
with self.assertRaises(LookupError):
butler.datasetExists(*args)
# Now getDirect() should fail, too.
with self.assertRaises(FileNotFoundError):
butler.getDirect(ref)
# Registry still knows about it, if we use the dataset_id.
self.assertEqual(butler.registry.getDataset(ref.id), ref)

# Put again, then remove completely (this generates a new dataset
# record in registry, with a new ID - the old one still exists,
# but it is not in any collection so we don't care).
ref = butler.put(metric, *args)
butler.remove(*args, remember=False)
# Lookup with original args should still fail.
with self.assertRaises(LookupError):
butler.datasetExists(*args)
# getDirect() should still fail.
with self.assertRaises(FileNotFoundError):
butler.getDirect(ref)
# Registry shouldn't be able to find it by dataset_id anymore.
self.assertIsNone(butler.registry.getDataset(ref.id))

# Put the dataset again, since the last thing we did was remove it.
ref = butler.put(metric, refIn)

# Get with parameters
stop = 4
sliced = butler.get(ref2, parameters={"slice": slice(stop)})
sliced = butler.get(ref, parameters={"slice": slice(stop)})
self.assertNotEqual(metric, sliced)
self.assertEqual(metric.summary, sliced.summary)
self.assertEqual(metric.output, sliced.output)
Expand Down

0 comments on commit a554741

Please sign in to comment.