Skip to content

Commit

Permalink
Merge pull request #20 from lsst/tickets/DM-12627
Browse files Browse the repository at this point in the history
DM-12627: Implement Butler DatasetType
  • Loading branch information
Pim Schellart committed Mar 16, 2018
2 parents a10afbb + 2d5f7d3 commit e9143ed
Show file tree
Hide file tree
Showing 4 changed files with 184 additions and 24 deletions.
42 changes: 20 additions & 22 deletions python/lsst/daf/butler/core/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@

from types import MappingProxyType
from .utils import slotValuesAreEqual, slotValuesToHash
from .units import DataUnitSet

__all__ = ("DatasetType", "DatasetRef")

Expand All @@ -43,10 +42,20 @@ class DatasetType(object):
may be added.
`DatasetType` instances are immutable.
All arguments correspond directly to instance attributes.
Parameters
----------
name : `str`
A string name for the `Dataset`; must correspond to the same
`DatasetType` across all Registries.
dataUnits : `iterable` of `str`
`DataUnit` names that defines the `DatasetRef`\ s corresponding to
this `DatasetType`. The input iterable is copied into a `frozenset`.
storageClass : `str`
Name of a `StorageClass` that defines how this `DatasetType`
is persisted.
"""

__slots__ = ("_name", "_template", "_units", "_storageClass")
__slots__ = ("_name", "_dataUnits", "_storageClass")
__eq__ = slotValuesAreEqual
__hash__ = slotValuesToHash

Expand All @@ -58,33 +67,22 @@ def name(self):
return self._name

@property
def dataUnits(self):
    # Raw string: the docstring contains a backslash-escaped Sphinx
    # plural (`DatasetRef`\ s); a non-raw string makes "\ " an invalid
    # escape sequence (SyntaxWarning on modern Python).
    r"""A `frozenset` of `DataUnit` names that defines the `DatasetRef`\ s
    corresponding to this `DatasetType`.
    """
    return self._dataUnits

@property
def storageClass(self):
    """Name of a `StorageClass` that defines how this `DatasetType`
    is persisted.

    Note that this is the *name* of the storage class (a `str`), not a
    `StorageClass` instance; resolution happens at a higher level.
    """
    return self._storageClass

def __init__(self, name, dataUnits, storageClass):
    """Construct an immutable DatasetType.

    Parameters
    ----------
    name : `str`
        A string name for the `Dataset`; must correspond to the same
        `DatasetType` across all Registries.
    dataUnits : iterable of `str`
        `DataUnit` names defining the `DatasetRef`\\ s corresponding to
        this `DatasetType`.
    storageClass : `str`
        Name of a `StorageClass` that defines how this `DatasetType`
        is persisted.
    """
    self._name = name
    # Copy into a frozenset so the unit collection is immutable and
    # hashable — required because instances are used as dict/set keys
    # via the slot-based __eq__/__hash__.
    self._dataUnits = frozenset(dataUnits)
    self._storageClass = storageClass


Expand Down
48 changes: 46 additions & 2 deletions python/lsst/daf/butler/registries/sqlRegistry.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from sqlalchemy import create_engine
from sqlalchemy.sql import select

from ..core.datasets import DatasetType
from ..core.registry import RegistryConfig, Registry
from ..core.schema import Schema

Expand All @@ -39,13 +41,24 @@ class SqlRegistry(Registry):
config : `SqlRegistryConfig` or `str`
Load configuration
"""

def __init__(self, config):
    # Let the base Registry do its generic configuration handling first.
    super().__init__(config)

    # Re-wrap the raw config in the SQL-specific config class.
    self.config = SqlRegistryConfig(config)
    self._schema = Schema(self.config['schema'])
    self._engine = create_engine(self.config['db'])
    # Create all tables described by the schema (no-op for tables that
    # already exist in the target database).
    self._schema.metadata.create_all(self._engine)
    # In-memory cache of DatasetType instances, keyed by name; populated
    # by registerDatasetType.
    self._datasetTypes = {}

def _isValidDatasetType(self, datasetType):
    """Check if given `DatasetType` instance is valid for this `Registry`.

    Currently only verifies the type; per-schema validation is pending.

    .. todo::

        Insert checks for `storageClass` and `dataUnits`.
    """
    return isinstance(datasetType, DatasetType)

def registerDatasetType(self, datasetType):
    """Add a new `DatasetType` to the registry.

    Parameters
    ----------
    datasetType : `DatasetType`
        The `DatasetType` to be added.

    Raises
    ------
    ValueError
        If ``datasetType`` is not a valid `DatasetType` for this registry.
    KeyError
        If a `DatasetType` with this name was already registered.
    """
    # NOTE: the stray pre-implementation `raise NotImplementedError`
    # that preceded this body has been removed; it made everything
    # below unreachable.
    if not self._isValidDatasetType(datasetType):
        raise ValueError("DatasetType is not valid for this registry")
    # NOTE(review): duplicate detection only consults the in-memory
    # cache, not the database — a pre-existing row inserted by another
    # SqlRegistry instance surfaces as a DB integrity error instead.
    if datasetType.name in self._datasetTypes:
        raise KeyError("DatasetType: {} already registered".format(datasetType.name))
    datasetTypeTable = self._schema.metadata.tables['DatasetType']
    datasetTypeUnitsTable = self._schema.metadata.tables['DatasetTypeUnits']
    # Insert the type and its unit links atomically in one transaction.
    with self._engine.begin() as connection:
        connection.execute(datasetTypeTable.insert().values(
            dataset_type_name=datasetType.name,
            storage_class=datasetType.storageClass))
        if datasetType.dataUnits:
            connection.execute(datasetTypeUnitsTable.insert(),
                               [{'dataset_type_name': datasetType.name, 'unit_name': dataUnitName}
                                for dataUnitName in datasetType.dataUnits])
    # Cache only after a successful commit.
    self._datasetTypes[datasetType.name] = datasetType

def getDatasetType(self, name):
    """Get the `DatasetType` registered under the given name.

    Parameters
    ----------
    name : `str`
        Name of the `DatasetType`.

    Returns
    -------
    type : `DatasetType`
        The `DatasetType` associated with the given name.

    Raises
    ------
    KeyError
        If no `DatasetType` with this name is registered.
    """
    # Serve from the in-memory cache when possible.
    if name in self._datasetTypes:
        return self._datasetTypes[name]
    datasetTypeTable = self._schema.metadata.tables['DatasetType']
    datasetTypeUnitsTable = self._schema.metadata.tables['DatasetTypeUnits']
    with self._engine.begin() as connection:
        # Get StorageClass from DatasetType table.
        result = connection.execute(select([datasetTypeTable.c.storage_class]).where(
            datasetTypeTable.c.dataset_type_name == name)).fetchone()
        if result is None:
            # Previously this fell through and raised TypeError on
            # subscripting None; raise a meaningful KeyError instead.
            raise KeyError("DatasetType: {} not registered".format(name))
        storageClass = result['storage_class']
        # Get DataUnits (if any) from DatasetTypeUnits table.
        result = connection.execute(select([datasetTypeUnitsTable.c.unit_name]).where(
            datasetTypeUnitsTable.c.dataset_type_name == name)).fetchall()
        # fetchall() already materialized the rows, so the generator may
        # safely be consumed after the transaction closes.
        dataUnits = (r[0] for r in result) if result else ()
    datasetType = DatasetType(name=name,
                              storageClass=storageClass,
                              dataUnits=dataUnits)
    # Cache the reconstructed type, mirroring registerDatasetType.
    self._datasetTypes[name] = datasetType
    return datasetType

def addDataset(self, ref, uri, components, run, producer=None):
"""Add a `Dataset` to a Collection.
Expand Down
93 changes: 93 additions & 0 deletions tests/test_datasetType.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import unittest

import lsst.utils.tests

from lsst.daf.butler.core.datasets import DatasetType

"""Tests for DatasetType.
"""


class DatasetTypeTestCase(lsst.utils.tests.TestCase):
    """Test for DatasetType.
    """
    def testConstructor(self):
        """Test construction preserves values.

        Note that construction doesn't check for valid storageClass or
        dataUnits parameters.
        These can only be verified for a particular schema.
        """
        datasetTypeName = "test"
        storageClass = "StructuredData"
        dataUnits = frozenset(("camera", "visit"))
        datasetType = DatasetType(datasetTypeName, dataUnits, storageClass)
        self.assertEqual(datasetType.name, datasetTypeName)
        self.assertEqual(datasetType.storageClass, storageClass)
        self.assertEqual(datasetType.dataUnits, dataUnits)

    def testEquality(self):
        """Test `DatasetType.__eq__`.

        Arguments follow the constructor order
        (name, dataUnits, storageClass); the previous version passed
        storageClass in the dataUnits slot and only passed by accident.
        """
        self.assertEqual(DatasetType("a", ("UnitA", ), "StorageA"),
                         DatasetType("a", ("UnitA", ), "StorageA"))
        self.assertNotEqual(DatasetType("a", ("UnitA", ), "StorageA"),
                            DatasetType("b", ("UnitA", ), "StorageA"))
        self.assertNotEqual(DatasetType("a", ("UnitA", ), "StorageA"),
                            DatasetType("a", ("UnitB", ), "StorageA"))
        self.assertNotEqual(DatasetType("a", ("UnitA", ), "StorageA"),
                            DatasetType("a", ("UnitA", ), "StorageB"))

    def testHashability(self):
        """Test `DatasetType.__hash__`.

        This test is performed by checking that `DatasetType` entries can
        be inserted into a `set` and that unique values of its
        (`name`, `dataUnits`, `storageClass`) parameters result in separate
        entries (and equal ones don't).

        This does not check for uniformity of hashing or the actual values
        of the hash function.
        """
        types = []
        unique = 0
        for name in ["a", "b"]:
            for storageClass in ["c", "d"]:
                for dataUnits in [("e", ), ("f", )]:
                    # Constructor order is (name, dataUnits, storageClass).
                    datasetType = DatasetType(name, dataUnits, storageClass)
                    datasetTypeCopy = DatasetType(name, dataUnits, storageClass)
                    types.extend((datasetType, datasetTypeCopy))
                    unique += 1  # datasetType should always equal its copy
        self.assertEqual(len(set(types)), unique)  # all other combinations are unique


class MemoryTester(lsst.utils.tests.MemoryTestCase):
    # Standard LSST leak check; the base class supplies all behavior.
    pass


def setup_module(module):
    """Initialize the LSST test framework when run under pytest."""
    lsst.utils.tests.init()


if __name__ == "__main__":
lsst.utils.tests.init()
unittest.main()
25 changes: 25 additions & 0 deletions tests/test_sqlRegistry.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

import lsst.utils.tests

from lsst.daf.butler.core.datasets import DatasetType
from lsst.daf.butler.core.registry import Registry
from lsst.daf.butler.registries.sqlRegistry import SqlRegistry

Expand All @@ -42,6 +43,30 @@ def testInitFromConfig(self):
registry = Registry.fromConfig(self.configFile)
self.assertIsInstance(registry, SqlRegistry)

def testDatasetType(self):
    """Round-trip a DatasetType through the SqlRegistry."""
    registry = Registry.fromConfig(self.configFile)
    # Check valid insert
    datasetTypeName = "test"
    storageClass = "StructuredData"
    dataUnits = ("camera", "visit")
    inDatasetType = DatasetType(datasetTypeName, dataUnits, storageClass)
    registry.registerDatasetType(inDatasetType)
    outDatasetType = registry.getDatasetType(datasetTypeName)
    self.assertEqual(outDatasetType, inDatasetType)

    # Re-inserting should fail
    with self.assertRaises(KeyError):
        registry.registerDatasetType(inDatasetType)

    # Registering a second DatasetType under a different name should
    # succeed and round-trip as well.
    # NOTE(review): this comment previously said "Template can be None",
    # but DatasetType no longer has a template attribute.
    datasetTypeName = "testNoneTemplate"
    storageClass = "StructuredData"
    dataUnits = ("camera", "visit")
    inDatasetType = DatasetType(datasetTypeName, dataUnits, storageClass)
    registry.registerDatasetType(inDatasetType)
    outDatasetType = registry.getDatasetType(datasetTypeName)
    self.assertEqual(outDatasetType, inDatasetType)


class MemoryTester(lsst.utils.tests.MemoryTestCase):
    # Standard LSST leak check; the base class supplies all behavior.
    pass
Expand Down

0 comments on commit e9143ed

Please sign in to comment.