Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DM-12627: Implement Butler DatasetType #20

Merged
merged 4 commits into from
Mar 16, 2018
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 4 additions & 0 deletions config/registry/default_schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -517,6 +517,10 @@ tables:
name: storage_class
type: string
nullable: false
-
name: template
type: string
nullable: true

DatasetTypeUnits:
doc: >
Expand Down
35 changes: 17 additions & 18 deletions python/lsst/daf/butler/core/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@

from types import MappingProxyType
from .utils import slotValuesAreEqual, slotValuesToHash
from .units import DataUnitSet

__all__ = ("DatasetType", "DatasetRef")

Expand All @@ -46,7 +45,7 @@ class DatasetType(object):
All arguments correspond directly to instance attributes.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could really do with explicit Attributes and Parameters section.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought it was policy to document these in the property docstrings?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. I thought that in this case there was a difference between init parameters and attributes (on phone so not checking).

"""

__slots__ = ("_name", "_template", "_units", "_storageClass")
__slots__ = ("_name", "_dataUnits", "_storageClass", "_template")
__eq__ = slotValuesAreEqual
__hash__ = slotValuesToHash

Expand All @@ -57,6 +56,19 @@ def name(self):
"""
return self._name

@property
def dataUnits(self):
"""A `frozenset` of `DataUnit` names that defines the `DatasetRef`\ s
corresponding to this `DatasetType`.
"""
return self._dataUnits

@property
def storageClass(self):
"""A `StorageClass` that defines how this `DatasetType` is persisted.
"""
return self._storageClass

@property
def template(self):
"""A string with `str`.format-style replacement patterns that can be
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not related to the file name template used by datastore. Correct? (so this template is not a FileTemplate).

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is related to that template. But it is unclear (to me at the moment) if we still want it here. I'll remove it for now.

Expand All @@ -68,24 +80,11 @@ def template(self):
"""
return self._template

@property
def units(self):
"""A `DataUnitSet` that defines the `DatasetRef`\ s corresponding
to this `DatasetType`.
"""
return self._units

@property
def storageClass(self):
"""A `StorageClass` that defines how this `DatasetType` is persisted.
"""
return self._storageClass

def __init__(self, name, template, units, storageClass):
def __init__(self, name, dataUnits, storageClass, template=None):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the dataUnits argument here a frozenset and you copy it in the constructor?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It may be any iterable, but it becomes a frozenset in the constructor.

self._name = name
self._template = template
self._units = DataUnitSet(units)
self._dataUnits = frozenset(dataUnits)
self._storageClass = storageClass
self._template = template


class DatasetRef(object):
Expand Down
52 changes: 50 additions & 2 deletions python/lsst/daf/butler/registries/sqlRegistry.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from sqlalchemy import create_engine
from sqlalchemy.sql import select

from ..core.datasets import DatasetType
from ..core.registry import RegistryConfig, Registry
from ..core.schema import Schema

Expand All @@ -39,13 +41,24 @@ class SqlRegistry(Registry):
config : `SqlRegistryConfig` or `str`
Load configuration
"""

def __init__(self, config):
super().__init__(config)

self.config = SqlRegistryConfig(config)
self._schema = Schema(self.config['schema'])
self._engine = create_engine(self.config['db'])
self._schema.metadata.create_all(self._engine)
self._datasetTypes = {}

def _isValidDatasetType(self, datasetType):
"""Check if given `DatasetType` instance is valid for this `Registry`.

.. todo::

Insert checks for `storageClass`, `dataUnits` and `template`.
"""
return isinstance(datasetType, DatasetType)

def registerDatasetType(self, datasetType):
"""
Expand All @@ -56,7 +69,21 @@ def registerDatasetType(self, datasetType):
datasetType : `DatasetType`
The `DatasetType` to be added.
"""
raise NotImplementedError("Must be implemented by subclass")
if not self._isValidDatasetType(datasetType):
raise ValueError("DatasetType is not valid for this registry")
if datasetType.name in self._datasetTypes:
raise KeyError("DatasetType: {} already registered".format(datasetType.name))
datasetTypeTable = self._schema.metadata.tables['DatasetType']
datasetTypeUnitsTable = self._schema.metadata.tables['DatasetTypeUnits']
with self._engine.begin() as connection:
connection.execute(datasetTypeTable.insert().values(dataset_type_name=datasetType.name,
storage_class=datasetType.storageClass,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

StorageClass doesn't have a __str__ method so what gets inserted into the table? Does it magically use StorageClass.name? Is it the full class name? StorageClasses are defined by the datastore so we have to work out how to verify that a registry is not specifying storage class names that a datastore doesn't know about.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is a name of a StorageClass. Not an instance.

template=datasetType.template))
if datasetType.dataUnits:
connection.execute(datasetTypeUnitsTable.insert(),
[{'dataset_type_name': datasetType.name, 'unit_name': dataUnitName}
for dataUnitName in datasetType.dataUnits])
self._datasetTypes[datasetType.name] = datasetType

def getDatasetType(self, name):
"""Get the `DatasetType`.
Expand All @@ -71,7 +98,28 @@ def getDatasetType(self, name):
type : `DatasetType`
The `DatasetType` associated with the given name.
"""
raise NotImplementedError("Must be implemented by subclass")
datasetType = None
if name in self._datasetTypes:
datasetType = self._datasetTypes[name]
else:
datasetTypeTable = self._schema.metadata.tables['DatasetType']
datasetTypeUnitsTable = self._schema.metadata.tables['DatasetTypeUnits']
with self._engine.begin() as connection:
# Get StorageClass and template from DatasetType table
result = connection.execute(select([datasetTypeTable.c.storage_class,
datasetTypeTable.c.template]).where(
datasetTypeTable.c.dataset_type_name == name)).fetchone()
storageClass = result['storage_class']
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But DatasetType.storageClass is documented to be a StorageClass not a str.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll change the docstring. Should be a name (for now at least).

template = result['template']
# Get DataUnits (if any) from DatasetTypeUnits table
result = connection.execute(select([datasetTypeUnitsTable.c.unit_name]).where(
datasetTypeUnitsTable.c.dataset_type_name == name)).fetchall()
dataUnits = (r[0] for r in result) if result else ()
datasetType = DatasetType(name=name,
storageClass=storageClass,
dataUnits=dataUnits,
template=template)
return datasetType

def addDataset(self, ref, uri, components, run, producer=None):
"""Add a `Dataset` to a Collection.
Expand Down
93 changes: 93 additions & 0 deletions tests/test_datasetType.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import unittest

import lsst.utils.tests

from lsst.daf.butler.core.datasets import DatasetType

"""Tests for SqlRegistry.
"""


class DatasetTypeTestCase(lsst.utils.tests.TestCase):
"""Test for DatasetType.
"""
def setUp(self):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No need to define it if it doesn't do anything.

pass

def testConstructor(self):
"""Test construction preserves values.

Note that construction doesn't check for valid storageClass,
dataUnits or template parameters.
These can only be verified for a particular schema.
"""
datasetTypeName = "test"
storageClass = "StructuredData"
dataUnits = frozenset(("camera", "visit"))
template = "{datasetType}/{camera}/{visit}"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm worried that we have two distinct template systems now.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It should really be the same, but I'll remove it since this seems to be just a remnant of the old system.

datasetType = DatasetType(datasetTypeName, dataUnits, storageClass, template)
self.assertEqual(datasetType.name, datasetTypeName)
self.assertEqual(datasetType.storageClass, storageClass)
self.assertEqual(datasetType.dataUnits, dataUnits)
self.assertEqual(datasetType.template, template)

def testEquality(self):
self.assertEqual(DatasetType("a", "StorageA", ("UnitA", )),
DatasetType("a", "StorageA", ("UnitA", )))
self.assertEqual(DatasetType("a", "StorageA", ("UnitA", ), "{UnitA}"),
DatasetType("a", "StorageA", ("UnitA", ), "{UnitA}"))
self.assertNotEqual(DatasetType("a", "StorageA", ("UnitA", ), "{UnitA}"),
DatasetType("b", "StorageA", ("UnitA", ), "{UnitA}"))
self.assertNotEqual(DatasetType("a", "StorageA", ("UnitA", ), "{UnitA}"),
DatasetType("a", "StorageB", ("UnitA", ), "{UnitA}"))
self.assertNotEqual(DatasetType("a", "StorageA", ("UnitA", ), "{UnitA}"),
DatasetType("a", "StorageA", ("UnitB", ), "{UnitA}"))
self.assertNotEqual(DatasetType("a", "StorageA", ("UnitA", ), "{UnitA}"),
DatasetType("a", "StorageA", ("UnitA", ), "{UnitB}"))

def testHashability(self):
types = []
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this test method needs a docstring to explain what is actually being tested. There's a lot going on here before the final conversion of the list to a set.

unique = 0
for name in ["a", "b"]:
for storageClass in ["c", "d"]:
for dataUnits in [("e", ), ("f", )]:
for template in ["g", "h"]:
datasetType = DatasetType(name, storageClass, dataUnits, template)
datasetTypeCopy = DatasetType(name, storageClass, dataUnits, template)
types.extend((datasetType, datasetTypeCopy))
unique += 1
self.assertEqual(len(set(types)), unique)


class MemoryTester(lsst.utils.tests.MemoryTestCase):
pass


def setup_module(module):
lsst.utils.tests.init()


if __name__ == "__main__":
lsst.utils.tests.init()
unittest.main()
27 changes: 27 additions & 0 deletions tests/test_sqlRegistry.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

import lsst.utils.tests

from lsst.daf.butler.core.datasets import DatasetType
from lsst.daf.butler.core.registry import Registry
from lsst.daf.butler.registries.sqlRegistry import SqlRegistry

Expand All @@ -42,6 +43,32 @@ def testInitFromConfig(self):
registry = Registry.fromConfig(self.configFile)
self.assertIsInstance(registry, SqlRegistry)

def testDatasetType(self):
registry = Registry.fromConfig(self.configFile)
# Check valid insert
datasetTypeName = "test"
storageClass = "StructuredData"
dataUnits = ("camera", "visit")
template = "{datasetType}/{camera}/{visit}"
inDatasetType = DatasetType(datasetTypeName, dataUnits, storageClass)
registry.registerDatasetType(inDatasetType)
outDatasetType = registry.getDatasetType(datasetTypeName)
self.assertEqual(outDatasetType, inDatasetType)

# Re-inserting should fail
with self.assertRaises(KeyError):
registry.registerDatasetType(inDatasetType)

# Template can be None
datasetTypeName = "testNoneTemplate"
storageClass = "StructuredData"
dataUnits = ("camera", "visit")
template = None
inDatasetType = DatasetType(datasetTypeName, dataUnits, storageClass, template)
registry.registerDatasetType(inDatasetType)
outDatasetType = registry.getDatasetType(datasetTypeName)
self.assertEqual(outDatasetType, inDatasetType)


class MemoryTester(lsst.utils.tests.MemoryTestCase):
pass
Expand Down