Skip to content

Commit

Permalink
Use templates to work out file names from data units
Browse files Browse the repository at this point in the history
Templates are specified in the YAML configuration file and indexed
by dataset type.
  • Loading branch information
timj committed Feb 23, 2018
1 parent d32b45a commit 497b9cf
Show file tree
Hide file tree
Showing 7 changed files with 327 additions and 27 deletions.
39 changes: 39 additions & 0 deletions python/lsst/daf/butler/core/dataUnits.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Code relating to DataUnits."""


class DataUnits:
    """Container for a DataUnits specification.

    Parameters
    ----------
    units : `dict`
        Mapping from data unit key to its value; a value may be `None`
        to indicate the unit is present but undefined.
    """

    def __init__(self, units):
        # Copy so later mutation of the caller's dict cannot affect us.
        self.units = dict(units)

    def definedUnits(self):
        """Return only the data units that have a value.

        Returns
        -------
        defined : `dict`
            Subset of the stored units whose values are not `None`.
        """
        defined = {}
        for key, value in self.units.items():
            if value is not None:
                defined[key] = value
        return defined
137 changes: 137 additions & 0 deletions python/lsst/daf/butler/core/fileTemplates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Support for file template string expansion."""

import os.path
import string


class FileTemplate:
    """Format a path template into a fully expanded path.

    Parameters
    ----------
    template : `str`
        Template string.

    Notes
    -----
    The templates use the standard Format Specification Mini-Language
    with the caveat that only named fields can be used. The field names
    are taken from the DataUnits along with two additional fields:
    "datasettype" will be replaced with the DataSetType and "component"
    will be replaced with the component name of a composite.

    The mini-language is extended to understand a "?" in the format
    specification. This indicates that a field is optional. If that
    DataUnit is missing, the field, along with the literal text before
    it (unless that text is a path separator), will be removed from the
    output path.
    """

    def __init__(self, template):
        self.template = template

    def format(self, dataunits, datasettype=None, component=None):
        """Format a template string into a full path.

        Parameters
        ----------
        dataunits : `DataUnits`
            DataUnits and the corresponding values.
        datasettype : `str`, optional
            DataSetType name to use if needed. If it contains a "."
            separator the type name will be split up into the main
            DataSetType and a component.
        component : `str`, optional
            Component of a composite. If `datasettype` defines a component
            this parameter will be ignored.

        Returns
        -------
        path : `str`
            Expanded path.

        Raises
        ------
        KeyError
            Requested field is not defined and the field is not optional.
            Or, `component` is specified but "component" was not part of
            the template.
        """
        units = dataunits.definedUnits()

        if datasettype is not None:
            # calexp.wcs means wcs component of a calexp
            if "." in datasettype:
                datasettype, component = datasettype.split(".", maxsplit=1)
            units["datasettype"] = datasettype

        usedComponent = False
        if component is not None:
            units["component"] = component

        fmt = string.Formatter()
        parts = fmt.parse(self.template)
        output = ""

        for literal, field_name, format_spec, conversion in parts:

            # parse() yields a final tuple with field_name (and
            # format_spec) of None when the template ends in literal
            # text; emit that literal verbatim rather than treating it
            # as a (missing) field, which would raise TypeError/KeyError.
            if field_name is None:
                output = output + literal
                continue

            if field_name == "component":
                usedComponent = True

            # "?" is our non-standard extension marking the field optional.
            if "?" in format_spec:
                optional = True
                # Remove the non-standard character from the spec
                format_spec = format_spec.replace("?", "")
            else:
                optional = False

            if field_name in units:
                value = units[field_name]
            elif optional:
                # If this is optional ignore the format spec
                # and do not include the literal text prior to the optional
                # field unless it contains a "/" path separator
                format_spec = ""
                value = ""
                if "/" not in literal:
                    literal = ""
            else:
                raise KeyError("{} requested in template but not defined and not optional".format(field_name))

            # Now use standard formatting
            output = output + literal + format(value, format_spec)

        # Complain if we were meant to use a component
        if component is not None and not usedComponent:
            raise KeyError("Component {} specified but template {} did not use it".format(component,
                                                                                          self.template))

        # Since this is known to be a path, normalize it in case some double
        # slashes have crept in
        path = os.path.normpath(output)

        # It should not be an absolute path (may happen with optionals)
        if os.path.isabs(path):
            path = os.path.relpath(path, start="/")

        return path
47 changes: 37 additions & 10 deletions python/lsst/daf/butler/datastores/posixDatastore.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from lsst.daf.butler.core.fileDescriptor import FileDescriptor
from lsst.daf.butler.core.formatter import FormatterFactory
from lsst.daf.butler.core.storageClass import StorageClassFactory, makeNewStorageClass
from lsst.daf.butler.core.fileTemplates import FileTemplate


class PosixDatastore(Datastore):
Expand Down Expand Up @@ -81,6 +82,12 @@ def __init__(self, config):
if "formatter" in info:
self.formatterFactory.registerFormatter(name, info["formatter"])

# Read the file naming templates and store them in the datastore
# indexed by datatype name
self.templates = {}
for name, info in self.config["templates"].items():
self.templates[name] = FileTemplate(info)

def get(self, uri, storageClass, parameters=None):
"""Load an `InMemoryDataset` from the store.
Expand Down Expand Up @@ -132,7 +139,7 @@ def get(self, uri, storageClass, parameters=None):

return result

def put(self, inMemoryDataset, storageClass, storageHint, typeName=None):
def put(self, inMemoryDataset, storageClass, dataUnits, typeName=None):
"""Write a `InMemoryDataset` with a given `StorageClass` to the store.
Parameters
Expand All @@ -141,8 +148,8 @@ def put(self, inMemoryDataset, storageClass, storageHint, typeName=None):
The `Dataset` to store.
storageClass : `StorageClass`
The `StorageClass` associated with the `DatasetType`.
storageHint : `str`
Provides a hint that the `Datastore` may use as (part of) the URI.
dataUnits : `DataUnits`
DataUnits to use when constructing the filename.
typeName : `str`
The `DatasetType` name, which may be used by this `Datastore` to
override the default serialization format for the `StorageClass`.
Expand All @@ -155,7 +162,6 @@ def put(self, inMemoryDataset, storageClass, storageHint, typeName=None):
A dictionary of URIs for the `Dataset`' components.
The latter will be empty if the `Dataset` is not a composite.
"""
location = self.locationFactory.fromPath(storageHint)

# Check to see if this storage class has a disassembler
# and also has components
Expand All @@ -168,9 +174,31 @@ def put(self, inMemoryDataset, storageClass, storageHint, typeName=None):
if compTypeName is not None:
compTypeName = "{}.{}".format(compTypeName, comp)
compUris[comp], _ = self.put(info.component, info.storageClass,
location.componentUri(comp), compTypeName)
dataUnits, compTypeName)
return None, compUris

# Get a location from the templates
template = None
component = None
if typeName is not None:
if typeName in self.templates:
template = self.templates[typeName]
elif "." in typeName:
baseType, component = typeName.split(".", maxsplit=1)
if baseType in self.templates:
template = self.templates[baseType]

if template is None:
if "default" in self.templates:
template = self.templates["default"]

# if still not template give up for now.
if template is None:
raise TypeError("Unable to determine file template from supplied type [{}]".format(typeName))

location = self.locationFactory.fromPath(template.format(dataUnits,
datasettype=typeName))

# Write a single component
formatter = self.formatterFactory.getFormatter(storageClass, typeName)

Expand Down Expand Up @@ -198,7 +226,7 @@ def remove(self, uri):
raise FileNotFoundError("No such file: {0}".format(location.uri))
os.remove(location.preferredPath())

def transfer(self, inputDatastore, inputUri, storageClass, storageHint, typeName=None):
def transfer(self, inputDatastore, inputUri, storageClass, dataUnits, typeName=None):
"""Retrieve a `Dataset` with a given `URI` from an input `Datastore`,
and store the result in this `Datastore`.
Expand All @@ -210,9 +238,8 @@ def transfer(self, inputDatastore, inputUri, storageClass, storageHint, typeName
The `URI` of the `Dataset` in the input `Datastore`.
storageClass : `StorageClass`
The `StorageClass` associated with the `DatasetType`.
storageHint : `str`
Provides a hint that this `Datastore` may use as [part of] the
`URI`.
dataUnits : `DataUnits`
DataUnits to use when constructing the filename.
typeName : `str`
The `DatasetType` name, which may be used by this `Datastore`
to override the default serialization format for the `StorageClass`.
Expand All @@ -227,4 +254,4 @@ def transfer(self, inputDatastore, inputUri, storageClass, storageHint, typeName
"""
assert inputDatastore is not self # unless we want it for renames?
inMemoryDataset = inputDatastore.get(inputUri, storageClass)
return self.put(inMemoryDataset, storageClass, storageHint, typeName)
return self.put(inMemoryDataset, storageClass, dataUnits, typeName)
4 changes: 4 additions & 0 deletions tests/config/basic/butler.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@ datastore:
cls: lsst.daf.butler.datastores.posixDatastore.PosixDatastore
root: ./butler_test_repository
create: true
templates:
default: "{datasettype}/{tract:?}/{patch:?}/{filter:?}/{visit:?}"
calexp: "{datasettype}.{component:?}/{datasettype}_v{visit}_f{filter}_{component:?}"
metric: "{datasettype}.{component:?}/{datasettype}_v{visit:08d}_f{filter}_{component:?}"
storageClasses:
StructuredDataDictYaml:
pytype: dict
Expand Down
26 changes: 18 additions & 8 deletions tests/test_posixDatastore.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import lsst.utils.tests

from lsst.daf.butler.datastores.posixDatastore import PosixDatastore, DatastoreConfig
from lsst.daf.butler.core.dataUnits import DataUnits
from examplePythonTypes import MetricsExample


Expand Down Expand Up @@ -62,10 +63,13 @@ def testBasicPutGet(self):
"StructuredDataJson",
"StructuredDataPickle")]

dataUnits = DataUnits({"visit": 52, "filter": "V"})

for sc in storageClasses:
print("Using storageClass: {}".format(sc.name))
uri, comps = datastore.put(metrics, storageClass=sc, storageHint="tester_monolith.xxx",
typeName=None)
uri, comps = datastore.put(metrics, storageClass=sc,
dataUnits=dataUnits,
typeName="metric")

# Get
metricsOut = datastore.get(uri, storageClass=sc, parameters=None)
Expand Down Expand Up @@ -99,6 +103,8 @@ def testCompositePutGet(self):
metrics = makeExampleMetrics()
datastore = PosixDatastore(config=self.configFile)

dataUnits = DataUnits({"visit": 428, "filter": "R"})

# Create multiple storage classes for testing different formulations
# of composites
storageClasses = [datastore.storageClassFactory.getStorageClass(sc)
Expand All @@ -108,8 +114,9 @@ def testCompositePutGet(self):

for sc in storageClasses:
print("Using storageClass: {}".format(sc.name))
uri, comps = datastore.put(metrics, storageClass=sc, storageHint="testerc.json",
typeName=None)
uri, comps = datastore.put(metrics, storageClass=sc,
dataUnits=dataUnits,
typeName="metric")
self.assertIsNone(uri)

# Read all the components into a dict
Expand All @@ -125,8 +132,10 @@ def testRemove(self):
metrics = makeExampleMetrics()
datastore = PosixDatastore(config=self.configFile)
# Put
dataUnits = DataUnits({"visit": 638, "filter": "U"})
storageClass = datastore.storageClassFactory.getStorageClass("StructuredData")
uri, _ = datastore.put(metrics, storageClass=storageClass, storageHint="tester.json", typeName=None)
uri, _ = datastore.put(metrics, storageClass=storageClass,
dataUnits=dataUnits, typeName="metric")
# Get
metricsOut = datastore.get(uri, storageClass=storageClass, parameters=None)
self.assertEqualMetrics(metrics, metricsOut)
Expand All @@ -141,16 +150,17 @@ def testRemove(self):

def testTransfer(self):
metrics = makeExampleMetrics()
path = "tester.json"
dataUnits = DataUnits({"visit": 2048, "filter": "Uprime"})
inputConfig = DatastoreConfig(self.configFile)
inputConfig['datastore.root'] = os.path.join(self.testDir, "./test_input_datastore")
inputPosixDatastore = PosixDatastore(config=inputConfig)
outputConfig = inputConfig.copy()
outputConfig['datastore.root'] = os.path.join(self.testDir, "./test_output_datastore")
outputPosixDatastore = PosixDatastore(config=outputConfig)
storageClass = outputPosixDatastore.storageClassFactory.getStorageClass("StructuredData")
inputUri, _ = inputPosixDatastore.put(metrics, storageClass, path)
outputUri, _ = outputPosixDatastore.transfer(inputPosixDatastore, inputUri, storageClass, path)
inputUri, _ = inputPosixDatastore.put(metrics, storageClass, dataUnits, "metric")
outputUri, _ = outputPosixDatastore.transfer(inputPosixDatastore, inputUri,
storageClass, dataUnits, "metric")
metricsOut = outputPosixDatastore.get(outputUri, storageClass)
self.assertEqualMetrics(metrics, metricsOut)

Expand Down

0 comments on commit 497b9cf

Please sign in to comment.