Review fixes #4.
 - added JSON from- and to-bytes methods.
 - fixed all the flake8 errors.
 - wrote tests for s3utils.
 - rewrote parsePathToUriElements to make more sense
   (a rough behaviour sketch follows below). Added examples,
   better documentation, and split-on-root capability.
 - refactored how read/write to file methods work for
   formatters. No more serialization code duplication.
 - fixed ingest functionality in S3Datastore; the path-URI
   changes made it possible to drop the duplicate
   path-removing code from S3LocationFactory.
DinoBektesevic committed May 17, 2019
1 parent 8429268 commit 4e88b7d
Showing 17 changed files with 442 additions and 318 deletions.
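
As a rough illustration of the split-on-root behaviour mentioned in the commit message above, the rewritten parsePathToUriElements presumably decomposes an S3 URI (or a plain path) into a scheme, a root, and a path relative to that root. The sketch below is an invented stand-in, not the actual s3utils implementation; the real signature and edge-case handling may differ.

import os


def parsePathToUriElementsSketch(path, root=None):
    """Illustrative stand-in only: split a path into (scheme, root, relpath)."""
    if path.startswith("s3://"):
        scheme = "s3://"
        stripped = path[len(scheme):]
    else:
        scheme = "file://"
        stripped = path.lstrip("/")

    if root is not None and stripped.startswith(root):
        # "Split on root": everything up to the given root stays the root,
        # the remainder is returned relative to it.
        return scheme, root, os.path.relpath(stripped, root)

    # Otherwise treat the first component (e.g. the bucket name) as the root.
    first, _, rest = stripped.partition("/")
    return scheme, first, rest


# -> ('s3://', 'bucket', 'repo/subdir')
print(parsePathToUriElementsSketch("s3://bucket/repo/subdir"))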
77 changes: 70 additions & 7 deletions python/lsst/daf/butler/butler.py
@@ -29,7 +29,6 @@
import contextlib
import logging
import itertools
import tempfile

try:
import boto3
@@ -60,9 +59,9 @@ class ButlerValidationError(ValidationError):
pass



class Butler:
"""Main entry point for the data access system.
Attributes
----------
config : `str`, `ButlerConfig` or `Config`, optional
@@ -73,6 +72,7 @@ class Butler:
Datastore to use for storage.
registry : `Registry`
Registry to use for lookups.
Parameters
----------
config : `ButlerConfig`, `Config` or `str`, optional.
@@ -93,6 +93,7 @@ class Butler:
Directory paths to search when calculating the full Butler
configuration. Not used if the supplied config is already a
`ButlerConfig`.
Raises
------
ValueError
@@ -102,6 +103,7 @@ class Butler:

GENERATION = 3
"""This is a Generation 3 Butler.
This attribute may be removed in the future, once the Generation 2 Butler
interface has been fully retired; it should only be used in transitional
code.
@@ -110,6 +112,67 @@ class Butler:
@staticmethod
def makeRepo(root, config=None, standalone=False, createRegistry=True, searchPaths=None,
forceConfigRoot=True, outfile=None):
"""Create an empty data repository by adding a butler.yaml config
to a repository root directory.
Parameters
----------
root : `str`
Filesystem path to the root of the new repository. Will be created
if it does not exist.
config : `Config` or `str`, optional
Configuration to write to the repository, after setting any
root-dependent Registry or Datastore config options. Can not
be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
configuration will be used. Root-dependent config options
specified in this config are overwritten if ``forceConfigRoot``
is `True`.
standalone : `bool`
If True, write all expanded defaults, not just customized or
repository-specific settings.
This (mostly) decouples the repository from the default
configuration, insulating it from changes to the defaults (which
may be good or bad, depending on the nature of the changes).
Future *additions* to the defaults will still be picked up when
initializing `Butlers` to repos created with ``standalone=True``.
createRegistry : `bool`, optional
If `True` create a new Registry.
searchPaths : `list` of `str`, optional
Directory paths to search when calculating the full butler
configuration.
forceConfigRoot : `bool`, optional
If `False`, any values present in the supplied ``config`` that
would normally be reset are not overridden and will appear
directly in the output config. This allows non-standard overrides
of the root directory for a datastore or registry to be given.
If this parameter is `True` the values for ``root`` will be
forced into the resulting config if appropriate.
outfile : `str`, optional
If not-`None`, the output configuration will be written to this
location rather than into the repository itself.
Returns
-------
config : `Config`
The updated `Config` instance written to the repo.
Raises
------
ValueError
Raised if a ButlerConfig or ConfigSubset is passed instead of a
regular Config (as these subclasses would make it impossible to
support ``standalone=False``).
os.error
Raised if the directory does not exist, exists but is not a
directory, or cannot be created.
Notes
-----
Note that when ``standalone=False`` (the default), the configuration
search path (see `ConfigSubset.defaultSearchPaths`) that was used to
construct the repository should also be used to construct any Butlers
to avoid configuration inconsistencies.
"""
if isinstance(config, (ButlerConfig, ConfigSubset)):
raise ValueError("makeRepo must be passed a regular Config without defaults applied.")

@@ -127,7 +190,7 @@ def makeRepo(root, config=None, standalone=False, createRegistry=True, searchPat
bucket.put_object(Bucket=rootpath, Key=(relpath))
config = Config(config)

# If we are creating a new repo from scratch with relative roots
# If we are creating a new repo from scratch with relative roots,
# do not propagate an explicit root from the config file
if "root" in config:
del config["root"]
@@ -148,10 +211,10 @@ def makeRepo(root, config=None, standalone=False, createRegistry=True, searchPat
registryClass.fromConfig(config, create=createRegistry, butlerRoot=root)
return config

def __init__(self, config=None, collection=None, run=None):
def __init__(self, config=None, collection=None, run=None, searchPaths=None):
# save arguments for pickling
self._args = (config, collection, run)
self.config = ButlerConfig(config)
self.config = ButlerConfig(config, searchPaths=searchPaths)

if "root" in self.config:
butlerRoot = self.config["root"]
@@ -592,12 +655,12 @@ def validateConfiguration(self, logFailures=False, datasetTypeNames=None, ignore
instrumentEntries = self.registry.findDimensionEntries("instrument")
instruments = {e["instrument"] for e in instrumentEntries}

# For each datasetType that has an Instrument dimension, create
# For each datasetType that has an instrument dimension, create
# a DatasetRef for each defined instrument
datasetRefs = []

for datasetType in entities:
if "Instrument" in datasetType.dimensions:
if "instrument" in datasetType.dimensions:
for instrument in instruments:
datasetRef = DatasetRef(datasetType, {"instrument": instrument})
datasetRefs.append(datasetRef)
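
To illustrate the butler.py changes above, here is a hedged usage sketch of Butler.makeRepo together with the new searchPaths argument on Butler.__init__; the repository root and extra config directory are invented example values, not anything from this commit.

from lsst.daf.butler import Butler

# Invented example locations, for illustration only.
REPO_ROOT = "/data/example_repo"
EXTRA_CONFIG_DIR = "/etc/butler_overrides"

# Create an empty repository; the updated Config written to it is returned.
config = Butler.makeRepo(REPO_ROOT, createRegistry=True)

# The new searchPaths argument forwards extra default-config directories to
# ButlerConfig, with higher priority than those found via the environment.
butler = Butler(REPO_ROOT, searchPaths=[EXTRA_CONFIG_DIR])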
17 changes: 12 additions & 5 deletions python/lsst/daf/butler/core/butlerConfig.py
@@ -28,6 +28,7 @@
import os.path

from . import utils
from .s3utils import parsePathToUriElements
from .config import Config
from .datastore import DatastoreConfig
from .schema import SchemaConfig
@@ -42,19 +43,25 @@

class ButlerConfig(Config):
"""Contains the configuration for a `Butler`
The configuration is read and merged with default configurations for
the particular classes. The defaults are read according to the rules
outlined in `ConfigSubset`. Each component of the configuration associated
with a configuration class reads its own defaults.
Parameters
----------
other : `str`, `Config`, optional
Path to butler configuration YAML file or a directory containing a
"butler.yaml" file. If `None` the butler will
be configured based entirely on defaults read from the environment.
be configured based entirely on defaults read from the environment
or from ``searchPaths``.
No defaults will be read if a `ButlerConfig` is supplied directly.
searchPaths : `list` or `tuple`, optional
Explicit additional paths to search for defaults. They should
be supplied in priority order. These paths have higher priority
than those read from the environment in
`ConfigSubset.defaultSearchPaths()`. They are only read if ``other``
refers to a configuration file or directory.
"""

def __init__(self, other=None, searchPaths=None):
@@ -71,10 +78,10 @@ def __init__(self, other=None, searchPaths=None):

if isinstance(other, str):
if other.startswith('s3://') and not other.endswith('.yaml'):
scheme, root, relpath = utils.parsePathToUriElements(other)
scheme, root, relpath = parsePathToUriElements(other)
other = scheme + os.path.join(root, relpath, "butler.yaml")
else:
if os.path.isdir(other):
if os.path.isdir(other):
other = os.path.join(other, "butler.yaml")

# Create an empty config for us to populate
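
The butlerConfig.py hunk above makes a directory-style location resolve to the butler.yaml inside it, whether it lives in S3 or on local disk. A minimal, self-contained sketch of that normalization follows; the helper and the bucket name are invented stand-ins, not the real s3utils code.

import os.path


def _splitS3Uri(uri):
    # Invented stand-in for parsePathToUriElements, for illustration only:
    # "s3://bucket/some/path" -> ("s3://", "bucket", "some/path").
    rest = uri[len("s3://"):]
    bucket, _, relpath = rest.partition("/")
    return "s3://", bucket, relpath


other = "s3://example-bucket/repo"  # invented example location

if other.startswith("s3://") and not other.endswith(".yaml"):
    # An S3 "directory" was given; point at the butler.yaml inside it.
    scheme, root, relpath = _splitS3Uri(other)
    other = scheme + os.path.join(root, relpath, "butler.yaml")
elif os.path.isdir(other):
    # A local directory was given; same idea on a plain filesystem path.
    other = os.path.join(other, "butler.yaml")

print(other)  # -> s3://example-bucket/repo/butler.yaml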
3 changes: 1 addition & 2 deletions python/lsst/daf/butler/core/config.py
@@ -33,12 +33,11 @@
import io
from yaml.representer import Representer

from . import utils
from .s3utils import parsePathToUriElements

try:
import boto3
except:
except ImportError:
boto3 = None

import lsst.utils
1 change: 1 addition & 0 deletions python/lsst/daf/butler/core/datasets.py
@@ -347,6 +347,7 @@ class DatasetRef:

def __init__(self, datasetType, dataId, id=None, run=None, hash=None, components=None):
assert isinstance(datasetType, DatasetType)

# Check the dimensions match if a DataId is provided
if isinstance(dataId, DataId) and isinstance(datasetType.dimensions, DimensionGraph):
if dataId.dimensions() != datasetType.dimensions:
44 changes: 4 additions & 40 deletions python/lsst/daf/butler/core/location.py
@@ -155,8 +155,9 @@ def uri(self):
def bucket(self):
"""Return the bucketname of this S3Location.
"""
# buckets are special because you only want their name, but path.joining them will
# not understand its relationship to rootDir without it.
# buckets are special because you only want their name, but
# path.join will not understand their relationship to rootDir
# without the ending /
return self._bucket.strip('/')

@property
@@ -194,7 +195,6 @@ def updateExtension(self, ext):
self._relpath = path + ext



class S3LocationFactory:
"""Factory for `S3Location` instances.
"""
@@ -254,40 +254,4 @@ def fromPath(self, path):
raise ValueError(('A path whose absolute location is in an S3 bucket '
'can not have an absolute path: {}').format(path))

#if path.startswith('s3://') or path.startswith(self._bucket):


# sometimes people give relative path with the datastoreRoot
# and sometimes they give the pathInStore type path. Sometimes the bucket
# is provided as well. This can duplicate the datastoreRoot and bucket
# which we remove
dirs = path.lstrip('/').split('/')
nEqual = 0
# we shouldn't remove a sub-dir in the bucket that can have the same name,
# so we require that the bucketname/rootdir order is respected. A nicer way
# of writing this would be nice
for d in dirs:
if (d==self._bucket):
nEqual += 1
else:
break

for d in dirs[nEqual:]:
if (d==self._datastoreRoot):
nEqual += 1
else:
break
path = os.path.join(*dirs[nEqual:])

if nEqual == 0:
# path does not contain the rootdir or bucket
return self.fromUri('s3://' + os.path.join(self._bucket, self._datastoreRoot, path))
elif nEqual == 1:
# path contains the root dir or bucket
return self.fromUri('s3://' + os.path.join(self._bucket, path))
elif nEqual >1:
# path probably contains multiple datastoreRoots and/or buckets, or is completely broken
return self.fromUri('s3://'+ os.path.join(self._bucket, self._datastoreRoot, path))
else:
# an act of god
raise OSError("Don't understand path at all: {}".format(path))
return self.fromUri('s3://' + os.path.join(self._bucket, self._datastoreRoot, path))
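
The net effect of the location.py change above: fromPath no longer tries to strip repeated bucket or datastore-root components out of the incoming path, it simply joins bucket, datastore root, and the (now expected-to-be-relative) path. A tiny sketch of the resulting URI construction, with invented names standing in for S3LocationFactory state:

import os.path

# Invented stand-ins for the factory's bucket/root and a pathInStore-style path.
bucket = "example-bucket"
datastoreRoot = "repo"
path = "calexp/r/c01.fits"

# With the simplified fromPath, the URI is a plain join of the three parts;
# de-duplicating any repeated bucket/root components is no longer attempted.
uri = "s3://" + os.path.join(bucket, datastoreRoot, path)
print(uri)  # -> s3://example-bucket/repo/calexp/r/c01.fits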
