Skip to content

Commit

Permalink
Add ButlerRepoIndex class for handling index caching and reading.
Browse files Browse the repository at this point in the history
  • Loading branch information
timj committed Nov 18, 2021
1 parent 71e9de5 commit 913d7c7
Show file tree
Hide file tree
Showing 4 changed files with 214 additions and 28 deletions.
1 change: 1 addition & 0 deletions python/lsst/daf/butler/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,6 @@
from ._butlerConfig import *
from ._deferredDatasetHandle import *
from ._butler import *
from ._butlerRepoIndex import *
from .transfers import YamlRepoExportBackend, YamlRepoImportBackend
from .version import *
31 changes: 17 additions & 14 deletions python/lsst/daf/butler/_butler.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@
from .core.utils import transactional
from ._deferredDatasetHandle import DeferredDatasetHandle
from ._butlerConfig import ButlerConfig
from ._butlerRepoIndex import ButlerRepoIndex
from .registry import (
Registry,
RegistryConfig,
Expand Down Expand Up @@ -303,24 +304,26 @@ def get_repo_uri(cls, label: str) -> ButlerURI:
Notes
-----
The index of butler repositories is found by looking for a
configuration file at the URI pointed at by the environment
variable ``$BUTLER_REPOSITORY_INDEX``. The configuration file
is a simple dictionary lookup of the form:
See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
information is discovered.
"""
return ButlerRepoIndex.get_repo_uri(label)

.. code-block:: yaml
@classmethod
def get_known_repos(cls) -> Set[str]:
"""Retrieve the list of known repository labels.
label1: uri1
label2: uri2
Returns
-------
repos : `set` of `str`
All the known labels. Can be empty if no index can be found.
and can be in YAML or JSON format.
Notes
-----
See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
information is discovered.
"""
index_uri = os.environ.get("BUTLER_REPOSITORY_INDEX")
if index_uri is None:
raise KeyError("No repository index defined in enviroment variable BUTLER_REPOSITORY_INDEX.")

config = Config(index_uri)
return config[label]
return ButlerRepoIndex.get_known_repos()

@staticmethod
def makeRepo(root: str, config: Union[Config, str, None] = None,
Expand Down
173 changes: 173 additions & 0 deletions python/lsst/daf/butler/_butlerRepoIndex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("ButlerRepoIndex",)

import os

from typing import (
ClassVar,
Dict,
Set,
)

from .core import (
ButlerURI,
Config,
)


class ButlerRepoIndex:
    """Index of all known butler repositories.

    The index of butler repositories is found by looking for a
    configuration file at the URI pointed at by the environment
    variable ``$BUTLER_REPOSITORY_INDEX``. The configuration file
    is a simple dictionary lookup of the form:

    .. code-block:: yaml

       label1: uri1
       label2: uri2

    and can be in YAML or JSON format. The content of the file will be
    cached.
    """

    index_env_var: ClassVar[str] = "BUTLER_REPOSITORY_INDEX"
    """The name of the environment variable to read to locate the index."""

    _cache: ClassVar[Dict[ButlerURI, Config]] = {}
    """Cache of indexes. In most scenarios only one index will be found
    and the environment will not change. In tests this may not be true."""

    @classmethod
    def _read_repository_index(cls, index_uri: ButlerURI) -> Config:
        """Read the repository index from the supplied URI.

        Parameters
        ----------
        index_uri : `ButlerURI`
            URI of the repository index. It is also used as the cache key.

        Returns
        -------
        repo_index : `Config`
            The index found at this URI.

        Raises
        ------
        FileNotFoundError
            Raised if the URI does not exist.

        Notes
        -----
        The class-level cache is consulted first; the file is only read
        when no entry exists for this URI, and the result is then stored
        in the cache for subsequent calls.
        """
        if index_uri in cls._cache:
            return cls._cache[index_uri]

        repo_index = Config(index_uri)
        cls._cache[index_uri] = repo_index

        return repo_index

    @classmethod
    def _get_index_uri(cls) -> ButlerURI:
        """Find the URI to the repository index.

        Returns
        -------
        index_uri : `ButlerURI`
            URI to the repository index.

        Raises
        ------
        KeyError
            Raised if the location of the index could not be determined
            because the environment variable named by ``index_env_var``
            is not set.
        """
        index_uri = os.environ.get(cls.index_env_var)
        if index_uri is None:
            raise KeyError(f"No repository index defined in environment variable {cls.index_env_var}")
        return ButlerURI(index_uri)

    @classmethod
    def _read_repository_index_from_environment(cls) -> Config:
        """Look in environment for index location and read it.

        Returns
        -------
        repo_index : `Config`
            The index found in the environment.

        Raises
        ------
        KeyError
            Raised if the environment variable locating the index is
            not set.
        FileNotFoundError
            Raised if the index URI from the environment does not exist.
        """
        index_uri = cls._get_index_uri()
        return cls._read_repository_index(index_uri)

    @classmethod
    def get_known_repos(cls) -> Set[str]:
        """Retrieve the list of known repository labels.

        Returns
        -------
        repos : `set` of `str`
            All the known labels. Can be empty if no index can be found.
        """
        try:
            repo_index = cls._read_repository_index_from_environment()
        except (FileNotFoundError, KeyError):
            # A missing environment variable or a missing index file both
            # mean "no repositories known" rather than an error here.
            return set()
        return set(repo_index)

    @classmethod
    def get_repo_uri(cls, label: str) -> ButlerURI:
        """Look up the label in a butler repository index.

        Parameters
        ----------
        label : `str`
            Label of the Butler repository to look up.

        Returns
        -------
        uri : `ButlerURI`
            URI to the Butler repository associated with the given label.

        Raises
        ------
        KeyError
            Raised if the label is not found in the index, or if an index
            can not be found at all.
        FileNotFoundError
            Raised if an index is defined in the environment but it
            can not be found.
        """
        repo_index = cls._read_repository_index_from_environment()
        repo_uri = repo_index.get(label)
        if repo_uri is None:
            # Re-read the index location purely to build a helpful error
            # message. This should not raise since it worked earlier.
            try:
                index_uri = str(cls._get_index_uri())
            except KeyError:
                index_uri = "<environment variable not defined>"
            raise KeyError(f"Label '{label}' not known to repository index at {index_uri}")
        return ButlerURI(repo_uri)
37 changes: 23 additions & 14 deletions tests/test_butler.py
Original file line number Diff line number Diff line change
Expand Up @@ -486,21 +486,30 @@ def testConstructor(self):
# repository.
butler_index = Config()
butler_index["label"] = self.tmpConfigFile
butler_index.dump
with ButlerURI.temporary_uri(suffix=".yaml") as temp_file:
butler_index.dumpToUri(temp_file)
with unittest.mock.patch.dict(os.environ, {"BUTLER_REPOSITORY_INDEX": str(temp_file)}):
uri = Butler.get_repo_uri("label")
butler = Butler(uri, writeable=False)
self.assertIsInstance(butler, Butler)
with self.assertRaises(KeyError):
Butler.get_repo_uri("missing")
with unittest.mock.patch.dict(os.environ, {"BUTLER_REPOSITORY_INDEX": "file://not_found/x.yaml"}):
with self.assertRaises(FileNotFoundError):
Butler.get_repo_uri("label")
with self.assertRaises(KeyError):
# No environment variable set.
for suffix in (".yaml", ".json"):
# Ensure that the content differs so that we know that
# we aren't reusing the cache.
bad_label = f"s3://bucket/not_real{suffix}"
butler_index["bad_label"] = bad_label
with ButlerURI.temporary_uri(suffix=suffix) as temp_file:
butler_index.dumpToUri(temp_file)
with unittest.mock.patch.dict(os.environ, {"BUTLER_REPOSITORY_INDEX": str(temp_file)}):
self.assertEqual(Butler.get_known_repos(), set(("label", "bad_label")))
uri = Butler.get_repo_uri("bad_label")
self.assertEqual(uri, ButlerURI(bad_label))
uri = Butler.get_repo_uri("label")
butler = Butler(uri, writeable=False)
self.assertIsInstance(butler, Butler)
with self.assertRaises(KeyError) as cm:
Butler.get_repo_uri("missing")
self.assertIn("not known to", str(cm.exception))
with unittest.mock.patch.dict(os.environ, {"BUTLER_REPOSITORY_INDEX": "file://not_found/x.yaml"}):
with self.assertRaises(FileNotFoundError):
Butler.get_repo_uri("label")
with self.assertRaises(KeyError) as cm:
# No environment variable set.
Butler.get_repo_uri("label")
self.assertIn("No repository index defined", str(cm.exception))

def testBasicPutGet(self):
storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
Expand Down

0 comments on commit 913d7c7

Please sign in to comment.