add query_datasets CLI command
n8pease committed Oct 8, 2020
1 parent e56455b commit 3552d3d
Showing 5 changed files with 317 additions and 2 deletions.
5 changes: 4 additions & 1 deletion python/lsst/daf/butler/cli/cmd/__init__.py
@@ -26,7 +26,9 @@
"prune_collection",
"query_collections",
"query_dataset_types",
"remove_dataset_type")
"query_datasets",
"remove_dataset_type",
)


from .commands import (butler_import,
@@ -36,5 +38,6 @@
                        prune_collection,
                        query_collections,
                        query_dataset_types,
+                       query_datasets,
                        remove_dataset_type,
                        )
54 changes: 53 additions & 1 deletion python/lsst/daf/butler/cli/cmd/commands.py
@@ -19,14 +19,17 @@
 # You should have received a copy of the GNU General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.

+from astropy.table import Table
 import click
+from collections import OrderedDict
 import yaml

 from ..opt import (collection_type_option, dataset_type_option, directory_argument, options_file_option,
                    glob_argument, repo_argument, transfer_option, verbose_option)
-from ..utils import cli_handle_exception, split_commas, typeStrAcceptsMultiple, unwrap
+from ..utils import cli_handle_exception, split_commas, to_upper, typeStrAcceptsMultiple, unwrap
 from ... import script


 willCreateRepoHelp = "REPO is the URI or path to the new repository. Will be created if it does not exist."
 existingRepoHelp = "REPO is the URI or path to an existing data repository root or configuration file."

@@ -167,3 +170,52 @@ def query_dataset_types(*args, **kwargs):
 def remove_dataset_type(*args, **kwargs):
     """Remove a dataset type definition from a repository."""
     cli_handle_exception(script.removeDatasetType, *args, **kwargs)
+
+
+@click.command()
+@repo_argument(required=True)
+@glob_argument(help="GLOB is one or more glob-style expressions that fully or partially identify the "
+               "dataset types to be queried.")
+@click.option("--collections",
+              help=unwrap("""One or more expressions that fully or partially identify the collections to
+                          search for datasets. If not provided, all datasets are returned."""),
+              multiple=True,
+              metavar=typeStrAcceptsMultiple,
+              callback=split_commas)
+@click.option("--where",
+              help=unwrap("""A string expression similar to a SQL WHERE clause. May involve any column of a
+                          dimension table or a dimension name as a shortcut for the primary key column of a
+                          dimension table."""))
+@click.option("--deduplicate",
+              is_flag=True,
+              help=unwrap("""For each result data ID, only yield one DatasetRef of each DatasetType, from the
+                          first collection in which a dataset of that dataset type appears (according to the
+                          order of `collections` passed in). If used, `collections` must specify at least one
+                          expression and must not contain wildcards."""))
+@click.option("--components",
+              type=click.Choice(["ALL", "NONE", "UNMATCHED"], case_sensitive=False),
+              default="UNMATCHED",
+              show_default=True,
+              metavar="[ALL|NONE|UNMATCHED]",
+              callback=to_upper,
+              help=unwrap("""If UNMATCHED: apply dataset expression patterns to component dataset type names
+                          only if their parent datasets were not matched by the expression. If ALL: apply all
+                          dataset expression patterns to components. If NONE: never apply patterns to
+                          components. Fully-specified component datasets are always included."""))
+@options_file_option()
+def query_datasets(**kwargs):
+    """List the datasets in a repository."""
+    datasets = cli_handle_exception(script.queryDatasets, **kwargs)
+
+    tables = {}
+    for datasetRef in datasets:
+        rows = tables.get(datasetRef.datasetType.name, [])
+        row = OrderedDict(type=datasetRef.datasetType.name, run=datasetRef.run, id=datasetRef.id)
+        row.update(datasetRef.dataId.items())
+        rows.append(row)
+        tables[datasetRef.datasetType.name] = rows
+
+    for datasetName, rows in tables.items():
+        print("")
+        Table(rows).pprint_all()
+        print("")
1 change: 1 addition & 0 deletions python/lsst/daf/butler/script/__init__.py
@@ -25,5 +25,6 @@
 from .configValidate import configValidate
 from .pruneCollection import pruneCollection
 from .queryCollections import queryCollections
+from .queryDatasets import queryDatasets
 from .queryDatasetTypes import queryDatasetTypes
 from .removeDatasetType import removeDatasetType
84 changes: 84 additions & 0 deletions python/lsst/daf/butler/script/queryDatasets.py
@@ -0,0 +1,84 @@
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from .. import Butler
from ..core.utils import globToRegex


def queryDatasets(repo, glob, collections, where, deduplicate, components):
    """Get dataset refs from a repository.

    Parameters
    ----------
    repo : `str`
        URI to the location of the repo or URI to a config file describing the
        repo and its location.
    glob : iterable [`str`]
        A list of glob-style search strings that fully or partially identify
        the dataset type names to search for.
    collections : iterable [`str`]
        A list of glob-style search strings that fully or partially identify
        the collections to search for.
    where : `str`
        A string expression similar to a SQL WHERE clause. May involve any
        column of a dimension table or (as a shortcut for the primary key
        column of a dimension table) a dimension name.
    deduplicate : `bool`
        For each result data ID, only yield one DatasetRef of each DatasetType,
        from the first collection in which a dataset of that dataset type
        appears (according to the order of `collections` passed in). If used,
        `collections` must specify at least one expression and must not
        contain wildcards.
    components : `str`
        One of "ALL", "NONE", or "UNMATCHED". If "UNMATCHED": apply patterns
        to components only if their parent datasets were not matched by the
        expression. If "ALL": apply all dataset expression patterns to
        components. If "NONE": never apply patterns to components.
        Fully-specified component datasets are always included.

    Returns
    -------
    refs : `queries.DatasetQueryResults`
        Dataset references matching the given query criteria.
    """
    butler = Butler(repo)
    dataset = globToRegex(glob)

    if collections and not deduplicate:
        collections = globToRegex(collections)
    elif not collections:
        collections = ...

    if components == "ALL":
        components = True
    elif components == "NONE":
        components = False
    elif components == "UNMATCHED":
        components = None
    else:
        raise RuntimeError(f"Unrecognized value for components: {components}")

    if not dataset:
        dataset = ...
    return butler.registry.queryDatasets(datasetType=dataset,
                                         collections=collections,
                                         where=where,
                                         deduplicate=deduplicate,
                                         components=components)
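
The query_datasets command above reaches this function through cli_handle_exception, but it can also be called directly. A minimal sketch, assuming a repository at the placeholder path "repo" and that where=None (the value the CLI passes when --where is omitted) imposes no constraint::

    from lsst.daf.butler.script import queryDatasets

    # Equivalent to "butler query-datasets repo" with no other arguments:
    # empty glob and collections expressions match everything.
    refs = queryDatasets(repo="repo", glob=(), collections=(),
                         where=None, deduplicate=False, components="UNMATCHED")
    for ref in refs:
        print(ref.datasetType.name, ref.run, ref.id, dict(ref.dataId.items()))
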
175 changes: 175 additions & 0 deletions tests/test_cliCmdQueryDatasets.py
@@ -0,0 +1,175 @@
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Unit tests for daf_butler CLI query-collections command.
"""

import os
import unittest
import unittest.mock

from lsst.daf.butler import Butler, Datastore, FileDataset
from lsst.daf.butler.cli.butler import cli
from lsst.daf.butler.cli.utils import clickResultMsg, LogCliRunner


TESTDIR = os.path.abspath(os.path.dirname(__file__))


def _mock_export(refs, *,
                 directory=None,
                 transfer=None):
    """A mock of `Datastore.export` that satisfies the requirement that the
    refs passed in are included in the `FileDataset` objects returned.

    This can be used to construct a `Datastore` mock that can be used in
    repository export via::

        datastore = unittest.mock.Mock(spec=Datastore)
        datastore.export = _mock_export
    """
    for ref in refs:
        yield FileDataset(refs=[ref],
                          path="mock/path",
                          formatter="lsst.daf.butler.formatters.json.JsonFormatter")


def _mock_get(ref, parameters=None):
    """A mock of `Datastore.get` that just returns the integer dataset ID value
    and parameters it was given.
    """
    return (ref.id, parameters)


def _splitRows(text):
    """Transform a text table into a list of rows.

    Removes empty rows. Does not attempt to remove header or divider rows.

    Parameters
    ----------
    text : `str`
        Text that represents a text-based table.

    Returns
    -------
    `list` [`str`]
        The non-empty rows of the table.
    """
    return [line.strip() for line in text.splitlines() if line]


class QueryDatasetsScriptTest(unittest.TestCase):

    def setUp(self):
        self.repoName = "repo"
        self.runner = LogCliRunner()

    def makeTestRepo(self):
        # create the repo:
        result = self.runner.invoke(cli, ["create", self.repoName])
        self.assertEqual(result.exit_code, 0, clickResultMsg(result))

        # add datasets to the registry, mock the datastore:
        with unittest.mock.patch.object(Datastore, "fromConfig", spec=Datastore.fromConfig):
            butler = Butler(self.repoName, writeable=True)
            butler.datastore.export = _mock_export
            butler.datastore.get = _mock_get
            butler.import_(filename=os.path.join(TESTDIR, "data", "registry", "base.yaml"))
            butler.import_(filename=os.path.join(TESTDIR, "data", "registry", "datasets.yaml"))

    def testQueryDatasets(self):
        with self.runner.isolated_filesystem():
            self.makeTestRepo()

            # verify that query-datasets now returns expected output:
            result = self.runner.invoke(cli, ["query-datasets", self.repoName])
            self.assertEqual(result.exit_code, 0, clickResultMsg(result))
            expected = """type run id instrument detector physical_filter
---- ---------- --- ---------- -------- ---------------
flat imported_g 4 Cam1 2 Cam1-G
flat imported_g 5 Cam1 3 Cam1-G
flat imported_g 6 Cam1 4 Cam1-G
flat imported_r 10 Cam1 1 Cam1-R1
flat imported_r 11 Cam1 2 Cam1-R1
flat imported_r 12 Cam1 3 Cam1-R2
flat imported_r 13 Cam1 4 Cam1-R2
type run id instrument detector
---- ---------- --- ---------- --------
bias imported_g 1 Cam1 1
bias imported_g 2 Cam1 2
bias imported_g 3 Cam1 3
bias imported_r 7 Cam1 2
bias imported_r 8 Cam1 3
bias imported_r 9 Cam1 4"""
            self.assertEqual(_splitRows(expected), _splitRows(result.output))

            # verify the --where option
            result = self.runner.invoke(cli, ["query-datasets", self.repoName, "flat",
                                              "--where", "detector=2"])
            self.assertEqual(result.exit_code, 0, clickResultMsg(result))
            expected = """type run id instrument detector physical_filter
---- ---------- --- ---------- -------- ---------------
flat imported_g 4 Cam1 2 Cam1-G
flat imported_r 11 Cam1 2 Cam1-R1"""
            self.assertEqual(_splitRows(expected), _splitRows(result.output))

            # verify the --collections option
            result = self.runner.invoke(cli, ["query-datasets", self.repoName, "bias",
                                              "--collections", "imported_g,imported_r"])
            self.assertEqual(result.exit_code, 0, clickResultMsg(result))
            expected = """type run id instrument detector
---- ---------- --- ---------- --------
bias imported_g 1 Cam1 1
bias imported_g 2 Cam1 2
bias imported_g 3 Cam1 3
bias imported_r 7 Cam1 2
bias imported_r 8 Cam1 3
bias imported_r 9 Cam1 4"""
            self.assertEqual(_splitRows(expected), _splitRows(result.output))

            # verify the --deduplicate option
            result = self.runner.invoke(cli, ["query-datasets", self.repoName, "bias",
                                              "--collections", "imported_g,imported_r",
                                              "--deduplicate"])
            self.assertEqual(result.exit_code, 0, clickResultMsg(result))
            expected = """type run id instrument detector
---- ---------- --- ---------- --------
bias imported_g 1 Cam1 1
bias imported_g 2 Cam1 2
bias imported_g 3 Cam1 3
bias imported_r 9 Cam1 4"""
            self.assertEqual(_splitRows(expected), _splitRows(result.output))

            # verify the --components option
            result = self.runner.invoke(cli, ["query-datasets", self.repoName,
                                              "--components", "ALL"])
            self.assertEqual(result.exit_code, 0, clickResultMsg(result))
            # The output for ALL is pretty long; instead of testing for the
            # exact contents, just look for the expected number of rows
            # (including header and separator rows).
            self.assertEqual(len(_splitRows(result.output)), 306)


if __name__ == "__main__":
    unittest.main()
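
A note on the comparison strategy: _splitRows drops blank lines and strips each remaining line, so the triple-quoted expected tables above are insensitive to surrounding indentation, though internal column spacing must still match astropy's output exactly. A small, made-up illustration::

    text = """
    type run        id
    ---- ---------- --
    flat imported_g  4"""
    assert _splitRows(text) == ["type run        id",
                                "---- ---------- --",
                                "flat imported_g  4"]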
