add query_datasets CLI command
n8pease committed Oct 8, 2020
1 parent e56455b commit 3552d3d
Showing 5 changed files with 317 additions and 2 deletions.
5 changes: 4 additions & 1 deletion python/lsst/daf/butler/cli/cmd/__init__.py
@@ -26,7 +26,9 @@
"prune_collection",
"query_collections",
"query_dataset_types",
"remove_dataset_type")
"query_datasets",
"remove_dataset_type",
)


from .commands import (butler_import,
@@ -36,5 +38,6 @@
                        prune_collection,
                        query_collections,
                        query_dataset_types,
+                       query_datasets,
                        remove_dataset_type,
                        )
54 changes: 53 additions & 1 deletion python/lsst/daf/butler/cli/cmd/commands.py
@@ -19,14 +19,17 @@
 # You should have received a copy of the GNU General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.

+from astropy.table import Table
 import click
+from collections import OrderedDict
 import yaml

 from ..opt import (collection_type_option, dataset_type_option, directory_argument, options_file_option,
                    glob_argument, repo_argument, transfer_option, verbose_option)
-from ..utils import cli_handle_exception, split_commas, typeStrAcceptsMultiple, unwrap
+from ..utils import cli_handle_exception, split_commas, to_upper, typeStrAcceptsMultiple, unwrap
 from ... import script


 willCreateRepoHelp = "REPO is the URI or path to the new repository. Will be created if it does not exist."
 existingRepoHelp = "REPO is the URI or path to an existing data repository root or configuration file."

@@ -167,3 +170,52 @@ def query_dataset_types(*args, **kwargs):
 def remove_dataset_type(*args, **kwargs):
     """Remove a dataset type definition from a repository."""
     cli_handle_exception(script.removeDatasetType, *args, **kwargs)
+
+
+@click.command()
+@repo_argument(required=True)
+@glob_argument(help="GLOB is one or more glob-style expressions that fully or partially identify the "
+               "dataset types to be queried.")
+@click.option("--collections",
+              help=unwrap("""One or more expressions that fully or partially identify the collections to
+                          search for datasets. If not provided, all datasets are returned."""),
+              multiple=True,
+              metavar=typeStrAcceptsMultiple,
+              callback=split_commas)
+@click.option("--where",
+              help=unwrap("""A string expression similar to a SQL WHERE clause. May involve any column of a
+                          dimension table or a dimension name as a shortcut for the primary key column of a
+                          dimension table."""))
+@click.option("--deduplicate",
+              is_flag=True,
+              help=unwrap("""For each result data ID, only yield one DatasetRef of each DatasetType, from the
+                          first collection in which a dataset of that dataset type appears (according to the
+                          order of `collections` passed in). If used, `collections` must specify at least one
+                          expression and must not contain wildcards."""))
+@click.option("--components",
+              type=click.Choice(["ALL", "NONE", "UNMATCHED"], case_sensitive=False),
+              default="UNMATCHED",
+              show_default=True,
+              metavar="[ALL|NONE|UNMATCHED]",
+              callback=to_upper,
+              help=unwrap("""If UNMATCHED: apply dataset expression patterns to component dataset type names
+                          only if their parent datasets were not matched by the expression. If ALL: apply all
+                          dataset expression patterns to components. If NONE: never apply patterns to
+                          components. Fully-specified component datasets are always included."""))
+@options_file_option()
+def query_datasets(**kwargs):
+    """List the datasets in a repository."""
+    datasets = cli_handle_exception(script.queryDatasets, **kwargs)
+
+    tables = {}
+    for datasetRef in datasets:
+        rows = tables.get(datasetRef.datasetType.name, [])
+        row = OrderedDict(type=datasetRef.datasetType.name, run=datasetRef.run, id=datasetRef.id)
+        row.update(datasetRef.dataId.items())
+        rows.append(row)
+        tables[datasetRef.datasetType.name] = rows
+
+    for datasetName, rows in tables.items():
+        print("")
+        Table(rows).pprint_all()
+        print("")
1 change: 1 addition & 0 deletions python/lsst/daf/butler/script/__init__.py
@@ -25,5 +25,6 @@
 from .configValidate import configValidate
 from .pruneCollection import pruneCollection
 from .queryCollections import queryCollections
+from .queryDatasets import queryDatasets
 from .queryDatasetTypes import queryDatasetTypes
 from .removeDatasetType import removeDatasetType
84 changes: 84 additions & 0 deletions python/lsst/daf/butler/script/queryDatasets.py
@@ -0,0 +1,84 @@
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from .. import Butler
from ..core.utils import globToRegex


def queryDatasets(repo, glob, collections, where, deduplicate, components):
    """Get dataset refs from a repository.

    Parameters
    ----------
    repo : `str`
        URI to the location of the repo or URI to a config file describing the
        repo and its location.
    glob : iterable [`str`]
        A list of glob-style search strings that fully or partially identify
        the dataset type names to search for.
    collections : iterable [`str`]
        A list of glob-style search strings that fully or partially identify
        the collections to search for.
    where : `str`
        A string expression similar to a SQL WHERE clause. May involve any
        column of a dimension table or (as a shortcut for the primary key
        column of a dimension table) a dimension name.
    deduplicate : `bool`
        For each result data ID, only yield one DatasetRef of each DatasetType,
        from the first collection in which a dataset of that dataset type
        appears (according to the order of `collections` passed in). If used,
        `collections` must specify at least one expression and must not
        contain wildcards.
    components : `str`
        One of "ALL", "NONE", or "UNMATCHED". If "UNMATCHED": apply patterns
        to components only if their parent datasets were not matched by the
        expression. If "ALL": apply all dataset expression patterns to
        components. If "NONE": never apply patterns to components.
        Fully-specified component datasets are always included.

    Returns
    -------
    refs : `queries.DatasetQueryResults`
        Dataset references matching the given query criteria.
    """
    butler = Butler(repo)
    dataset = globToRegex(glob)

    if collections and not deduplicate:
        collections = globToRegex(collections)
    elif not collections:
        collections = ...

    if components == "ALL":
        components = True
    elif components == "NONE":
        components = False
    elif components == "UNMATCHED":
        components = None
    else:
        raise RuntimeError(f"Unrecognized value for components: {components}")

    if not dataset:
        dataset = ...
    return butler.registry.queryDatasets(datasetType=dataset,
                                         collections=collections,
                                         where=where,
                                         deduplicate=deduplicate,
                                         components=components)
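
The query_datasets command above reaches this function through cli_handle_exception, but it can also be called directly. A minimal sketch, assuming a repository at the placeholder path "repo" and that where=None (the value the CLI passes when --where is omitted) imposes no constraint::

    from lsst.daf.butler.script import queryDatasets

    # Equivalent to "butler query-datasets repo" with no other arguments:
    # empty glob and collections expressions match everything.
    refs = queryDatasets(repo="repo", glob=(), collections=(),
                         where=None, deduplicate=False, components="UNMATCHED")
    for ref in refs:
        print(ref.datasetType.name, ref.run, ref.id, dict(ref.dataId.items()))
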
175 changes: 175 additions & 0 deletions tests/test_cliCmdQueryDatasets.py
@@ -0,0 +1,175 @@
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Unit tests for daf_butler CLI query-collections command.
"""

import os
import unittest
import unittest.mock

from lsst.daf.butler import Butler, Datastore, FileDataset
from lsst.daf.butler.cli.butler import cli
from lsst.daf.butler.cli.utils import clickResultMsg, LogCliRunner


TESTDIR = os.path.abspath(os.path.dirname(__file__))


def _mock_export(refs, *,
                 directory=None,
                 transfer=None):
    """A mock of `Datastore.export` that satisfies the requirement that the
    refs passed in are included in the `FileDataset` objects returned.

    This can be used to construct a `Datastore` mock that can be used in
    repository export via::

        datastore = unittest.mock.Mock(spec=Datastore)
        datastore.export = _mock_export
    """
    for ref in refs:
        yield FileDataset(refs=[ref],
                          path="mock/path",
                          formatter="lsst.daf.butler.formatters.json.JsonFormatter")


def _mock_get(ref, parameters=None):
    """A mock of `Datastore.get` that just returns the integer dataset ID value
    and parameters it was given.
    """
    return (ref.id, parameters)


def _splitRows(text):
    """Transform a text table into a list of rows.

    Removes empty rows. Does not attempt to remove header or divider rows.

    Parameters
    ----------
    text : `str`
        Text that represents a text-based table.

    Returns
    -------
    `list` [`str`]
        The non-empty rows of the table.
    """
    return [line.strip() for line in text.splitlines() if line]


class QueryDatasetsScriptTest(unittest.TestCase):

    def setUp(self):
        self.repoName = "repo"
        self.runner = LogCliRunner()

    def makeTestRepo(self):
        # create the repo:
        result = self.runner.invoke(cli, ["create", self.repoName])
        self.assertEqual(result.exit_code, 0, clickResultMsg(result))

        # add datasets to the registry, mock the datastore:
        with unittest.mock.patch.object(Datastore, "fromConfig", spec=Datastore.fromConfig):
            butler = Butler(self.repoName, writeable=True)
            butler.datastore.export = _mock_export
            butler.datastore.get = _mock_get
            butler.import_(filename=os.path.join(TESTDIR, "data", "registry", "base.yaml"))
            butler.import_(filename=os.path.join(TESTDIR, "data", "registry", "datasets.yaml"))

    def testQueryDatasets(self):
        with self.runner.isolated_filesystem():
            self.makeTestRepo()

            # verify that query-datasets now returns expected output:
            result = self.runner.invoke(cli, ["query-datasets", self.repoName])
            self.assertEqual(result.exit_code, 0, clickResultMsg(result))
            expected = """type run id instrument detector physical_filter
---- ---------- --- ---------- -------- ---------------
flat imported_g 4 Cam1 2 Cam1-G
flat imported_g 5 Cam1 3 Cam1-G
flat imported_g 6 Cam1 4 Cam1-G
flat imported_r 10 Cam1 1 Cam1-R1
flat imported_r 11 Cam1 2 Cam1-R1
flat imported_r 12 Cam1 3 Cam1-R2
flat imported_r 13 Cam1 4 Cam1-R2
type run id instrument detector
---- ---------- --- ---------- --------
bias imported_g 1 Cam1 1
bias imported_g 2 Cam1 2
bias imported_g 3 Cam1 3
bias imported_r 7 Cam1 2
bias imported_r 8 Cam1 3
bias imported_r 9 Cam1 4"""
            self.assertEqual(_splitRows(expected), _splitRows(result.output))

            # verify the --where option
            result = self.runner.invoke(cli, ["query-datasets", self.repoName, "flat",
                                              "--where", "detector=2"])
            self.assertEqual(result.exit_code, 0, clickResultMsg(result))
            expected = """type run id instrument detector physical_filter
---- ---------- --- ---------- -------- ---------------
flat imported_g 4 Cam1 2 Cam1-G
flat imported_r 11 Cam1 2 Cam1-R1"""
            self.assertEqual(_splitRows(expected), _splitRows(result.output))

            # verify the --collections option
            result = self.runner.invoke(cli, ["query-datasets", self.repoName, "bias",
                                              "--collections", "imported_g,imported_r"])
            self.assertEqual(result.exit_code, 0, clickResultMsg(result))
            expected = """type run id instrument detector
---- ---------- --- ---------- --------
bias imported_g 1 Cam1 1
bias imported_g 2 Cam1 2
bias imported_g 3 Cam1 3
bias imported_r 7 Cam1 2
bias imported_r 8 Cam1 3
bias imported_r 9 Cam1 4"""
            self.assertEqual(_splitRows(expected), _splitRows(result.output))

            # verify the --deduplicate option
            result = self.runner.invoke(cli, ["query-datasets", self.repoName, "bias",
                                              "--collections", "imported_g,imported_r",
                                              "--deduplicate"])
            self.assertEqual(result.exit_code, 0, clickResultMsg(result))
            expected = """type run id instrument detector
---- ---------- --- ---------- --------
bias imported_g 1 Cam1 1
bias imported_g 2 Cam1 2
bias imported_g 3 Cam1 3
bias imported_r 9 Cam1 4"""
            self.assertEqual(_splitRows(expected), _splitRows(result.output))

            # verify the --components option
            result = self.runner.invoke(cli, ["query-datasets", self.repoName,
                                              "--components", "ALL"])
            self.assertEqual(result.exit_code, 0, clickResultMsg(result))
            # The output for ALL is pretty long; instead of testing for the
            # exact contents, just look for the expected number of rows
            # (including header and separator rows).
            self.assertEqual(len(_splitRows(result.output)), 306)


if __name__ == "__main__":
    unittest.main()
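
A note on the comparison strategy: _splitRows drops blank lines and strips each remaining line, so the triple-quoted expected tables above are insensitive to surrounding indentation, though internal column spacing must still match astropy's output exactly. A small, made-up illustration::

    text = """
    type run        id
    ---- ---------- --
    flat imported_g  4"""
    assert _splitRows(text) == ["type run        id",
                                "---- ---------- --",
                                "flat imported_g  4"]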
