Skip to content

Commit

Permalink
Merge branch 'tickets/DM-26685' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
n8pease committed Oct 21, 2020
2 parents c8cc49c + 70029ae commit b5a4b20
Show file tree
Hide file tree
Showing 5 changed files with 564 additions and 12 deletions.
5 changes: 4 additions & 1 deletion python/lsst/daf/butler/cli/cmd/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@
"prune_collection",
"query_collections",
"query_dataset_types",
"remove_dataset_type")
"query_datasets",
"remove_dataset_type",
)


from .commands import (butler_import,
Expand All @@ -36,5 +38,6 @@
prune_collection,
query_collections,
query_dataset_types,
query_datasets,
remove_dataset_type,
)
56 changes: 45 additions & 11 deletions python/lsst/daf/butler/cli/cmd/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@
from ..opt import (collection_type_option, dataset_type_option, directory_argument, options_file_option,
glob_argument, repo_argument, transfer_option, verbose_option)
from ..utils import cli_handle_exception, split_commas, typeStrAcceptsMultiple, unwrap
from ...script import (butlerImport, createRepo, configDump, configValidate, pruneCollection,
queryCollections, queryDatasetTypes, removeDatasetType)
from ... import script


willCreateRepoHelp = "REPO is the URI or path to the new repository. Will be created if it does not exist."
existingRepoHelp = "REPO is the URI or path to an existing data repository root or configuration file."
Expand All @@ -53,7 +53,7 @@
@options_file_option()
def butler_import(*args, **kwargs):
"""Import data into a butler repository."""
cli_handle_exception(butlerImport, *args, **kwargs)
cli_handle_exception(script.butlerImport, *args, **kwargs)


@click.command()
Expand All @@ -68,7 +68,7 @@ def butler_import(*args, **kwargs):
@options_file_option()
def create(*args, **kwargs):
"""Create an empty Gen3 Butler repository."""
cli_handle_exception(createRepo, *args, **kwargs)
cli_handle_exception(script.createRepo, *args, **kwargs)


@click.command(short_help="Dump butler config to stdout.")
Expand All @@ -85,7 +85,7 @@ def create(*args, **kwargs):
@options_file_option()
def config_dump(*args, **kwargs):
"""Dump either a subset or full Butler configuration to standard output."""
cli_handle_exception(configDump, *args, **kwargs)
cli_handle_exception(script.configDump, *args, **kwargs)


@click.command(short_help="Validate the configuration files.")
Expand All @@ -98,7 +98,7 @@ def config_dump(*args, **kwargs):
@options_file_option()
def config_validate(*args, **kwargs):
"""Validate the configuration files for a Gen3 Butler repository."""
is_good = cli_handle_exception(configValidate, *args, **kwargs)
is_good = cli_handle_exception(script.configValidate, *args, **kwargs)
if not is_good:
raise click.exceptions.Exit(1)

Expand All @@ -121,7 +121,7 @@ def config_validate(*args, **kwargs):
@options_file_option()
def prune_collection(**kwargs):
"""Remove a collection and possibly prune datasets within it."""
cli_handle_exception(pruneCollection, **kwargs)
cli_handle_exception(script.pruneCollection, **kwargs)


@click.command(short_help="Search for collections.")
Expand All @@ -141,7 +141,7 @@ def prune_collection(**kwargs):
@options_file_option()
def query_collections(*args, **kwargs):
"""Get the collections whose names match an expression."""
print(yaml.dump(cli_handle_exception(queryCollections, *args, **kwargs)))
print(yaml.dump(cli_handle_exception(script.queryCollections, *args, **kwargs)))


@click.command()
Expand All @@ -159,12 +159,46 @@ def query_collections(*args, **kwargs):
@options_file_option()
def query_dataset_types(*args, **kwargs):
"""Get the dataset types in a repository."""
print(yaml.dump(cli_handle_exception(queryDatasetTypes, *args, **kwargs), sort_keys=False))
print(yaml.dump(cli_handle_exception(script.queryDatasetTypes, *args, **kwargs), sort_keys=False))


@click.command()
@repo_argument(required=True)
@click.argument('dataset-type-name', nargs=1)
def remove_dataset_type(*args, **kwargs):
"""Remove a dataset type definition from a reopsitory."""
cli_handle_exception(removeDatasetType, *args, **kwargs)
"""Remove a dataset type definition from a repository."""
cli_handle_exception(script.removeDatasetType, *args, **kwargs)


@click.command()
@repo_argument(required=True)
@glob_argument(help="GLOB is one or more glob-style expressions that fully or partially identify the "
                    "dataset types to be queried.")
@click.option("--collections",
              help=unwrap("""One or more expressions that fully or partially identify the collections to
                          search for datasets. If not provided all datasets are returned."""),
              multiple=True,
              metavar=typeStrAcceptsMultiple,
              callback=split_commas)
@click.option("--where",
              help=unwrap("""A string expression similar to a SQL WHERE clause. May involve any column of a
                          dimension table or a dimension name as a shortcut for the primary key column of a
                          dimension table."""))
@click.option("--find-first",
              is_flag=True,
              help=unwrap("""For each result data ID, only yield one DatasetRef of each DatasetType, from the
                          first collection in which a dataset of that dataset type appears (according to the
                          order of 'collections' passed in). If used, 'collections' must specify at least one
                          expression and must not contain wildcards."""))
@click.option("--show-uri",
              is_flag=True,
              help="Show the dataset URI in results.")
@options_file_option()
def query_datasets(**kwargs):
    """List the datasets in a repository."""
    # script.queryDatasets returns one astropy table per dataset type;
    # cli_handle_exception translates expected failures into CLI-friendly
    # error output instead of a traceback.
    tables = cli_handle_exception(script.queryDatasets, **kwargs)

    # Print each table framed by blank lines; pprint_all disables astropy's
    # default row/column truncation so full results are shown.
    for table in tables:
        print("")
        table.pprint_all()
        print("")
1 change: 1 addition & 0 deletions python/lsst/daf/butler/script/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,5 +25,6 @@
from .configValidate import configValidate
from .pruneCollection import pruneCollection
from .queryCollections import queryCollections
from .queryDatasets import queryDatasets
from .queryDatasetTypes import queryDatasetTypes
from .removeDatasetType import removeDatasetType
158 changes: 158 additions & 0 deletions python/lsst/daf/butler/script/queryDatasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from astropy.table import Table as AstropyTable
from collections import defaultdict, namedtuple
from numpy import array

from .. import Butler
from ..core.utils import globToRegex

_RefInfo = namedtuple("RefInfo", "datasetRef uri")


class _Table:
    """Aggregate rows for a single dataset type and create an astropy table
    with the aggregated data.

    Rows are kept in a `set` of ``_RefInfo`` tuples, so duplicate rows are
    eliminated automatically. The constructor takes no arguments; column
    names are derived from the first row when ``getAstropyTable`` is called.
    """

    def __init__(self):
        # Set of _RefInfo(datasetRef, uri); a set so duplicates collapse.
        self.datasetRefs = set()

    def add(self, datasetRef, uri=None):
        """Add a row of information to the table.

        ``uri`` is optional but must be used consistently — provided or not —
        for every call on a given ``_Table`` instance, because the column
        layout is derived from a single arbitrary row in ``getAstropyTable``.

        Parameters
        ----------
        datasetRef : ``DatasetRef``
            A dataset ref that will be added as a row in the table.
        uri : ``ButlerURI``, optional
            The URI to show as a file location in the table, by default None.
        """
        if uri:
            # Store as str so the _RefInfo rows are hashable and comparable.
            uri = str(uri)
        self.datasetRefs.add(_RefInfo(datasetRef, uri))

    def getAstropyTable(self, datasetTypeName):
        """Get the table as an astropy table.

        Parameters
        ----------
        datasetTypeName : `str`
            The dataset type name to show in the ``type`` column of the table.

        Returns
        -------
        table : ``astropy.table.Table``
            The table with the derived column names and the aggregated rows.

        Raises
        ------
        RuntimeError
            If no dataset refs were ever added to this table.
        """
        # Should never happen; adding a dataset should be the action that
        # causes a _Table to be created.
        if not self.datasetRefs:
            raise RuntimeError(f"No DatasetRefs were provided for dataset type {datasetTypeName}")

        # Derive the column layout from one arbitrary row; all rows are
        # assumed to share the same dataId keys and uri presence (see add()).
        refInfo = next(iter(self.datasetRefs))
        columnNames = ["type", "run", "id",
                       *[str(item) for item in refInfo.datasetRef.dataId.keys()]]
        if refInfo.uri:
            columnNames.append("URI")

        rows = []
        # Sorted for deterministic output ordering.
        for refInfo in sorted(self.datasetRefs):
            row = [datasetTypeName,
                   refInfo.datasetRef.run,
                   refInfo.datasetRef.id,
                   *[str(value) for value in refInfo.datasetRef.dataId.values()]]
            if refInfo.uri:
                row.append(refInfo.uri)
            rows.append(row)

        return AstropyTable(array(rows), names=columnNames)


def queryDatasets(repo, glob, collections, where, find_first, show_uri):
    """Get dataset refs from a repository.

    Parameters
    ----------
    repo : `str`
        URI to the location of the repo or URI to a config file describing the
        repo and its location.
    glob : iterable [`str`]
        A list of glob-style search strings that fully or partially identify
        the dataset type names to search for.
    collections : iterable [`str`]
        A list of glob-style search strings that fully or partially identify
        the collections to search for.
    where : `str`
        A string expression similar to a SQL WHERE clause. May involve any
        column of a dimension table or (as a shortcut for the primary key
        column of a dimension table) dimension name.
    find_first : `bool`
        For each result data ID, only yield one DatasetRef of each DatasetType,
        from the first collection in which a dataset of that dataset type
        appears (according to the order of `collections` passed in). If used,
        `collections` must specify at least one expression and must not contain
        wildcards.
    show_uri : `bool`
        If True, include the dataset URI in the output.

    Returns
    -------
    datasetTables : `list` [``astropy.table.Table``]
        A list of astropy tables, one for each dataset type.
    """
    butler = Butler(repo)

    # Empty glob list means "all dataset types"; Ellipsis is the registry's
    # match-everything sentinel.
    dataset = globToRegex(glob)
    if not dataset:
        dataset = ...

    # find-first searches require explicit, non-wildcard collection names
    # (see the find_first parameter docs), so only convert collection globs
    # to regexes when not doing a find-first search.
    if collections and not find_first:
        collections = globToRegex(collections)
    elif not collections:
        collections = ...

    datasets = butler.registry.queryDatasets(datasetType=dataset,
                                             collections=collections,
                                             where=where,
                                             deduplicate=find_first)

    # One _Table per dataset type name; _Table de-duplicates rows.
    tables = defaultdict(_Table)

    for datasetRef in datasets:
        if not show_uri:
            tables[datasetRef.datasetType.name].add(datasetRef)
        else:
            primaryURI, componentURIs = butler.getURIs(datasetRef, collections=datasetRef.run)
            if primaryURI:
                tables[datasetRef.datasetType.name].add(datasetRef, primaryURI)
            # Components are listed in their own tables, keyed by the
            # composite's component dataset type name.
            for name, uri in componentURIs.items():
                tables[datasetRef.datasetType.componentTypeName(name)].add(datasetRef, uri)

    return [table.getAstropyTable(datasetTypeName) for datasetTypeName, table in tables.items()]

0 comments on commit b5a4b20

Please sign in to comment.