Skip to content

Commit

Permalink
Merge branch 'tickets/DM-26685' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
n8pease committed Oct 21, 2020
2 parents c8cc49c + 70029ae commit b5a4b20
Show file tree
Hide file tree
Showing 5 changed files with 564 additions and 12 deletions.
5 changes: 4 additions & 1 deletion python/lsst/daf/butler/cli/cmd/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@
"prune_collection",
"query_collections",
"query_dataset_types",
"remove_dataset_type")
"query_datasets",
"remove_dataset_type",
)


from .commands import (butler_import,
Expand All @@ -36,5 +38,6 @@
prune_collection,
query_collections,
query_dataset_types,
query_datasets,
remove_dataset_type,
)
56 changes: 45 additions & 11 deletions python/lsst/daf/butler/cli/cmd/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@
from ..opt import (collection_type_option, dataset_type_option, directory_argument, options_file_option,
glob_argument, repo_argument, transfer_option, verbose_option)
from ..utils import cli_handle_exception, split_commas, typeStrAcceptsMultiple, unwrap
from ...script import (butlerImport, createRepo, configDump, configValidate, pruneCollection,
queryCollections, queryDatasetTypes, removeDatasetType)
from ... import script


willCreateRepoHelp = "REPO is the URI or path to the new repository. Will be created if it does not exist."
existingRepoHelp = "REPO is the URI or path to an existing data repository root or configuration file."
Expand All @@ -53,7 +53,7 @@
@options_file_option()
def butler_import(*args, **kwargs):
"""Import data into a butler repository."""
cli_handle_exception(butlerImport, *args, **kwargs)
cli_handle_exception(script.butlerImport, *args, **kwargs)


@click.command()
Expand All @@ -68,7 +68,7 @@ def butler_import(*args, **kwargs):
@options_file_option()
def create(*args, **kwargs):
"""Create an empty Gen3 Butler repository."""
cli_handle_exception(createRepo, *args, **kwargs)
cli_handle_exception(script.createRepo, *args, **kwargs)


@click.command(short_help="Dump butler config to stdout.")
Expand All @@ -85,7 +85,7 @@ def create(*args, **kwargs):
@options_file_option()
def config_dump(*args, **kwargs):
"""Dump either a subset or full Butler configuration to standard output."""
cli_handle_exception(configDump, *args, **kwargs)
cli_handle_exception(script.configDump, *args, **kwargs)


@click.command(short_help="Validate the configuration files.")
Expand All @@ -98,7 +98,7 @@ def config_dump(*args, **kwargs):
@options_file_option()
def config_validate(*args, **kwargs):
"""Validate the configuration files for a Gen3 Butler repository."""
is_good = cli_handle_exception(configValidate, *args, **kwargs)
is_good = cli_handle_exception(script.configValidate, *args, **kwargs)
if not is_good:
raise click.exceptions.Exit(1)

Expand All @@ -121,7 +121,7 @@ def config_validate(*args, **kwargs):
@options_file_option()
def prune_collection(**kwargs):
"""Remove a collection and possibly prune datasets within it."""
cli_handle_exception(pruneCollection, **kwargs)
cli_handle_exception(script.pruneCollection, **kwargs)


@click.command(short_help="Search for collections.")
Expand All @@ -141,7 +141,7 @@ def prune_collection(**kwargs):
@options_file_option()
def query_collections(*args, **kwargs):
"""Get the collections whose names match an expression."""
print(yaml.dump(cli_handle_exception(queryCollections, *args, **kwargs)))
print(yaml.dump(cli_handle_exception(script.queryCollections, *args, **kwargs)))


@click.command()
Expand All @@ -159,12 +159,46 @@ def query_collections(*args, **kwargs):
@options_file_option()
def query_dataset_types(*args, **kwargs):
"""Get the dataset types in a repository."""
print(yaml.dump(cli_handle_exception(queryDatasetTypes, *args, **kwargs), sort_keys=False))
print(yaml.dump(cli_handle_exception(script.queryDatasetTypes, *args, **kwargs), sort_keys=False))


@click.command()
@repo_argument(required=True)
@click.argument('dataset-type-name', nargs=1)
def remove_dataset_type(*args, **kwargs):
"""Remove a dataset type definition from a reopsitory."""
cli_handle_exception(removeDatasetType, *args, **kwargs)
"""Remove a dataset type definition from a repository."""
cli_handle_exception(script.removeDatasetType, *args, **kwargs)


@click.command()
@repo_argument(required=True)
@glob_argument(help="GLOB is one or more glob-style expressions that fully or partially identify the "
                    "dataset types to be queried.")
@click.option("--collections",
              help=unwrap("""One or more expressions that fully or partially identify the collections to
                          search for datasets. If not provided all datasets are returned."""),
              multiple=True,
              metavar=typeStrAcceptsMultiple,
              callback=split_commas)
@click.option("--where",
              help=unwrap("""A string expression similar to a SQL WHERE clause. May involve any column of a
                          dimension table or a dimension name as a shortcut for the primary key column of a
                          dimension table."""))
@click.option("--find-first",
              is_flag=True,
              help=unwrap("""For each result data ID, only yield one DatasetRef of each DatasetType, from the
                          first collection in which a dataset of that dataset type appears (according to the
                          order of 'collections' passed in). If used, 'collections' must specify at least one
                          expression and must not contain wildcards."""))
@click.option("--show-uri",
              is_flag=True,
              help="Show the dataset URI in results.")
@options_file_option()
def query_datasets(**kwargs):
    """List the datasets in a repository."""
    # script.queryDatasets returns one astropy table per dataset type;
    # cli_handle_exception translates expected failures into CLI-friendly
    # error output instead of a traceback.
    tables = cli_handle_exception(script.queryDatasets, **kwargs)

    # Print each table framed by blank lines; pprint_all disables astropy's
    # default row/column truncation so full results are shown.
    for table in tables:
        print("")
        table.pprint_all()
        print("")
1 change: 1 addition & 0 deletions python/lsst/daf/butler/script/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,5 +25,6 @@
from .configValidate import configValidate
from .pruneCollection import pruneCollection
from .queryCollections import queryCollections
from .queryDatasets import queryDatasets
from .queryDatasetTypes import queryDatasetTypes
from .removeDatasetType import removeDatasetType
158 changes: 158 additions & 0 deletions python/lsst/daf/butler/script/queryDatasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from astropy.table import Table as AstropyTable
from collections import defaultdict, namedtuple
from numpy import array

from .. import Butler
from ..core.utils import globToRegex

_RefInfo = namedtuple("RefInfo", "datasetRef uri")


class _Table:
    """Aggregate rows for a single dataset type and create an astropy table
    with the aggregated data.

    Rows are kept in a `set` of ``_RefInfo`` tuples, so duplicate rows are
    eliminated automatically. The constructor takes no arguments; column
    names are derived from the first row when ``getAstropyTable`` is called.
    """

    def __init__(self):
        # Set of _RefInfo(datasetRef, uri); a set so duplicates collapse.
        self.datasetRefs = set()

    def add(self, datasetRef, uri=None):
        """Add a row of information to the table.

        ``uri`` is optional but must be used consistently — provided or not —
        for every call on a given ``_Table`` instance, because the column
        layout is derived from a single arbitrary row in ``getAstropyTable``.

        Parameters
        ----------
        datasetRef : ``DatasetRef``
            A dataset ref that will be added as a row in the table.
        uri : ``ButlerURI``, optional
            The URI to show as a file location in the table, by default None.
        """
        if uri:
            # Store as str so the _RefInfo rows are hashable and comparable.
            uri = str(uri)
        self.datasetRefs.add(_RefInfo(datasetRef, uri))

    def getAstropyTable(self, datasetTypeName):
        """Get the table as an astropy table.

        Parameters
        ----------
        datasetTypeName : `str`
            The dataset type name to show in the ``type`` column of the table.

        Returns
        -------
        table : ``astropy.table.Table``
            The table with the derived column names and the aggregated rows.

        Raises
        ------
        RuntimeError
            If no dataset refs were ever added to this table.
        """
        # Should never happen; adding a dataset should be the action that
        # causes a _Table to be created.
        if not self.datasetRefs:
            raise RuntimeError(f"No DatasetRefs were provided for dataset type {datasetTypeName}")

        # Derive the column layout from one arbitrary row; all rows are
        # assumed to share the same dataId keys and uri presence (see add()).
        refInfo = next(iter(self.datasetRefs))
        columnNames = ["type", "run", "id",
                       *[str(item) for item in refInfo.datasetRef.dataId.keys()]]
        if refInfo.uri:
            columnNames.append("URI")

        rows = []
        # Sorted for deterministic output ordering.
        for refInfo in sorted(self.datasetRefs):
            row = [datasetTypeName,
                   refInfo.datasetRef.run,
                   refInfo.datasetRef.id,
                   *[str(value) for value in refInfo.datasetRef.dataId.values()]]
            if refInfo.uri:
                row.append(refInfo.uri)
            rows.append(row)

        return AstropyTable(array(rows), names=columnNames)


def queryDatasets(repo, glob, collections, where, find_first, show_uri):
    """Get dataset refs from a repository.

    Parameters
    ----------
    repo : `str`
        URI to the location of the repo or URI to a config file describing the
        repo and its location.
    glob : iterable [`str`]
        A list of glob-style search strings that fully or partially identify
        the dataset type names to search for.
    collections : iterable [`str`]
        A list of glob-style search strings that fully or partially identify
        the collections to search for.
    where : `str`
        A string expression similar to a SQL WHERE clause. May involve any
        column of a dimension table or (as a shortcut for the primary key
        column of a dimension table) dimension name.
    find_first : `bool`
        For each result data ID, only yield one DatasetRef of each DatasetType,
        from the first collection in which a dataset of that dataset type
        appears (according to the order of `collections` passed in). If used,
        `collections` must specify at least one expression and must not contain
        wildcards.
    show_uri : `bool`
        If True, include the dataset URI in the output.

    Returns
    -------
    datasetTables : `list` [``astropy.table.Table``]
        A list of astropy tables, one for each dataset type.
    """
    butler = Butler(repo)

    # Empty glob list means "all dataset types"; Ellipsis is the registry's
    # match-everything sentinel.
    dataset = globToRegex(glob)
    if not dataset:
        dataset = ...

    # find-first searches require explicit, non-wildcard collection names
    # (see the find_first parameter docs), so only convert collection globs
    # to regexes when not doing a find-first search.
    if collections and not find_first:
        collections = globToRegex(collections)
    elif not collections:
        collections = ...

    datasets = butler.registry.queryDatasets(datasetType=dataset,
                                             collections=collections,
                                             where=where,
                                             deduplicate=find_first)

    # One _Table per dataset type name; _Table de-duplicates rows.
    tables = defaultdict(_Table)

    for datasetRef in datasets:
        if not show_uri:
            tables[datasetRef.datasetType.name].add(datasetRef)
        else:
            primaryURI, componentURIs = butler.getURIs(datasetRef, collections=datasetRef.run)
            if primaryURI:
                tables[datasetRef.datasetType.name].add(datasetRef, primaryURI)
            # Components are listed in their own tables, keyed by the
            # composite's component dataset type name.
            for name, uri in componentURIs.items():
                tables[datasetRef.datasetType.componentTypeName(name)].add(datasetRef, uri)

    return [table.getAstropyTable(datasetTypeName) for datasetTypeName, table in tables.items()]

0 comments on commit b5a4b20

Please sign in to comment.