Skip to content

Commit

Permalink
Merge pull request #1240: Allow selection of metadata ID column
Browse files Browse the repository at this point in the history
  • Loading branch information
victorlin committed Jun 14, 2023
2 parents 1e9a0e7 + 0b64f5e commit cfb5255
Show file tree
Hide file tree
Showing 7 changed files with 39 additions and 12 deletions.
5 changes: 5 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@

## __NEXT__

### Features

* export, frequencies, refine, traits: Add a new flag `--metadata-id-columns` to specify the candidate metadata ID columns, ordered by priority (only one ID column is inferred). Previously, this option was only available in `augur filter`. [#1240][] (@victorlin)

[#1240]: https://github.com/nextstrain/augur/pull/1240

## 22.0.3 (14 June 2023)

Expand Down
9 changes: 7 additions & 2 deletions augur/export_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from collections import defaultdict
from .errors import AugurError
from .argparse_ import ExtendAction
from .io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_metadata
from .io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, InvalidDelimiter, read_metadata
from .utils import read_node_data, write_json, read_config, read_lat_longs, read_colors

def convert_tree_to_json_structure(node, metadata, div=0, strains=None):
Expand Down Expand Up @@ -315,6 +315,8 @@ def add_core_args(parser):
core.add_argument('--metadata', required=True, metavar="FILE", help="sequence metadata")
core.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+",
help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.")
core.add_argument('--metadata-id-columns', default=DEFAULT_ID_COLUMNS, nargs="+",
help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.")
core.add_argument('--node-data', required=True, nargs='+', action=ExtendAction, help="JSON files with meta data for each node")
core.add_argument('--output-tree', help="JSON file name that is passed on to auspice (e.g., zika_tree.json).")
core.add_argument('--output-meta', help="JSON file name that is passed on to auspice (e.g., zika_meta.json).")
Expand Down Expand Up @@ -368,7 +370,10 @@ def run(args):
meta_json = read_config(args.auspice_config)
ensure_config_is_v1(meta_json)
try:
meta_tsv = read_metadata(args.metadata, args.metadata_delimiters)
meta_tsv = read_metadata(
args.metadata,
delimiters=args.metadata_delimiters,
id_columns=args.metadata_id_columns)
except InvalidDelimiter:
raise AugurError(
f"Could not determine the delimiter of {args.metadata!r}. "
Expand Down
9 changes: 7 additions & 2 deletions augur/export_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from .argparse_ import ExtendAction
from .errors import AugurError
from .io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_metadata
from .io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, InvalidDelimiter, read_metadata
from .types import ValidationMode
from .utils import read_node_data, write_json, read_config, read_lat_longs, read_colors
from .validate import export_v2 as validate_v2, auspice_config_v2 as validate_auspice_config_v2, ValidateError
Expand Down Expand Up @@ -852,6 +852,8 @@ def register_parser(parent_subparsers):
optional_inputs.add_argument('--metadata', metavar="FILE", help="Additional metadata for strains in the tree")
optional_inputs.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+",
help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.")
optional_inputs.add_argument('--metadata-id-columns', default=DEFAULT_ID_COLUMNS, nargs="+",
help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.")
optional_inputs.add_argument('--colors', metavar="FILE", help="Custom color definitions, one per line in the format `TRAIT_TYPE\\tTRAIT_VALUE\\tHEX_CODE`")
optional_inputs.add_argument('--lat-longs', metavar="TSV", help="Latitudes and longitudes for geography traits (overrides built in mappings)")

Expand Down Expand Up @@ -1072,7 +1074,10 @@ def run(args):

if args.metadata is not None:
try:
metadata_file = read_metadata(args.metadata, args.metadata_delimiters).to_dict(orient="index")
metadata_file = read_metadata(
args.metadata,
delimiters=args.metadata_delimiters,
id_columns=args.metadata_id_columns).to_dict(orient="index")
for strain in metadata_file.keys():
if "strain" not in metadata_file[strain]:
metadata_file[strain]["strain"] = strain
Expand Down
4 changes: 2 additions & 2 deletions augur/filter/_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ def run(args):
try:
metadata_reader = read_metadata(
args.metadata,
args.metadata_delimiters,
delimiters=args.metadata_delimiters,
id_columns=args.metadata_id_columns,
chunk_size=args.metadata_chunk_size,
)
Expand Down Expand Up @@ -317,7 +317,7 @@ def run(args):
# have passed filters.
metadata_reader = read_metadata(
args.metadata,
args.metadata_delimiters,
delimiters=args.metadata_delimiters,
id_columns=args.metadata_id_columns,
chunk_size=args.metadata_chunk_size,
)
Expand Down
6 changes: 4 additions & 2 deletions augur/frequencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from .frequency_estimators import get_pivots, alignment_frequencies, tree_frequencies
from .frequency_estimators import AlignmentKdeFrequencies, TreeKdeFrequencies, TreeKdeFrequenciesError
from .dates import numeric_date_type, SUPPORTED_DATE_HELP_TEXT, get_numerical_dates
from .io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_metadata
from .io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, InvalidDelimiter, read_metadata
from .utils import read_node_data, write_json


Expand All @@ -24,6 +24,8 @@ def register_parser(parent_subparsers):
help="metadata including dates for given samples")
parser.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+",
help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.")
parser.add_argument('--metadata-id-columns', default=DEFAULT_ID_COLUMNS, nargs="+",
help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.")
parser.add_argument('--regions', type=str, nargs='+', default=['global'],
help="region to subsample to")
parser.add_argument("--pivot-interval", type=int, default=3,
Expand Down Expand Up @@ -84,7 +86,7 @@ def format_frequencies(freq):

def run(args):
try:
metadata = read_metadata(args.metadata, args.metadata_delimiters)
metadata = read_metadata(args.metadata, delimiters=args.metadata_delimiters, id_columns=args.metadata_id_columns)
except InvalidDelimiter:
raise AugurError(
f"Could not determine the delimiter of {args.metadata!r}. "
Expand Down
9 changes: 7 additions & 2 deletions augur/refine.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from Bio import Phylo
from .dates import get_numerical_dates
from .dates.errors import InvalidYearBounds
from .io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_metadata
from .io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, InvalidDelimiter, read_metadata
from .utils import read_tree, write_json, InvalidTreeError
from .errors import AugurError
from treetime.vcf_utils import read_vcf
Expand Down Expand Up @@ -102,6 +102,8 @@ def register_parser(parent_subparsers):
parser.add_argument('--metadata', type=str, metavar="FILE", help="sequence metadata")
parser.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+",
help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.")
parser.add_argument('--metadata-id-columns', default=DEFAULT_ID_COLUMNS, nargs="+",
help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.")
parser.add_argument('--output-tree', type=str, help='file name to write tree to')
parser.add_argument('--output-node-data', type=str, help='file name to write branch lengths as node data')
parser.add_argument('--use-fft', action="store_true", help="produce timetree using FFT for convolutions")
Expand Down Expand Up @@ -212,7 +214,10 @@ def run(args):
print("ERROR: meta data with dates is required for time tree reconstruction", file=sys.stderr)
return 1
try:
metadata = read_metadata(args.metadata, args.metadata_delimiters)
metadata = read_metadata(
args.metadata,
delimiters=args.metadata_delimiters,
id_columns=args.metadata_id_columns)
except InvalidDelimiter:
raise AugurError(
f"Could not determine the delimiter of {args.metadata!r}. "
Expand Down
9 changes: 7 additions & 2 deletions augur/traits.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import os, sys
import pandas as pd
from .errors import AugurError
from .io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_metadata
from .io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, InvalidDelimiter, read_metadata
from .utils import write_json, get_json_name
TINY = 1e-12

Expand Down Expand Up @@ -104,6 +104,8 @@ def register_parser(parent_subparsers):
parser.add_argument('--metadata', required=True, metavar="FILE", help="table with metadata")
parser.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+",
help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.")
parser.add_argument('--metadata-id-columns', default=DEFAULT_ID_COLUMNS, nargs="+",
help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.")
parser.add_argument('--weights', required=False, help="tsv/csv table with equilibrium probabilities of discrete states")
parser.add_argument('--columns', required=True, nargs='+',
help='metadata fields to perform discrete reconstruction on')
Expand All @@ -130,7 +132,10 @@ def run(args):
"""
tree_fname = args.tree
try:
traits = read_metadata(args.metadata, args.metadata_delimiters)
traits = read_metadata(
args.metadata,
delimiters=args.metadata_delimiters,
id_columns=args.metadata_id_columns)
except InvalidDelimiter:
raise AugurError(
f"Could not determine the delimiter of {args.metadata!r}. "
Expand Down

0 comments on commit cfb5255

Please sign in to comment.