Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[augur ancestral] arg parsing improvements #1344

Merged
merged 4 commits into from
Dec 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,14 @@

## __NEXT__

### Features

* ancestral: For VCF alignments, a VCF output file is now only created when requested via `--output-vcf`. [#1344][] (@jameshadfield)
* ancestral: Improvements to command line arguments. [#1344][] (@jameshadfield)
* Incompatible arguments are now checked, especially related to VCF vs FASTA inputs.
* `--vcf-reference` and `--root-sequence` are now mutually exclusive.

[#1344]: https://github.com/nextstrain/augur/pull/1344

## 23.1.1 (7 November 2023)

Expand Down
86 changes: 48 additions & 38 deletions augur/ancestral.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,9 +180,13 @@ def register_parser(parent_subparsers):
)
input_group.add_argument('--tree', '-t', required=True, help="prebuilt Newick")
input_group.add_argument('--alignment', '-a', help="alignment in FASTA or VCF format")
input_group.add_argument('--vcf-reference', type=str, help='FASTA file of the sequence the VCF was mapped to (only used if a VCF is provided as the alignment)')
input_group.add_argument('--root-sequence', type=str, help='FASTA/genbank file of the sequence that is used as root for mutation calling.'
' Differences between this sequence and the inferred root will be reported as mutations on the root branch.')
input_group_ref = input_group.add_mutually_exclusive_group()
input_group_ref.add_argument('--vcf-reference', type=str, metavar='FASTA',
help='[VCF alignment only] file of the sequence the VCF was mapped to.'
' Differences between this sequence and the inferred root will be reported as mutations on the root branch.')
input_group_ref.add_argument('--root-sequence', type=str,metavar='FASTA/GenBank',
help='[FASTA alignment only] file of the sequence that is used as root for mutation calling.'
' Differences between this sequence and the inferred root will be reported as mutations on the root branch.')

global_options_group = parser.add_argument_group(
"global options",
Expand Down Expand Up @@ -228,21 +232,40 @@ def register_parser(parent_subparsers):

return parser

def run(args):
# Validate arguments.
def validate_arguments(args, is_vcf):
"""
Check that provided arguments are compatible.
Where possible we use argparse built-ins, but they don't cover everything we want to check.
This checking shouldn't be used by downstream code to assume arguments exist, however by checking for
invalid combinations up-front we can exit quickly.
"""
aa_arguments = (args.annotation, args.genes, args.translations)
if any(aa_arguments) and not all(aa_arguments):
raise AugurError("For amino acid sequence reconstruction, you must provide an annotation file, a list of genes, and a template path to amino acid sequences.")

if args.output_sequences and args.output_vcf:
raise AugurError("Both sequence (fasta) and VCF output have been requested, but these are incompatible.")

if is_vcf and args.output_sequences:
raise AugurError("Sequence (fasta) output has been requested but the input alignment is VCF.")

if not is_vcf and args.output_vcf:
raise AugurError("VCF output has been requested but the input alignment is not VCF.")


def run(args):
# check alignment type, set flags, read in if VCF
is_vcf = any([args.alignment.lower().endswith(x) for x in ['.vcf', '.vcf.gz']])
ref = None
validate_arguments(args, is_vcf)

try:
T = read_tree(args.tree)
except (FileNotFoundError, InvalidTreeError) as error:
print("ERROR: %s" % error, file=sys.stderr)
return 1
except FileNotFoundError:
raise AugurError(f"The provided tree file {args.tree!r} doesn't exist")
except InvalidTreeError as error:
raise AugurError(error)
# Note that a number of other errors may be thrown by `read_tree` such as Bio.Phylo.NewickIO.NewickError

import numpy as np
missing_internal_node_names = [n.name is None for n in T.get_nonterminals()]
Expand All @@ -256,9 +279,7 @@ def run(args):

if is_vcf:
if not args.vcf_reference:
print("ERROR: a reference Fasta is required with VCF-format alignments", file=sys.stderr)
return 1

raise AugurError("a reference Fasta is required with VCF-format alignments")
compress_seq = read_vcf(args.alignment, args.vcf_reference)
aln = compress_seq['sequences']
ref = compress_seq['reference']
Expand All @@ -273,8 +294,7 @@ def run(args):
except:
pass
if ref is None:
print(f"ERROR: could not read root sequence from {args.root_sequence}", file=sys.stderr)
return 1
raise AugurError(f"could not read root sequence from {args.root_sequence}")

import treetime
print("\nInferred ancestral sequence states using TreeTime:"
Expand Down Expand Up @@ -304,8 +324,7 @@ def run(args):
## load features; only requested features if genes given
features = load_features(args.annotation, args.genes)
if features is None:
print("ERROR: could not read features of reference sequence file")
return 1
raise AugurError("could not read features of reference sequence file")
print("Read in {} features from reference sequence file".format(len(features)))
for gene in args.genes:
print(f"Processing gene: {gene}")
Expand All @@ -316,9 +335,8 @@ def run(args):
aa_result = run_ancestral(T, fname, root_sequence=root_seq, is_vcf=is_vcf, fill_overhangs=not args.keep_overhangs,
marginal=args.inference, infer_ambiguous=infer_ambiguous, alphabet='aa')
if aa_result['tt'].data.full_length*3 != len(feat):
print(f"ERROR: length of translated alignment for {gene} does not match length of reference feature."
raise AugurError(f"length of translated alignment for {gene} does not match length of reference feature."
" Please make sure that the annotation matches the translations.")
return 1

for key, node in anc_seqs['nodes'].items():
if 'aa_muts' not in node: node['aa_muts'] = {}
Expand Down Expand Up @@ -352,26 +370,18 @@ def run(args):
print("ancestral mutations written to", out_name, file=sys.stdout)

if args.output_sequences:
if args.output_vcf:
# TODO: This should be an error and we should check for this
# unsupported combination of arguments at the beginning of the
# script to avoid wasting time for users.
print("WARNING: augur only supports sequence output for FASTA alignments and not for VCFs.", file=sys.stderr)
else:
records = [
SeqRecord(Seq(node_data["sequence"]), id=node_name, description="")
for node_name, node_data in anc_seqs["nodes"].items()
]
SeqIO.write(records, args.output_sequences, "fasta")
print("ancestral sequences FASTA written to", args.output_sequences, file=sys.stdout)

# If VCF, output VCF including new ancestral seqs
if is_vcf:
if args.output_vcf:
vcf_fname = args.output_vcf
else:
vcf_fname = '.'.join(args.alignment.split('.')[:-1]) + '.vcf'
write_vcf(nuc_result['tt'].get_tree_dict(keep_var_ambigs=True), vcf_fname)
print("ancestral sequences as vcf-file written to",vcf_fname, file=sys.stdout)
assert not is_vcf
records = [
SeqRecord(Seq(node_data["sequence"]), id=node_name, description="")
for node_name, node_data in anc_seqs["nodes"].items()
]
SeqIO.write(records, args.output_sequences, "fasta")
print("ancestral sequences FASTA written to", args.output_sequences, file=sys.stdout)

# output VCF including new ancestral seqs
if args.output_vcf:
assert is_vcf
write_vcf(nuc_result['tt'].get_tree_dict(keep_var_ambigs=True), args.output_vcf)
print("Mutations, including for ancestral nodes, exported as VCF to", args.output_vcf, file=sys.stdout)

return 0

This file was deleted.

68 changes: 68 additions & 0 deletions tests/functional/ancestral/cram/invalid-args.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
Setup

$ source "$TESTDIR"/_setup.sh

Input FASTA + VCF output is not possible

$ ${AUGUR} ancestral \
> --tree $TESTDIR/../data/tree.nwk \
> --alignment $TESTDIR/../data/aligned.fasta \
> --output-vcf "output.vcf" > /dev/null
ERROR: VCF output has been requested but the input alignment is not VCF.
[2]

Input VCF + FASTA output is not possible (Note that the input file doesn't exist, but we exit before that's checked)

$ ${AUGUR} ancestral \
> --tree $TESTDIR/../data/tree.nwk \
> --alignment $TESTDIR/../data/snps.vcf \
> --output-sequences "output.fasta" > /dev/null
ERROR: Sequence (fasta) output has been requested but the input alignment is VCF.
[2]

Output FASTA _and_ VCF is not possible

$ ${AUGUR} ancestral \
> --tree $TESTDIR/../data/tree.nwk \
> --alignment $TESTDIR/../data/aligned.fasta \
> --output-vcf "output.vcf" \
> --output-sequences "output.fasta" > /dev/null
ERROR: Both sequence (fasta) and VCF output have been requested, but these are incompatible.
[2]


Try to infer ancestral amino acid sequences without all required arguments.
This should fail.

$ ${AUGUR} ancestral \
> --tree $TESTDIR/../data/tree.nwk \
> --alignment $TESTDIR/../data/aligned.fasta \
> --annotation $TESTDIR/../data/zika_outgroup.gb \
> --genes ENV PRO \
> --output-node-data "ancestral_mutations.json" > /dev/null
ERROR: For amino acid sequence reconstruction, you must provide an annotation file, a list of genes, and a template path to amino acid sequences.
[2]

Missing tree file

$ ${AUGUR} ancestral \
> --tree $TESTDIR/../data/tree-doesnt-exist.nwk \
> --alignment $TESTDIR/../data/aligned.fasta \
> --output-sequences "output.fasta" > /dev/null
ERROR: The provided tree file .* doesn't exist (re)
[2]


Attempting to use FASTA-input reference and VCF-input reference args
(The files here don't exist, but we exit before they're checked)

$ ${AUGUR} ancestral \
> --tree $TESTDIR/../data/tree-doesnt-exist.nwk \
> --alignment $TESTDIR/../data/aligned.fasta \
> --root-sequence $TESTDIR/../data/reference.fasta \
> --vcf-reference $TESTDIR/../data/reference.fasta \
> --output-sequences "output.fasta" > /dev/null 2>"err-args.txt"
[2]

$ grep "augur ancestral: error: argument --vcf-reference: not allowed with argument --root-sequence" "err-args.txt"
augur ancestral: error: argument --vcf-reference: not allowed with argument --root-sequence