Skip to content

Commit

Permalink
Merge pull request #1355 from nextstrain/james/vcf
Browse files Browse the repository at this point in the history
Fix VCF parsing bugs
  • Loading branch information
jameshadfield committed Jan 22, 2024
2 parents 9cd915a + aa0b7d8 commit 957e42e
Show file tree
Hide file tree
Showing 33 changed files with 897 additions and 85,008 deletions.
25 changes: 24 additions & 1 deletion CHANGES.md
Expand Up @@ -4,11 +4,17 @@

### Major Changes

* ancestral, translate: GenBank files now require the (GFF mandatory) source feature to be present.[#1351][] (@jameshadfield)
* ancestral, translate: For VCF inputs please ensure you are using TreeTime 0.11.2 or later. A large number of bugfixes and improvements have been added in both Augur and TreeTime. [#1355][] and [TreeTime #263][] (@jameshadfield)
* ancestral, translate: GenBank files now require the (GFF mandatory) source feature to be present. [#1351][] (@jameshadfield)
* ancestral, translate: For GFF files, we extract the genome/sequence coordinates by inspecting the sequence-region pragma, region type and/or source type. This information is now required. [#1351][] (@jameshadfield)

### Features

* ancestral, translate: Improvements to VCF inputs / outputs. [#1355][] and [TreeTime #263][] (@jameshadfield)
* Output VCF will better match the input VCF, including CHROM name and ploidy encoding.
* VCF inputs now require `--vcf-reference-output`
* AA sequences are now exported for the tree root
* VCF writing is now 3 orders of magnitude faster (dataset dependent)
* Added a new file [DEPRECATED.md](./DEPRECATED.md) to document timelines and progress of deprecated features in the Augur CLI and Python API. [#1371][] (@victorlin)
* ancestral, translate: A range of improvements to how we parse GFF and GenBank reference files. [#1351][] (@jameshadfield)
* translate will now always export a 'nuc' annotation in the output JSON, allowing it to pass validation
Expand All @@ -24,6 +30,19 @@

### Bug Fixes

* ancestral, translate: Various fixes to VCF inputs / outputs. [#1355][] and [TreeTime #263][] (@jameshadfield)
* Fix incorrect (but passing) tests
* Fix case-sensitive sequence comparisons between the root and reference sequences.
* Fix a bug where ambiguous alleles are not inferred (see [#1380][] for full details).
* Fix a bug where positions with no sequence information were assigned a base because the mask was not being computed (see [#1382][] for full details).
* More than one ALT allele is now correctly parsed
* Mutations followed by an insertion are now parsed
* Unchanged ref genotypes are now encoded as '0' rather than '.'
* ALT alleles "*" are now valid (introduced in VCF spec 4.2, but observed in VCF 4.1 files)
* Positions with no variation are no longer exported
* ancestral, translate: Fixes for JSON (non-VCF) inputs. [#1355][] (@jameshadfield)
* The "reference" translations are now from the provided reference sequence, not from the root of the tree. [#1355][] (@jameshadfield)
* Fix a bug where positions with no sequence information were assigned a base because the mask was not applied (see [#1382][] for full details)
* translate: The 'source' ID for GFF files is now ignored as a potential gene feature (it is still used for overall nuc coords). [#1348][] (@jameshadfield)
* translate: Improvements to command line arguments. [#1348][] (@jameshadfield)
* `--tree` and `--ancestral-sequences` are now required arguments.
Expand All @@ -46,7 +65,11 @@
[#1379]: https://github.com/nextstrain/augur/pull/1379
[#1352]: https://github.com/nextstrain/augur/pull/1352
[#1353]: https://github.com/nextstrain/augur/pull/1353
[#1355]: https://github.com/nextstrain/augur/pull/1355
[#1380]: https://github.com/nextstrain/augur/issues/1380
[#1382]: https://github.com/nextstrain/augur/issues/1382
[#1387]: https://github.com/nextstrain/augur/pull/1387
[TreeTime #263]: https://github.com/neherlab/treetime/pull/263

## 23.1.1 (7 November 2023)

Expand Down
192 changes: 146 additions & 46 deletions augur/ancestral.py
Expand Up @@ -29,6 +29,7 @@
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from .utils import parse_genes_argument, read_tree, InvalidTreeError, write_json, get_json_name
from .io.vcf import is_vcf as is_filename_vcf
from treetime.vcf_utils import read_vcf, write_vcf
from collections import defaultdict

Expand Down Expand Up @@ -84,63 +85,150 @@ def ancestral_sequence_inference(tree=None, aln=None, ref=None, infer_gtr=True,

return tt

def collect_mutations_and_sequences(tt, infer_tips=False, full_sequences=False, character_map=None, is_vcf=False):
def create_mask(is_vcf, tt, reference_sequence, aln):
"""
Identify sites for which every terminal sequence is ambiguous. These sites
will be masked to prevent rounding errors in the maximum likelihood
inference from assigning an arbitrary nucleotide to sites at internal nodes.
Parameters
----------
is_vcf : bool
tt : treetime.TreeTime
instance of treetime with valid ancestral reconstruction. Unused if is_vcf.
reference_sequence : str
only used if is_vcf
aln : dict
describes variation (relative to reference) per sample. Only used if is_vcf.
Returns
-------
numpy.ndarray(bool)
"""
num_tips = len(tt.tree.get_terminals())
ambiguous_count = np.zeros(tt.sequence_length, dtype=int)
if is_vcf:
variable_sites = set()
# VCF ambiguous positions come in two forms:
# Firstly, if VCF defines a "N" ALT and assigns it to every sample:
for sample_data in aln.values():
ambig_positions = []
for pos, alt in sample_data.items():
variable_sites.add(pos)
if alt=='N':
ambig_positions.append(pos)
# ambig_positions = [pos for pos, alt in sample_data.items() if alt=='N']
np.add.at(ambiguous_count, ambig_positions, 1)
# Secondly, if the VCF defines no mutations but the ref is "N":
no_info_sites = np.array(list(reference_sequence)) == 'N'
no_info_sites[list(variable_sites)] = False
# and the mask is the union of these two forms
mask = ambiguous_count==num_tips
mask[no_info_sites] = True
else:
for n in tt.tree.get_terminals():
ambiguous_count += np.array(tt.sequence(n,reconstructed=False, as_string=False)==tt.gtr.ambiguous, dtype=int)
mask = ambiguous_count==num_tips
return mask

def collect_mutations(tt, mask, character_map=None, reference_sequence=None, infer_ambiguous=False):
"""iterates of the tree and produces dictionaries with
mutations and sequences for each node.
If a reference sequence is provided then mutations can be collected for the
root node. Masked positions at the root-node will be treated specially: if
we infer ambiguity, then we report no mutations (i.e. we assume the
reference base holds), otherwise we'll report a mutation from the <ref> to
"N".
Parameters
----------
tt : treetime.TreeTime
instance of treetime with valid ancestral reconstruction
infer_tips : bool, optional
if true, request the reconstructed tip sequences from treetime, otherwise retain input ambiguities
full_sequences : bool, optional
if true, add the full sequences
mask : numpy.ndarray(bool)
character_map : None, optional
optional dictionary to map characters to a custom set.
reference_sequence : str, optional
Returns
-------
dict
dictionary of mutations and sequences
dict -> <node_name> -> [mut, mut, ...] where mut is a string in the form
<from><1-based-pos><to>
"""

if character_map is None:
cm = lambda x:x
else:
cm = lambda x: character_map.get(x, x)

data = defaultdict(dict)
data = {}
inc = 1 # convert python numbering to start-at-1

# Note that for mutations reported across the tree we don't have to consider
# the mask, because while sites which are all Ns may have an inferred base,
# there will be no variablity and thus no mutations.
for n in tt.tree.find_clades():
data[n.name]['muts'] = [a+str(int(pos)+inc)+cm(d)
for a,pos,d in n.mutations]
data[n.name] = [a+str(int(pos)+inc)+cm(d)
for a,pos,d in n.mutations]

if reference_sequence:
data[tt.tree.root.name] = []
for pos, (root_state, tree_state) in enumerate(zip(reference_sequence, tt.sequence(tt.tree.root, reconstructed=infer_ambiguous, as_string=True))):
if mask[pos] and infer_ambiguous:
continue
if root_state != tree_state:
data[tt.tree.root.name].append(f"{root_state}{pos+1}{tree_state}")

if is_vcf:
mask = np.zeros(tt.sequence_length, dtype=bool)
else:
# Identify sites for which every terminal sequence is ambiguous.
# These sites will be masked to prevent rounding errors in the
# maximum likelihood inference from assigning an arbitrary
# nucleotide to sites at internal nodes.
ambiguous_count = np.zeros(tt.sequence_length, dtype=int)
for n in tt.tree.get_terminals():
ambiguous_count += np.array(tt.sequence(n,reconstructed=False, as_string=False)==tt.gtr.ambiguous, dtype=int)
mask = ambiguous_count==len(tt.tree.get_terminals())
return data

def collect_sequences(tt, mask, reference_sequence=None, infer_ambiguous=False):
"""
Create a full sequence for every node on the tree. Masked positions will
have the reference base if we are inferring ambiguity, or the ambiguous
character 'N'.
if full_sequences:
for n in tt.tree.find_clades():
try:
tmp = tt.sequence(n,reconstructed=infer_tips, as_string=False)
tmp[mask] = tt.gtr.ambiguous
data[n.name]['sequence'] = "".join(tmp)
except:
print("No sequence available for node ",n.name)
Parameters
----------
tt : treetime.TreeTime
instance of treetime with valid ancestral reconstruction
mask : numpy.ndarray(bool)
Mask these positions by changing them to the ambiguous nucleotide
reference_sequence : str or None
infer_ambiguous : bool, optional
if true, request the reconstructed sequences from treetime, otherwise retain input ambiguities
Returns
-------
dict
dict -> <node_name> -> sequence_string
"""
sequences = {}

return {"nodes": data, "mask": mask}
ref_mask = None

def run_ancestral(T, aln, root_sequence=None, is_vcf=False, full_sequences=False, fill_overhangs=False,
if reference_sequence and infer_ambiguous:
ref_mask = np.array(list(reference_sequence))[mask]

for n in tt.tree.find_clades():
try:
tmp = tt.sequence(n,reconstructed=infer_ambiguous, as_string=False)
if ref_mask is not None:
tmp[mask] = ref_mask
else:
tmp[mask] = tt.gtr.ambiguous
sequences[n.name] = "".join(tmp)
except:
print("No sequence available for node ",n.name)
return sequences

def run_ancestral(T, aln, reference_sequence=None, is_vcf=False, full_sequences=False, fill_overhangs=False,
infer_ambiguous=False, marginal=False, alphabet='nuc'):
tt = ancestral_sequence_inference(tree=T, aln=aln, ref=root_sequence if is_vcf else None, marginal=marginal,
"""
ancestral nucleotide reconstruction using TreeTime
"""

tt = ancestral_sequence_inference(tree=T, aln=aln, ref=reference_sequence if is_vcf else None, marginal=marginal,
fill_overhangs = fill_overhangs, alphabet=alphabet,
infer_tips = infer_ambiguous)

Expand All @@ -154,21 +242,30 @@ def run_ancestral(T, aln, root_sequence=None, is_vcf=False, full_sequences=False
character_map[x] = x
# add reference sequence to json structure. This is the sequence with
# respect to which mutations on the tree are defined.
if root_sequence:
root_seq = root_sequence
if reference_sequence:
root_seq = reference_sequence
else:
root_seq = tt.sequence(T.root, as_string=True)

mutations = collect_mutations_and_sequences(tt, full_sequences=full_sequences,
infer_tips=infer_ambiguous, character_map=character_map, is_vcf=is_vcf)
if root_sequence:
for pos, (root_state, tree_state) in enumerate(zip(root_sequence, tt.sequence(tt.tree.root, reconstructed=infer_ambiguous, as_string=True))):
if root_state != tree_state:
mutations['nodes'][tt.tree.root.name]['muts'].append(f"{root_state}{pos+1}{tree_state}")
mask = create_mask(is_vcf, tt, reference_sequence, aln)
mutations = collect_mutations(tt, mask, character_map, reference_sequence, infer_ambiguous)
sequences = {}
if full_sequences:
sequences = collect_sequences(tt, mask, reference_sequence, infer_ambiguous)

# Combine the mutations & sequences into a single dict which downstream code
# expects
nodes = defaultdict(dict)
for n in tt.tree.find_clades():
name = n.name
if name in mutations:
nodes[name]['muts'] = mutations[name]
if name in sequences:
nodes[name]['sequence'] = sequences[name]

return {'tt': tt,
'root_seq': root_seq,
'mutations': mutations}
'mutations': {"nodes": nodes, "mask": mask}}


def register_parser(parent_subparsers):
Expand Down Expand Up @@ -255,7 +352,7 @@ def validate_arguments(args, is_vcf):

def run(args):
# check alignment type, set flags, read in if VCF
is_vcf = any([args.alignment.lower().endswith(x) for x in ['.vcf', '.vcf.gz']])
is_vcf = is_filename_vcf(args.alignment)
ref = None
validate_arguments(args, is_vcf)

Expand Down Expand Up @@ -283,13 +380,14 @@ def run(args):
compress_seq = read_vcf(args.alignment, args.vcf_reference)
aln = compress_seq['sequences']
ref = compress_seq['reference']
vcf_metadata = compress_seq['metadata']
else:
aln = args.alignment
ref = None
if args.root_sequence:
for fmt in ['fasta', 'genbank']:
try:
ref = str(SeqIO.read(args.root_sequence, fmt).seq)
ref = str(SeqIO.read(args.root_sequence, fmt).seq).upper()
break
except:
pass
Expand All @@ -308,7 +406,7 @@ def run(args):
# we keep them.
infer_ambiguous = args.infer_ambiguous and not args.keep_ambiguous
full_sequences = not is_vcf
nuc_result = run_ancestral(T, aln, root_sequence=ref if ref else None, is_vcf=is_vcf, fill_overhangs=not args.keep_overhangs,
nuc_result = run_ancestral(T, aln, reference_sequence=ref if ref else None, is_vcf=is_vcf, fill_overhangs=not args.keep_overhangs,
full_sequences=full_sequences, marginal=args.inference, infer_ambiguous=infer_ambiguous, alphabet='nuc')
anc_seqs = nuc_result['mutations']
anc_seqs['reference'] = {'nuc': nuc_result['root_seq']}
Expand Down Expand Up @@ -336,9 +434,9 @@ def run(args):
print(f"Processing gene: {gene}")
fname = args.translations.replace("%GENE", gene)
feat = features[gene]
root_seq = str(feat.extract(Seq(ref)).translate()) if ref else None
reference_sequence = str(feat.extract(Seq(ref)).translate()) if ref else None

aa_result = run_ancestral(T, fname, root_sequence=root_seq, is_vcf=is_vcf, fill_overhangs=not args.keep_overhangs,
aa_result = run_ancestral(T, fname, reference_sequence=reference_sequence, is_vcf=is_vcf, fill_overhangs=not args.keep_overhangs,
marginal=args.inference, infer_ambiguous=infer_ambiguous, alphabet='aa')
if aa_result['tt'].data.full_length*3 != len(feat):
raise AugurError(f"length of translated alignment for {gene} does not match length of reference feature."
Expand Down Expand Up @@ -387,7 +485,9 @@ def run(args):
# output VCF including new ancestral seqs
if args.output_vcf:
assert is_vcf
write_vcf(nuc_result['tt'].get_tree_dict(keep_var_ambigs=True), args.output_vcf)
tree_dict = nuc_result['tt'].get_tree_dict(keep_var_ambigs=not infer_ambiguous)
tree_dict['metadata'] = vcf_metadata
write_vcf(tree_dict, args.output_vcf, anc_seqs['mask'])
print("Mutations, including for ancestral nodes, exported as VCF to", args.output_vcf, file=sys.stdout)

return 0

0 comments on commit 957e42e

Please sign in to comment.