Skip to content

Commit

Permalink
Write compound (segmented) sequence locations
Browse files Browse the repository at this point in the history
Given a SeqFeqture with a CompoundLocation we now correctly write out
the CDS/gene using segmented coordinates. Auspice can now handle such
coordinates (see <nextstrain/auspice#1684> and
<#1281> for the corresponding
schema updates).

Note that the translations (via augur translate) of complex CDSs did not
need modifying as they already used BioPython's SeqFeature.extract
method.

Supersedes #1333
  • Loading branch information
jameshadfield committed Mar 16, 2024
1 parent 4dd23c5 commit e099711
Show file tree
Hide file tree
Showing 4 changed files with 62 additions and 28 deletions.
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,14 @@
* filter: Updated docs with an example of tiered subsampling. [#1425][] (@victorlin)
* export: Fixes bug [#1433] introduced in v23.1.0, that causes validation to fail when gene names start with `nuc`, e.g. `nucleocapsid`. [#1434][] (@corneliusroemer)
* import: Fixes bug introduced in v24.2.0 that prevented `import beast` from running. [#1439][] (@tomkinsc)
* translate, ancestral: Compound CDS are now exported as segmented CDS and are now viewable in Auspice. [#1438][] (@jameshadfield)

[#1425]: https://github.com/nextstrain/augur/pull/1425
[#1429]: https://github.com/nextstrain/augur/pull/1429
[#1433]: https://github.com/nextstrain/augur/issues/1433
[#1434]: https://github.com/nextstrain/augur/pull/1434
[#1436]: https://github.com/nextstrain/augur/pull/1436
[#1438]: https://github.com/nextstrain/augur/pull/1438
[#1439]: https://github.com/nextstrain/augur/pull/1439

## 24.2.3 (23 February 2024)
Expand Down
12 changes: 3 additions & 9 deletions augur/ancestral.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from .utils import parse_genes_argument, read_tree, InvalidTreeError, write_json, get_json_name
from .utils import parse_genes_argument, read_tree, InvalidTreeError, write_json, get_json_name, \
genome_features_to_auspice_annotation
from .io.file import open_file
from .io.vcf import is_vcf as is_filename_vcf
from treetime.vcf_utils import read_vcf, write_vcf
Expand Down Expand Up @@ -455,14 +456,7 @@ def run(args):
node["aa_sequences"][gene] = aa_result['tt'].sequence(T.root, as_string=True, reconstructed=True)

anc_seqs['reference'][gene] = aa_result['root_seq']
# FIXME: Note that this is calculating the end of the CDS as 3*length of translation
# this is equivalent to the annotation for single segment CDS, but not for cds
# with splicing and slippage. But auspice can't handle the latter at the moment.
anc_seqs['annotations'][gene] = {'seqid':args.annotation,
'type':feat.type,
'start': int(feat.location.start)+1,
'end': int(feat.location.start) + 3*len(anc_seqs['reference'][gene]),
'strand': {+1:'+', -1:'-', 0:'?', None:None}[feat.location.strand]}
anc_seqs['annotations'].update(genome_features_to_auspice_annotation({gene: feat}, args.annotation))

# Save ancestral amino acid sequences to FASTA.
if args.output_translations:
Expand Down
22 changes: 3 additions & 19 deletions augur/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
import numpy as np
from Bio import SeqIO, Seq, SeqRecord, Phylo
from .io.vcf import write_VCF_translation, is_vcf as is_filename_vcf
from .utils import parse_genes_argument, read_node_data, load_features, write_json, get_json_name
from .utils import parse_genes_argument, read_node_data, load_features, \
write_json, get_json_name, genome_features_to_auspice_annotation
from treetime.vcf_utils import read_vcf
from augur.errors import AugurError
from textwrap import dedent
Expand Down Expand Up @@ -446,24 +447,7 @@ def run(args):
continue
reference_translations[fname] = safe_translate(str(feat.extract(reference)))

## glob the annotations for later auspice export
#
# Note that BioPython FeatureLocations use
# "Pythonic" coordinates: [zero-origin, half-open)
# Starting with augur v6 we use GFF coordinates: [one-origin, inclusive]
annotations = {
'nuc': {'start': features['nuc'].location.start+1,
'end': features['nuc'].location.end,
'strand': '+',
'type': features['nuc'].type, # (unused by auspice)
'seqid': args.reference_sequence} # (unused by auspice)
}
for fname, feat in features.items():
annotations[fname] = {'seqid':args.reference_sequence,
'type':feat.type,
'start':int(feat.location.start)+1,
'end':int(feat.location.end),
'strand': {+1:'+', -1:'-', 0:'?', None:None}[feat.location.strand]}
annotations = genome_features_to_auspice_annotation(features, args.reference_sequence, assert_nuc=True)

## determine amino acid mutations for each node
try:
Expand Down
54 changes: 54 additions & 0 deletions augur/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -832,3 +832,57 @@ def _get_genes_from_file(fname):
print("Read in {} specified genes to translate.".format(len(unique_genes)))

return unique_genes



def genome_features_to_auspice_annotation(features, ref_seq_name=None, assert_nuc=False):
"""
Parameters
----------
features : dict
keys: feature names, values: Bio.SeqFeature.SeqFeature objects
ref_seq_name : str (optional)
Exported as the `seqid` for each feature. Note this is unused by Auspice
assert_nuc : bool (optional)
If true, one of the feature key names must be "nuc"
Returns
-------
annotations: dict
See schema-annotations.json for the schema this conforms to
"""
from Bio.SeqFeature import SimpleLocation, CompoundLocation

if assert_nuc and 'nuc' not in features:
raise AugurError("Genome features must include a feature for 'nuc'")

def _parse(feat):
a = {}
# Note that BioPython locations use "Pythonic" coordinates: [zero-origin, half-open)
# Starting with augur v6 we use GFF coordinates: [one-origin, inclusive]
if type(feat.location)==SimpleLocation:
a['start'] = int(feat.location.start)+1
a['end'] = int(feat.location.end)
elif type(feat.location)==CompoundLocation:
a['segments'] = [
{'start':int(segment.start)+1, 'end':int(segment.end)}
for segment in feat.location.parts # segment: SimpleLocation
]
else:
raise AugurError(f"Encountered a genome feature with an unknown location type {type(feat.location):q}")
a['strand'] = {+1:'+', -1:'-', 0:'?', None:None}[feat.location.strand]
a['type'] = feat.type # (unused by auspice)
if ref_seq_name:
a['seqid'] = ref_seq_name # (unused by auspice)
return a

annotations = {}
for fname, feat in features.items():
annotations[fname] = _parse(feat)
if fname=='nuc':
assert annotations['nuc']['strand'] == '+', "Nuc feature must be +ve strand"
elif annotations[fname]['strand'] not in ['+', '-']:
print("WARNING: Feature {fname:q} uses a strand which auspice cannot display")

return annotations

0 comments on commit e099711

Please sign in to comment.