Write compound (segmented) sequence locations

Given a SeqFeqture with a CompoundLocation we now correctly write out the CDS/gene using segmented coordinates. Auspice can now handle such coordinates (see <nextstrain/auspice#1684> and <#1281> for the corresponding schema updates). Note that the translations (via augur translate) of complex CDSs did not need modifying as they already used BioPython's SeqFeature.extract method. Supersedes #1333
nextstrain · Mar 16, 2024 · e099711 · e099711
1 parent 4dd23c5
commit e099711
Show file tree

Hide file tree

Showing 4 changed files with 62 additions and 28 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -12,12 +12,14 @@
 * filter: Updated docs with an example of tiered subsampling. [#1425][] (@victorlin)
 * export: Fixes bug [#1433] introduced in v23.1.0, that causes validation to fail when gene names start with `nuc`, e.g. `nucleocapsid`. [#1434][] (@corneliusroemer)
 * import: Fixes bug introduced in v24.2.0 that prevented `import beast` from running. [#1439][] (@tomkinsc)
+* translate, ancestral: Compound CDS are now exported as segmented CDS and are now viewable in Auspice.  [#1438][] (@jameshadfield)
 
 [#1425]: https://github.com/nextstrain/augur/pull/1425
 [#1429]: https://github.com/nextstrain/augur/pull/1429
 [#1433]: https://github.com/nextstrain/augur/issues/1433
 [#1434]: https://github.com/nextstrain/augur/pull/1434
 [#1436]: https://github.com/nextstrain/augur/pull/1436
+[#1438]: https://github.com/nextstrain/augur/pull/1438
 [#1439]: https://github.com/nextstrain/augur/pull/1439
 
 ## 24.2.3 (23 February 2024)

diff --git a/augur/ancestral.py b/augur/ancestral.py
@@ -28,7 +28,8 @@
 from Bio import SeqIO
 from Bio.Seq import Seq
 from Bio.SeqRecord import SeqRecord
-from .utils import parse_genes_argument, read_tree, InvalidTreeError, write_json, get_json_name
+from .utils import parse_genes_argument, read_tree, InvalidTreeError, write_json, get_json_name, \
+    genome_features_to_auspice_annotation
 from .io.file import open_file
 from .io.vcf import is_vcf as is_filename_vcf
 from treetime.vcf_utils import read_vcf, write_vcf
@@ -455,14 +456,7 @@ def run(args):
                     node["aa_sequences"][gene] = aa_result['tt'].sequence(T.root, as_string=True, reconstructed=True)
 
             anc_seqs['reference'][gene] = aa_result['root_seq']
-            # FIXME: Note that this is calculating the end of the CDS as 3*length of translation
-            # this is equivalent to the annotation for single segment CDS, but not for cds
-            # with splicing and slippage. But auspice can't handle the latter at the moment.
-            anc_seqs['annotations'][gene] = {'seqid':args.annotation,
-                                             'type':feat.type,
-                                             'start': int(feat.location.start)+1,
-                                             'end': int(feat.location.start) + 3*len(anc_seqs['reference'][gene]),
-                                             'strand': {+1:'+', -1:'-', 0:'?', None:None}[feat.location.strand]}
+            anc_seqs['annotations'].update(genome_features_to_auspice_annotation({gene: feat}, args.annotation))
 
             # Save ancestral amino acid sequences to FASTA.
             if args.output_translations:

diff --git a/augur/translate.py b/augur/translate.py
@@ -17,7 +17,8 @@
 import numpy as np
 from Bio import SeqIO, Seq, SeqRecord, Phylo
 from .io.vcf import write_VCF_translation, is_vcf as is_filename_vcf
-from .utils import parse_genes_argument, read_node_data, load_features, write_json, get_json_name
+from .utils import parse_genes_argument, read_node_data, load_features, \
+    write_json, get_json_name, genome_features_to_auspice_annotation
 from treetime.vcf_utils import read_vcf
 from augur.errors import AugurError
 from textwrap import dedent
@@ -446,24 +447,7 @@ def run(args):
                 continue
             reference_translations[fname] = safe_translate(str(feat.extract(reference)))
 
-    ## glob the annotations for later auspice export
-    #
-    # Note that BioPython FeatureLocations use
-    # "Pythonic" coordinates: [zero-origin, half-open)
-    # Starting with augur v6 we use GFF coordinates: [one-origin, inclusive]
-    annotations = {
-        'nuc': {'start': features['nuc'].location.start+1,
-                'end':   features['nuc'].location.end,
-                'strand': '+',
-                'type':  features['nuc'].type,     # (unused by auspice)
-                'seqid': args.reference_sequence}  # (unused by auspice)
-    }
-    for fname, feat in features.items():
-        annotations[fname] = {'seqid':args.reference_sequence,
-                              'type':feat.type,
-                              'start':int(feat.location.start)+1,
-                              'end':int(feat.location.end),
-                              'strand': {+1:'+', -1:'-', 0:'?', None:None}[feat.location.strand]}
+    annotations = genome_features_to_auspice_annotation(features, args.reference_sequence, assert_nuc=True)
 
     ## determine amino acid mutations for each node
     try:

diff --git a/augur/utils.py b/augur/utils.py
@@ -832,3 +832,57 @@ def _get_genes_from_file(fname):
     print("Read in {} specified genes to translate.".format(len(unique_genes)))
 
     return unique_genes
+
+
+
+def genome_features_to_auspice_annotation(features, ref_seq_name=None, assert_nuc=False):
+    """
+    Parameters
+    ----------
+    features : dict
+        keys: feature names, values: Bio.SeqFeature.SeqFeature objects
+    ref_seq_name : str (optional)
+        Exported as the `seqid` for each feature. Note this is unused by Auspice
+    assert_nuc : bool (optional)
+        If true, one of the feature key names must be "nuc"
+
+    Returns
+    -------
+    annotations: dict
+        See schema-annotations.json for the schema this conforms to
+
+    """
+    from Bio.SeqFeature import SimpleLocation, CompoundLocation
+
+    if assert_nuc and 'nuc' not in features:
+        raise AugurError("Genome features must include a feature for 'nuc'")
+
+    def _parse(feat):
+        a = {}
+        # Note that BioPython locations use "Pythonic" coordinates: [zero-origin, half-open)
+        # Starting with augur v6 we use GFF coordinates: [one-origin, inclusive]
+        if type(feat.location)==SimpleLocation:
+            a['start'] = int(feat.location.start)+1
+            a['end'] = int(feat.location.end)
+        elif type(feat.location)==CompoundLocation:
+            a['segments'] = [
+                {'start':int(segment.start)+1, 'end':int(segment.end)}
+                for segment in feat.location.parts # segment: SimpleLocation
+            ]
+        else:
+            raise AugurError(f"Encountered a genome feature with an unknown location type {type(feat.location):q}")
+        a['strand'] = {+1:'+', -1:'-', 0:'?', None:None}[feat.location.strand]
+        a['type'] = feat.type  # (unused by auspice)
+        if ref_seq_name:
+            a['seqid'] = ref_seq_name # (unused by auspice)
+        return a
+
+    annotations = {}
+    for fname, feat in features.items():
+        annotations[fname] = _parse(feat)
+        if fname=='nuc':
+            assert annotations['nuc']['strand'] == '+', "Nuc feature must be +ve strand"
+        elif annotations[fname]['strand'] not in ['+', '-']:
+            print("WARNING: Feature {fname:q} uses a strand which auspice cannot display")
+
+    return annotations