openvax · iskandr · Feb 17, 2015 · Feb 17, 2015 · Feb 17, 2015 · tavinathanson
diff --git a/varcode/core_logic.py b/varcode/core_logic.py
@@ -37,7 +37,7 @@
     FrameShiftTruncation
 )
 
-from Bio.Seq import Seq
+from Bio.Seq import Seq, CodonTable
 from pyensembl.transcript import Transcript
 from pyensembl.biotypes import is_coding_biotype
 
@@ -83,6 +83,31 @@ def first_transcript_offset(start_pos, end_pos, transcript):
     return the start/end offsets of the variant relative to the
     chromosomal positions of the transcript.
     """
+    # ensure that start_pos:end_pos overlap with transcript positions
+    assert start_pos <= end_pos, \
+        "start_pos %d shouldn't be greater than end_pos %d" % (
+            start_pos, end_pos)
+    assert start_pos <= transcript.end, \
+        "Range %d:%d starts after transcript %s (%d:%d)" % (
+            start_pos,
+            end_pos,
+            transcript,
+            transcript.start,
+            transcript.end)
+    assert end_pos >= transcript.start, \
+        "Range %d:%d ends before transcript %s (%d:%d)" % (
+            start_pos,
+            end_pos,
+            transcript,
+            transcript.start,
+            transcript.end)
+    # trim the start position to the beginning of the transcript
+    if start_pos < transcript.start:
+        start_pos = transcript.start
+    # trim the end position to the end of the transcript
+    if end_pos > transcript.end:
+        end_pos = transcript.end
+
     # offsets into the spliced transcript
     offsets = [
         transcript.spliced_offset(start_pos),
@@ -157,7 +182,15 @@ def infer_coding_effect(
     # to amino acids. For the original CDS make sure that it starts with
     # a start codon and ends with a stop codon. Don't include the stop codon
     # in the translated sequence.
-    original_protein = str(Seq(cds_seq).translate(cds=True, to_stop=True))
+    try:
+        original_protein = str(Seq(cds_seq).translate(cds=True, to_stop=True))
+    except CodonTable.TranslationError as error:
+        # coding sequence may contain premature stop codon or may have
+        # an incorrectly annotated frame
+        logging.warning(
+            "Translation error in coding sequence for %s" % transcript)
+        logging.warning(error)
+        return IncompleteTranscript(variant, transcript)
 
     assert len(original_protein) > 0, \
         "Translated protein sequence of %s is empty" % (transcript,)

diff --git a/varcode/effect_ordering.py b/varcode/effect_ordering.py
@@ -52,10 +52,18 @@ def top_priority_transcript_effect(effects):
         elif priority == best_priority:
             best_effects.append(effect)
 
-    def effect_cds_len(effect):
-        """
-        Returns length of coding sequence of transcript associated with effect
-        """
-        return len(effect.transcript.coding_sequence)
-
-    return max(best_effects, key=effect_cds_len)
+    if any(effect.transcript.complete for effect in best_effects):
+        # if any transcripts have complete coding sequence annotations,
+        # filter the effects down to those that are complete and sort
+        # them by length of the coding sequence
+        best_effects = [
+            effect
+            for effect in best_effects
+            if effect.transcript.complete
+        ]
+        key_fn = lambda effect: len(effect.transcript.coding_sequence)
+    else:
+        # if effects are over incomplete transcripts, sort them by the
+        # their total transcript length
+        key_fn = lambda effect: len(effect.transcript)
+    return max(best_effects, key=key_fn)
diff --git a/varcode/variant_annotator.py b/varcode/variant_annotator.py
@@ -1,4 +1,5 @@
 from __future__ import print_function, division, absolute_import
+import logging
 
 from .common import group_by
 from .core_logic import infer_transcript_effect
@@ -11,7 +12,7 @@ class VariantAnnotator(object):
     def __init__(self, ensembl_release):
         self.ensembl = pyensembl.EnsemblRelease(ensembl_release)
 
-    def describe_variant(self, variant):
+    def describe_variant(self, variant, raise_on_error=True):
         """
         Determine the effects of a variant on any transcripts it overlaps.
         Returns a VariantEffect object.
@@ -22,32 +23,55 @@ def describe_variant(self, variant):
         ref = variant.ref
         alt = variant.alt
 
-        overlapping_genes = self.ensembl.genes_at_locus(contig, pos, end_pos)
+        overlapping_transcripts = self.ensembl.transcripts_at_locus(
+            contig, pos, end_pos)
 
-        if len(overlapping_genes) == 0:
-            return [], {}
-        else:
-            overlapping_transcripts = self.ensembl.transcripts_at_locus(
-                    contig, pos, end_pos)
-
-            assert len(overlapping_transcripts) > 0, \
-                "No transcripts found for mutation %s:%d %s>%s" % (
-                    contig, pos, ref, alt)
+        if len(overlapping_transcripts) == 0:
+            # intergenic variant
+            return VariantEffect(
+                variant=variant,
+                genes=[],
+                gene_transcript_effects={},
+                errors={})
 
         # group transcripts by their gene ID
         overlapping_transcript_groups = group_by(
             overlapping_transcripts, field_name='gene_id')
 
         # dictionary from gene ID to list of transcript effects
         gene_transcript_effects_groups = {}
+
+        # mapping from Transcript objects to errors encountered
+        # while trying to annotated them
+        errors = {}
+
         for (gene_id, transcripts) in overlapping_transcript_groups.items():
             effects = []
             for transcript in transcripts:
-                effect = infer_transcript_effect(variant, transcript)
-                effects.append(effect)
+                try:
+                    effects.append(infer_transcript_effect(variant, transcript))
+                except (AssertionError, ValueError) as error:
+                    if raise_on_error:
+                        raise
+                    else:
+                        logging.warn(
+                            "Encountered error annotating %s for %s: %s",
+                            variant,
+                            transcript,
+                            error)
+                    errors[transcript] = error
             gene_transcript_effects_groups[gene_id] = effects
 
+        # construct Gene objects for all the genes containing
+        # all the transcripts this variant overlaps
+        overlapping_genes = [
+            self.ensembl.gene_by_id(gene_id)
+            for gene_id
+            in overlapping_transcript_groups.keys()
+        ]
+
         return VariantEffect(
             variant=variant,
             genes=overlapping_genes,
-            gene_transcript_effects=gene_transcript_effects_groups)
+            gene_transcript_effects=gene_transcript_effects_groups,
+            errors=errors)
diff --git a/varcode/variant_collection.py b/varcode/variant_collection.py
@@ -82,7 +82,7 @@ def __init__(self, filename, drop_duplicates=True):
         elif filename.endswith(".maf"):
             self.raw_reference_name, self.records = _load_maf(filename)
         else:
-            raise ValueErrr("Unrecognized file type: %s" % (filename,))
+            raise ValueError("Unrecognized file type: %s" % (filename,))
 
 
         self.filename = filename
@@ -117,13 +117,21 @@ def __str__(self):
     def __repr__(self):
         return str(self)
 
-    def variant_effects(self):
+    def variant_effects(self, raise_on_error=True):
         """
         Determine the impact of each variant, return a list of
         VariantEffect objects.
+
+        Parameters
+        ----------
+
+        raise_on_error : bool, optional.
+            Raise exception if error is encountered while annotating
+            transcripts, otherwise track errors in VariantEffect.errors
+            dictionary (default=True).
         """
         return [
-            self.annot.describe_variant(variant)
+            self.annot.describe_variant(variant, raise_on_error=raise_on_error)
             for variant
             in self.records
         ]

diff --git a/varcode/variant_effect.py b/varcode/variant_effect.py
@@ -15,7 +15,8 @@ def __init__(
             self,
             variant,
             genes,
-            gene_transcript_effects):
+            gene_transcript_effects,
+            errors={}):
         """
         variant : Variant
 
@@ -24,6 +25,10 @@ def __init__(
 
         gene_transcript_effects : dict
             Dictionary from gene ID to list of transcript variant effects
+
+        errors : dict, optional
+            Mapping from transcript objects to strings explaining error which
+            was encountered while trying to annotate them.
         """
         self.variant = variant
         self.genes = genes
@@ -43,9 +48,11 @@ def __init__(
             highest_priority_class = self.highest_priority_effect.__class__
             self.variant_summary = highest_priority_class.__name__
         else:
-            self.highest_priority_effect = _
+            self.highest_priority_effect = None
             self.variant_summary = "Intergenic"
 
+        self.errors = errors
+
     @property
     def coding_genes(self):
         """
@@ -65,6 +72,8 @@ def __str__(self):
             ("genes", [gene.name for gene in self.genes]),
             ("transcript_effects", self.transcript_effects)
         ]
+        if self.errors:
+            fields.append( ("errors", self.errors) )
         return "VariantEffect(%s)" % (
             ", ".join(["%s=%s" % (k,v) for (k,v) in fields]))