diff --git a/varcode/core_logic.py b/varcode/core_logic.py index d616128..05a6066 100644 --- a/varcode/core_logic.py +++ b/varcode/core_logic.py @@ -37,7 +37,7 @@ FrameShiftTruncation ) -from Bio.Seq import Seq +from Bio.Seq import Seq, CodonTable from pyensembl.transcript import Transcript from pyensembl.biotypes import is_coding_biotype @@ -83,6 +83,31 @@ def first_transcript_offset(start_pos, end_pos, transcript): return the start/end offsets of the variant relative to the chromosomal positions of the transcript. """ + # ensure that start_pos:end_pos overlap with transcript positions + assert start_pos <= end_pos, \ + "start_pos %d shouldn't be greater than end_pos %d" % ( + start_pos, end_pos) + assert start_pos <= transcript.end, \ + "Range %d:%d starts after transcript %s (%d:%d)" % ( + start_pos, + end_pos, + transcript, + transcript.start, + transcript.end) + assert end_pos >= transcript.start, \ + "Range %d:%d ends before transcript %s (%d:%d)" % ( + start_pos, + end_pos, + transcript, + transcript.start, + transcript.end) + # trim the start position to the beginning of the transcript + if start_pos < transcript.start: + start_pos = transcript.start + # trim the end position to the end of the transcript + if end_pos > transcript.end: + end_pos = transcript.end + # offsets into the spliced transcript offsets = [ transcript.spliced_offset(start_pos), @@ -157,7 +182,15 @@ def infer_coding_effect( # to amino acids. For the original CDS make sure that it starts with # a start codon and ends with a stop codon. Don't include the stop codon # in the translated sequence. - original_protein = str(Seq(cds_seq).translate(cds=True, to_stop=True)) + try: + original_protein = str(Seq(cds_seq).translate(cds=True, to_stop=True)) + except CodonTable.TranslationError as error: + # coding sequence may contain premature stop codon or may have + # an incorrectly annotated frame + logging.warning( + "Translation error in coding sequence for %s" % transcript) + logging.warning(error) + return IncompleteTranscript(variant, transcript) assert len(original_protein) > 0, \ "Translated protein sequence of %s is empty" % (transcript,) diff --git a/varcode/effect_ordering.py b/varcode/effect_ordering.py index fb451d0..61877e8 100644 --- a/varcode/effect_ordering.py +++ b/varcode/effect_ordering.py @@ -52,10 +52,18 @@ def top_priority_transcript_effect(effects): elif priority == best_priority: best_effects.append(effect) - def effect_cds_len(effect): - """ - Returns length of coding sequence of transcript associated with effect - """ - return len(effect.transcript.coding_sequence) - - return max(best_effects, key=effect_cds_len) \ No newline at end of file + if any(effect.transcript.complete for effect in best_effects): + # if any transcripts have complete coding sequence annotations, + # filter the effects down to those that are complete and sort + # them by length of the coding sequence + best_effects = [ + effect + for effect in best_effects + if effect.transcript.complete + ] + key_fn = lambda effect: len(effect.transcript.coding_sequence) + else: + # if effects are over incomplete transcripts, sort them by the + # their total transcript length + key_fn = lambda effect: len(effect.transcript) + return max(best_effects, key=key_fn) diff --git a/varcode/variant_annotator.py b/varcode/variant_annotator.py index fbaf845..3f5b995 100644 --- a/varcode/variant_annotator.py +++ b/varcode/variant_annotator.py @@ -1,4 +1,5 @@ from __future__ import print_function, division, absolute_import +import logging from .common import group_by from .core_logic import infer_transcript_effect @@ -11,7 +12,7 @@ class VariantAnnotator(object): def __init__(self, ensembl_release): self.ensembl = pyensembl.EnsemblRelease(ensembl_release) - def describe_variant(self, variant): + def describe_variant(self, variant, raise_on_error=True): """ Determine the effects of a variant on any transcripts it overlaps. Returns a VariantEffect object. @@ -22,17 +23,16 @@ def describe_variant(self, variant): ref = variant.ref alt = variant.alt - overlapping_genes = self.ensembl.genes_at_locus(contig, pos, end_pos) + overlapping_transcripts = self.ensembl.transcripts_at_locus( + contig, pos, end_pos) - if len(overlapping_genes) == 0: - return [], {} - else: - overlapping_transcripts = self.ensembl.transcripts_at_locus( - contig, pos, end_pos) - - assert len(overlapping_transcripts) > 0, \ - "No transcripts found for mutation %s:%d %s>%s" % ( - contig, pos, ref, alt) + if len(overlapping_transcripts) == 0: + # intergenic variant + return VariantEffect( + variant=variant, + genes=[], + gene_transcript_effects={}, + errors={}) # group transcripts by their gene ID overlapping_transcript_groups = group_by( @@ -40,14 +40,38 @@ def describe_variant(self, variant): # dictionary from gene ID to list of transcript effects gene_transcript_effects_groups = {} + + # mapping from Transcript objects to errors encountered + # while trying to annotated them + errors = {} + for (gene_id, transcripts) in overlapping_transcript_groups.items(): effects = [] for transcript in transcripts: - effect = infer_transcript_effect(variant, transcript) - effects.append(effect) + try: + effects.append(infer_transcript_effect(variant, transcript)) + except (AssertionError, ValueError) as error: + if raise_on_error: + raise + else: + logging.warn( + "Encountered error annotating %s for %s: %s", + variant, + transcript, + error) + errors[transcript] = error gene_transcript_effects_groups[gene_id] = effects + # construct Gene objects for all the genes containing + # all the transcripts this variant overlaps + overlapping_genes = [ + self.ensembl.gene_by_id(gene_id) + for gene_id + in overlapping_transcript_groups.keys() + ] + return VariantEffect( variant=variant, genes=overlapping_genes, - gene_transcript_effects=gene_transcript_effects_groups) + gene_transcript_effects=gene_transcript_effects_groups, + errors=errors) diff --git a/varcode/variant_collection.py b/varcode/variant_collection.py index 3761cbe..11644fb 100644 --- a/varcode/variant_collection.py +++ b/varcode/variant_collection.py @@ -82,7 +82,7 @@ def __init__(self, filename, drop_duplicates=True): elif filename.endswith(".maf"): self.raw_reference_name, self.records = _load_maf(filename) else: - raise ValueErrr("Unrecognized file type: %s" % (filename,)) + raise ValueError("Unrecognized file type: %s" % (filename,)) self.filename = filename @@ -117,13 +117,21 @@ def __str__(self): def __repr__(self): return str(self) - def variant_effects(self): + def variant_effects(self, raise_on_error=True): """ Determine the impact of each variant, return a list of VariantEffect objects. + + Parameters + ---------- + + raise_on_error : bool, optional. + Raise exception if error is encountered while annotating + transcripts, otherwise track errors in VariantEffect.errors + dictionary (default=True). """ return [ - self.annot.describe_variant(variant) + self.annot.describe_variant(variant, raise_on_error=raise_on_error) for variant in self.records ] diff --git a/varcode/variant_effect.py b/varcode/variant_effect.py index 7903d10..c10f563 100644 --- a/varcode/variant_effect.py +++ b/varcode/variant_effect.py @@ -15,7 +15,8 @@ def __init__( self, variant, genes, - gene_transcript_effects): + gene_transcript_effects, + errors={}): """ variant : Variant @@ -24,6 +25,10 @@ def __init__( gene_transcript_effects : dict Dictionary from gene ID to list of transcript variant effects + + errors : dict, optional + Mapping from transcript objects to strings explaining error which + was encountered while trying to annotate them. """ self.variant = variant self.genes = genes @@ -43,9 +48,11 @@ def __init__( highest_priority_class = self.highest_priority_effect.__class__ self.variant_summary = highest_priority_class.__name__ else: - self.highest_priority_effect = _ + self.highest_priority_effect = None self.variant_summary = "Intergenic" + self.errors = errors + @property def coding_genes(self): """ @@ -65,6 +72,8 @@ def __str__(self): ("genes", [gene.name for gene in self.genes]), ("transcript_effects", self.transcript_effects) ] + if self.errors: + fields.append( ("errors", self.errors) ) return "VariantEffect(%s)" % ( ", ".join(["%s=%s" % (k,v) for (k,v) in fields]))