Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 35 additions & 2 deletions varcode/core_logic.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
FrameShiftTruncation
)

from Bio.Seq import Seq
from Bio.Seq import Seq, CodonTable
from pyensembl.transcript import Transcript
from pyensembl.biotypes import is_coding_biotype

Expand Down Expand Up @@ -83,6 +83,31 @@ def first_transcript_offset(start_pos, end_pos, transcript):
return the start/end offsets of the variant relative to the
chromosomal positions of the transcript.
"""
# ensure that start_pos:end_pos overlap with transcript positions
assert start_pos <= end_pos, \
"start_pos %d shouldn't be greater than end_pos %d" % (
start_pos, end_pos)
assert start_pos <= transcript.end, \
"Range %d:%d starts after transcript %s (%d:%d)" % (
start_pos,
end_pos,
transcript,
transcript.start,
transcript.end)
assert end_pos >= transcript.start, \
"Range %d:%d ends before transcript %s (%d:%d)" % (
start_pos,
end_pos,
transcript,
transcript.start,
transcript.end)
# trim the start position to the beginning of the transcript
if start_pos < transcript.start:
start_pos = transcript.start
# trim the end position to the end of the transcript
if end_pos > transcript.end:
end_pos = transcript.end

# offsets into the spliced transcript
offsets = [
transcript.spliced_offset(start_pos),
Expand Down Expand Up @@ -157,7 +182,15 @@ def infer_coding_effect(
# to amino acids. For the original CDS make sure that it starts with
# a start codon and ends with a stop codon. Don't include the stop codon
# in the translated sequence.
original_protein = str(Seq(cds_seq).translate(cds=True, to_stop=True))
try:
original_protein = str(Seq(cds_seq).translate(cds=True, to_stop=True))
except CodonTable.TranslationError as error:
# coding sequence may contain premature stop codon or may have
# an incorrectly annotated frame
logging.warning(
"Translation error in coding sequence for %s" % transcript)
logging.warning(error)
return IncompleteTranscript(variant, transcript)

assert len(original_protein) > 0, \
"Translated protein sequence of %s is empty" % (transcript,)
Expand Down
22 changes: 15 additions & 7 deletions varcode/effect_ordering.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,18 @@ def top_priority_transcript_effect(effects):
elif priority == best_priority:
best_effects.append(effect)

def effect_cds_len(effect):
"""
Returns length of coding sequence of transcript associated with effect
"""
return len(effect.transcript.coding_sequence)

return max(best_effects, key=effect_cds_len)
if any(effect.transcript.complete for effect in best_effects):
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't really understand the context here, but why not apply the same sorting to both (first by CDS length, then by transcript length), vs. filtering in one case and not in the other?

(Probably a silly question. I'm just not clear on the context.)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure if I understand the question, so I'm not sure if this clarifies the assumptions: (1) the impact on a coding transcript is more interesting than the impact on a non-coding transcript (2) lacking any other info, a longer CDS is more interesting than a shorter one, (3) non-coding transcripts don't have a CDS, so instead sort them by full transcript length.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess the question is: why not leave the non-coding transcript stuff in there (when transcripts do have complete coding sequence annotations), and just have it lower in the sort order vs. filtering it out? Not saying you should. Just making sure I understand the choices.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe I'm being unimaginative: how do you get the CDS length of a transcript without a CDS? np.inf? Also, is there a case where the result would be different, or are you just saying it would be more elegant to have a single sort key function?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like my comment had two parts!

Part of my thinking was the elegance of a single sort key.

But also, in terms of the difference in results: the results for the if clause can either contain both complete and incomplete transcript effects, or just complete transcript effects. Just curious about leaving the incomplete stuff in, but lower in the sort order.

Probably not worthy of too much conversationing :)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

...but why leave in incomplete transcripts if we know for sure we're not going to use them? We can't compare their CDS length, so the key would have to be something like:

def key_fn(effect): 
   c = effect.transcript.complete
   cds_len = len(effect.transcript.coding_sequence) if c else 0
   return c, cds_len

I find that less clear than explicitly dropping the incomplete terms.

If we wanted to unify both branches we'd need something like:

def key_fn(effect): 
   c = effect.transcript.complete
   cds_len = len(effect.transcript.coding_sequence) if c else 0
   return c, cds_len, len(transcript)

Which is more elegant but I think less clear on what's happening.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just wasn't clear how/when we'd use the incomplete transcripts in the else case, and bubbled that lack of clarity to ask about it for the if case.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All the transcripts in the else case e.g. Incomplete, Noncoding, or something else without a CDS. If we have at least one coding transcript, we should ignore all those without a CDS. Does that make sense?

(also, this is an axiom/assumption, not necessarily something well grounded in evidence. I could imagine a long ncRNA being the most important transcript of a gene)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yup, that logic makes sense to me -- I just wanted to hear more about the assumption it was based on

# if any transcripts have complete coding sequence annotations,
# filter the effects down to those that are complete and sort
# them by length of the coding sequence
best_effects = [
effect
for effect in best_effects
if effect.transcript.complete
]
key_fn = lambda effect: len(effect.transcript.coding_sequence)
else:
# if effects are over incomplete transcripts, sort them by the
# their total transcript length
key_fn = lambda effect: len(effect.transcript)
return max(best_effects, key=key_fn)
52 changes: 38 additions & 14 deletions varcode/variant_annotator.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from __future__ import print_function, division, absolute_import
import logging

from .common import group_by
from .core_logic import infer_transcript_effect
Expand All @@ -11,7 +12,7 @@ class VariantAnnotator(object):
def __init__(self, ensembl_release):
self.ensembl = pyensembl.EnsemblRelease(ensembl_release)

def describe_variant(self, variant):
def describe_variant(self, variant, raise_on_error=True):
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should raise_on_error be in the VariantAnnotator constructor instead of peppered across different methods?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The most common use case is VariantCollection which uses VariantAnnotator under the hood. Not sure how to economize on raise_on_error since you'll need one arg in VariantCollection and another in VariantAnnotator.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I figured VariantCollection would pass it through to VariantAnnotator. The constructor just felt more appropriate to me, based on knowing far less about the code than you do.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The constructor feels a little clumsier to me since the typical usecase seems to be:

  1. Construct variant_collection = VariantCollection(some_vcf_path)
  2. Call variant_collection.variant_effects()
  3. Encounter an error
  4. Decide the error is irrelevant
  5. Call variant_collection.variant_effects(raise_on_error=False) to ignore the error

Having to create a new VariantCollection seems like a very minor hassle but it also doesn't have a clear payoff.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

"""
Determine the effects of a variant on any transcripts it overlaps.
Returns a VariantEffect object.
Expand All @@ -22,32 +23,55 @@ def describe_variant(self, variant):
ref = variant.ref
alt = variant.alt

overlapping_genes = self.ensembl.genes_at_locus(contig, pos, end_pos)
overlapping_transcripts = self.ensembl.transcripts_at_locus(
contig, pos, end_pos)

if len(overlapping_genes) == 0:
return [], {}
else:
overlapping_transcripts = self.ensembl.transcripts_at_locus(
contig, pos, end_pos)

assert len(overlapping_transcripts) > 0, \
"No transcripts found for mutation %s:%d %s>%s" % (
contig, pos, ref, alt)
if len(overlapping_transcripts) == 0:
# intergenic variant
return VariantEffect(
variant=variant,
genes=[],
gene_transcript_effects={},
errors={})

# group transcripts by their gene ID
overlapping_transcript_groups = group_by(
overlapping_transcripts, field_name='gene_id')

# dictionary from gene ID to list of transcript effects
gene_transcript_effects_groups = {}

# mapping from Transcript objects to errors encountered
# while trying to annotated them
errors = {}

for (gene_id, transcripts) in overlapping_transcript_groups.items():
effects = []
for transcript in transcripts:
effect = infer_transcript_effect(variant, transcript)
effects.append(effect)
try:
effects.append(infer_transcript_effect(variant, transcript))
except (AssertionError, ValueError) as error:
if raise_on_error:
raise
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No error for the raise?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do you mean?

(raise by itself simply re-raises the capture exception)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't realize it did that :)

else:
logging.warn(
"Encountered error annotating %s for %s: %s",
variant,
transcript,
error)
errors[transcript] = error
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not tie each error directly to the transcript effect inference that it came from, vs. having a parallel mapping?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure what you mean by "the transcript effect inference that it came from", can you say more?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When you error on an infer_transcript_effect(variant, transcript), you collect all the errors into a map from transcript to error. Thoughts on putting the error in the transcript effect object itself, instead of a map from transcript to error in the variant effect object?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any ideas on what these transcript effects would look like? Would I just have a generic ErrorEffect object? Maybe once the class of errors is narrowed down to some well understood types of mistakes in the annotation database I can make those into different Effect types, for now I'm just catching ValueErrors and AssertionErrors from many different sources and don't have any extra information to unify them other than "something went wrong while trying to annotate this variant".

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Aha, it wasn't entirely clear to me that we wouldn't have a TranscriptEffect object because we errored in creating it.

Sure, ErrorEffect could work. I don't feel strongly, though. Feel free to carry on :)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, it's not really possible to make a e.g. Substitution if you raise
an exception while trying to get the original amino acids you're
substituting, the necessary information never gets computed.

On Mon, Feb 16, 2015 at 9:35 PM, Tavi Nathanson notifications@github.com
wrote:

In varcode/variant_annotator.py
#13 (comment):

     for (gene_id, transcripts) in overlapping_transcript_groups.items():
         effects = []
         for transcript in transcripts:
  •            effect = infer_transcript_effect(variant, transcript)
    
  •            effects.append(effect)
    
  •            try:
    
  •                effects.append(infer_transcript_effect(variant, transcript))
    
  •            except (AssertionError, ValueError) as error:
    
  •                if raise_on_error:
    
  •                    raise
    
  •                else:
    
  •                    logging.warn(
    
  •                        "Encountered error annotating %s for %s: %s",
    
  •                        variant,
    
  •                        transcript,
    
  •                        error)
    
  •                errors[transcript] = error
    

Aha, it wasn't entirely clear to me that we wouldn't have a
TranscriptEffect object because we errored in creating it.

Sure, ErrorEffect could work. I don't feel strongly, though. Feel free to
carry on :)


Reply to this email directly or view it on GitHub
https://github.com/hammerlab/varcode/pull/13/files#r24789064.

gene_transcript_effects_groups[gene_id] = effects

# construct Gene objects for all the genes containing
# all the transcripts this variant overlaps
overlapping_genes = [
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So, in practice, the difference between this result and self.ensembl.genes_at_locus(contig, pos, end_pos) is what? I'm a bit confused.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just thought it was inelegant (and a little wasteful) to do two queries to see what's overlapping the mutated locus. So, instead, I just look up the overlapping transcripts and construct the Gene objects from the transcripts.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, so it's the same result

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, just computed via one overlap query instead of two.

self.ensembl.gene_by_id(gene_id)
for gene_id
in overlapping_transcript_groups.keys()
]

return VariantEffect(
variant=variant,
genes=overlapping_genes,
gene_transcript_effects=gene_transcript_effects_groups)
gene_transcript_effects=gene_transcript_effects_groups,
errors=errors)
14 changes: 11 additions & 3 deletions varcode/variant_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def __init__(self, filename, drop_duplicates=True):
elif filename.endswith(".maf"):
self.raw_reference_name, self.records = _load_maf(filename)
else:
raise ValueErrr("Unrecognized file type: %s" % (filename,))
raise ValueError("Unrecognized file type: %s" % (filename,))
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I prefer ValueErrr!

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agrrree!



self.filename = filename
Expand Down Expand Up @@ -117,13 +117,21 @@ def __str__(self):
def __repr__(self):
return str(self)

def variant_effects(self):
def variant_effects(self, raise_on_error=True):
"""
Determine the impact of each variant, return a list of
VariantEffect objects.

Parameters
----------

raise_on_error : bool, optional.
Raise exception if error is encountered while annotating
transcripts, otherwise track errors in VariantEffect.errors
dictionary (default=True).
"""
return [
self.annot.describe_variant(variant)
self.annot.describe_variant(variant, raise_on_error=raise_on_error)
for variant
in self.records
]
Expand Down
13 changes: 11 additions & 2 deletions varcode/variant_effect.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ def __init__(
self,
variant,
genes,
gene_transcript_effects):
gene_transcript_effects,
errors={}):
"""
variant : Variant

Expand All @@ -24,6 +25,10 @@ def __init__(

gene_transcript_effects : dict
Dictionary from gene ID to list of transcript variant effects

errors : dict, optional
Mapping from transcript objects to strings explaining error which
was encountered while trying to annotate them.
"""
self.variant = variant
self.genes = genes
Expand All @@ -43,9 +48,11 @@ def __init__(
highest_priority_class = self.highest_priority_effect.__class__
self.variant_summary = highest_priority_class.__name__
else:
self.highest_priority_effect = _
self.highest_priority_effect = None
self.variant_summary = "Intergenic"

self.errors = errors

@property
def coding_genes(self):
"""
Expand All @@ -65,6 +72,8 @@ def __str__(self):
("genes", [gene.name for gene in self.genes]),
("transcript_effects", self.transcript_effects)
]
if self.errors:
fields.append( ("errors", self.errors) )
return "VariantEffect(%s)" % (
", ".join(["%s=%s" % (k,v) for (k,v) in fields]))

Expand Down