Skip to content

Commit

Permalink
Merge pull request #136 from hammerlab/fix-transcript-complete
Browse files Browse the repository at this point in the history
Transcript.coding_sequence didn't work for non-coding transcripts
  • Loading branch information
iskandr committed Feb 23, 2016
2 parents c70d32b + 133af85 commit e9ec65a
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 11 deletions.
2 changes: 2 additions & 0 deletions pyensembl/biotypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,8 @@
'known_ncrna',
# unspliced lncRNAs that are several kb in size.
'macro_lncRNA',
# seems to have been added around Ensembl 81
'bidirectional_promoter_lncrna',
}

mitochondrial = {
Expand Down
7 changes: 6 additions & 1 deletion pyensembl/sequence_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,12 @@ def _parse_fasta_dictionary(self):
fasta_dictionary = {}
sequence_type = self.sequence_type
for seq_entry in read(self.fasta_path, format="fasta"):
seq_id = seq_entry.metadata["id"]
# annoyingly Ensembl83 reformatted the transcript IDs of its
# cDNA FASTA to include sequence version numbers
# e.g.
# "ENST00000448914.1" instead of "ENST00000448914"
# So now we have to parse out the identifier
seq_id = seq_entry.metadata["id"].split(".")[0]
fasta_dictionary[seq_id] = sequence_type(seq_entry)
return fasta_dictionary

Expand Down
20 changes: 12 additions & 8 deletions pyensembl/transcript.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,14 +62,14 @@ def __str__(self):
" gene_name=%s,"
" biotype=%s,"
" location=%s:%d-%d)") % (
self.id,
self.name,
self.gene.id,
self.gene.name,
self.biotype,
self.contig,
self.start,
self.end)
self.id,
self.name,
self.gene.id,
self.gene.name,
self.biotype,
self.contig,
self.start,
self.end)

def __repr__(self):
    """Return the same text as ``str(self)``."""
    return str(self)
Expand Down Expand Up @@ -411,8 +411,12 @@ def coding_sequence(self):
cDNA coding sequence (from start codon to stop codon, without
any introns)
"""
if self.sequence is None:
return None

start = self.first_start_codon_spliced_offset
end = self.last_stop_codon_spliced_offset

# If the start codon is at nucleotide offsets [3,4,5] and
# stop codon is at nucleotide offsets [20,21,22]
# then start = 3 and end = 22.
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
if __name__ == '__main__':
setup(
name='pyensembl',
version="0.8.7",
version="0.8.8",
description="Python interface to ensembl reference genome metadata",
author="Alex Rubinsteyn",
author_email="alex {dot} rubinsteyn {at} mssm {dot} edu",
Expand Down
17 changes: 16 additions & 1 deletion test/test_transcript_sequences.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@

from __future__ import absolute_import
from nose.tools import eq_
from pyensembl import ensembl54
from pyensembl import cached_release

ensembl54 = cached_release(54)
ensembl83 = cached_release(83)

def test_transcript_sequence_ensembl54():
seq = ensembl54.transcript_sequence("ENST00000321606")
Expand All @@ -22,3 +25,15 @@ def test_transcript_sequence_ensembl54():
]
full_transcript_sequence = "".join(nucleotide_lines)
eq_(str(seq), full_transcript_sequence)

# now get the same sequence via a Transcript object
eq_(ensembl54.transcript_by_id("ENST00000321606").sequence, seq)


def test_transcript_sequence_ensembl83():
    """
    Check that the Ensembl 83 release returns the expected cDNA for an
    extremely short TRD gene transcript, both through the release-level
    lookup and through a Transcript object.
    """
    transcript_id = "ENST00000448914"
    expected = "ACTGGGGGATACG"

    # direct lookup on the release object
    sequence_from_release = ensembl83.transcript_sequence(transcript_id)
    eq_(sequence_from_release, expected)

    # the same sequence, reached via a Transcript object
    transcript = ensembl83.transcript_by_id(transcript_id)
    eq_(transcript.sequence, expected)

0 comments on commit e9ec65a

Please sign in to comment.