Skip to content

Commit

Permalink
fix FASTA parsing for ensembl83
Browse files Browse the repository at this point in the history
  • Loading branch information
iskandr committed Feb 23, 2016
1 parent efbd2ae commit 133af85
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 2 deletions.
2 changes: 2 additions & 0 deletions pyensembl/biotypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,8 @@
'known_ncrna',
# unspliced lncRNAs that are several kb in size.
'macro_lncRNA',
# seems to have been added around Ensembl 81
'bidirectional_promoter_lncrna',
}

mitochondrial = {
Expand Down
7 changes: 6 additions & 1 deletion pyensembl/sequence_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,12 @@ def _parse_fasta_dictionary(self):
fasta_dictionary = {}
sequence_type = self.sequence_type
for seq_entry in read(self.fasta_path, format="fasta"):
seq_id = seq_entry.metadata["id"]
# Annoyingly, Ensembl 83 reformatted the transcript IDs of its
# cDNA FASTA to include sequence version numbers,
# e.g.
# "ENST00000448914.1" instead of "ENST00000448914".
# So now we have to parse out the identifier (the part before the first ".").
seq_id = seq_entry.metadata["id"].split(".")[0]
fasta_dictionary[seq_id] = sequence_type(seq_entry)
return fasta_dictionary

Expand Down
17 changes: 16 additions & 1 deletion test/test_transcript_sequences.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@

from __future__ import absolute_import
from nose.tools import eq_
from pyensembl import ensembl54
from pyensembl import cached_release

ensembl54 = cached_release(54)
ensembl83 = cached_release(83)

def test_transcript_sequence_ensembl54():
seq = ensembl54.transcript_sequence("ENST00000321606")
Expand All @@ -22,3 +25,15 @@ def test_transcript_sequence_ensembl54():
]
full_transcript_sequence = "".join(nucleotide_lines)
eq_(str(seq), full_transcript_sequence)

# now get the same sequence via a Transcript object
eq_(ensembl54.transcript_by_id("ENST00000321606").sequence, seq)


def test_transcript_sequence_ensembl83():
    """Look up a transcript sequence in Ensembl 83 by its unversioned ID.

    The same expected nucleotide string is checked twice: once through
    the release object's `transcript_sequence` lookup and once through
    the `sequence` attribute of the corresponding Transcript object.
    """
    # extremely short TRD gene
    transcript_id = "ENST00000448914"
    expected = "ACTGGGGGATACG"

    # direct lookup on the release object
    eq_(ensembl83.transcript_sequence(transcript_id), expected)

    # now try via a Transcript object
    transcript = ensembl83.transcript_by_id(transcript_id)
    eq_(transcript.sequence, expected)

0 comments on commit 133af85

Please sign in to comment.