fix FASTA parsing for ensembl83

openvax · Feb 23, 2016 · 133af85 · 133af85
1 parent efbd2ae
commit 133af85
Show file tree

Hide file tree

Showing 3 changed files with 24 additions and 2 deletions.
diff --git a/pyensembl/biotypes.py b/pyensembl/biotypes.py
@@ -282,6 +282,8 @@
     'known_ncrna',
     # unspliced lncRNAs that are several kb in size.
     'macro_lncRNA',
+    # seems to have been added around Ensembl 81
+    'bidirectional_promoter_lncrna',
 }
 
 mitochondrial = {

diff --git a/pyensembl/sequence_data.py b/pyensembl/sequence_data.py
@@ -82,7 +82,12 @@ def _parse_fasta_dictionary(self):
         fasta_dictionary = {}
         sequence_type = self.sequence_type
         for seq_entry in read(self.fasta_path, format="fasta"):
-            seq_id = seq_entry.metadata["id"]
+            # annoyingly Ensembl83 reformatted the transcript IDs of its
+            # cDNA FASTA to include sequence version numbers
+            # e.g.
+            # "ENST00000448914.1" instead of "ENST00000448914"
+            # So now we have to parse out the identifier
+            seq_id = seq_entry.metadata["id"].split(".")[0]
             fasta_dictionary[seq_id] = sequence_type(seq_entry)
         return fasta_dictionary
 

diff --git a/test/test_transcript_sequences.py b/test/test_transcript_sequences.py
@@ -5,7 +5,10 @@
 
 from __future__ import absolute_import
 from nose.tools import eq_
-from pyensembl import ensembl54
+from pyensembl import cached_release
+
+ensembl54 = cached_release(54)
+ensembl83 = cached_release(83)
 
 def test_transcript_sequence_ensembl54():
     seq = ensembl54.transcript_sequence("ENST00000321606")
@@ -22,3 +25,15 @@ def test_transcript_sequence_ensembl54():
     ]
     full_transcript_sequence = "".join(nucleotide_lines)
     eq_(str(seq), full_transcript_sequence)
+
+    # now get the same sequence via a Transcript object
+    eq_(ensembl54.transcript_by_id("ENST00000321606").sequence, seq)
+
+
+def test_transcript_sequence_ensembl83():
+    # extremely short TRD gene
+    seq = ensembl83.transcript_sequence("ENST00000448914")
+    expected = "ACTGGGGGATACG"
+    eq_(seq, expected)
+    # now try via a Transcript object
+    eq_(ensembl83.transcript_by_id("ENST00000448914").sequence, expected)