Merge pull request #136 from hammerlab/fix-transcript-complete

Transcript.coding_sequence didn't work for non-coding transcripts
openvax · Feb 23, 2016 · e9ec65a · e9ec65a
2 parents c70d32b + 133af85
commit e9ec65a
Show file tree

Hide file tree

Showing 5 changed files with 37 additions and 11 deletions.
diff --git a/pyensembl/biotypes.py b/pyensembl/biotypes.py
@@ -282,6 +282,8 @@
     'known_ncrna',
     # unspliced lncRNAs that are several kb in size.
     'macro_lncRNA',
+    # seems to have been added around Ensembl 81
+    'bidirectional_promoter_lncrna',
 }
 
 mitochondrial = {

diff --git a/pyensembl/sequence_data.py b/pyensembl/sequence_data.py
@@ -82,7 +82,12 @@ def _parse_fasta_dictionary(self):
         fasta_dictionary = {}
         sequence_type = self.sequence_type
         for seq_entry in read(self.fasta_path, format="fasta"):
-            seq_id = seq_entry.metadata["id"]
+            # annoyingly Ensembl83 reformatted the transcript IDs of its
+            # cDNA FASTA to include sequence version numbers
+            # e.g.
+            # "ENST00000448914.1" instead of "ENST00000448914"
+            # So now we have to parse out the identifier
+            seq_id = seq_entry.metadata["id"].split(".")[0]
             fasta_dictionary[seq_id] = sequence_type(seq_entry)
         return fasta_dictionary
 

diff --git a/pyensembl/transcript.py b/pyensembl/transcript.py
@@ -62,14 +62,14 @@ def __str__(self):
             " gene_name=%s,"
             " biotype=%s,"
             " location=%s:%d-%d)") % (
-                    self.id,
-                    self.name,
-                    self.gene.id,
-                    self.gene.name,
-                    self.biotype,
-                    self.contig,
-                    self.start,
-                    self.end)
+                self.id,
+                self.name,
+                self.gene.id,
+                self.gene.name,
+                self.biotype,
+                self.contig,
+                self.start,
+                self.end)
 
     def __repr__(self):
         return str(self)
@@ -411,8 +411,12 @@ def coding_sequence(self):
         cDNA coding sequence (from start codon to stop codon, without
         any introns)
         """
+        if self.sequence is None:
+            return None
+
         start = self.first_start_codon_spliced_offset
         end = self.last_stop_codon_spliced_offset
+
         # If start codon is the at nucleotide offsets [3,4,5] and
         # stop codon is at nucleotide offsets  [20,21,22]
         # then start = 3 and end = 22.

diff --git a/setup.py b/setup.py
@@ -40,7 +40,7 @@
 if __name__ == '__main__':
     setup(
         name='pyensembl',
-        version="0.8.7",
+        version="0.8.8",
         description="Python interface to ensembl reference genome metadata",
         author="Alex Rubinsteyn",
         author_email="alex {dot} rubinsteyn {at} mssm {dot} edu",

diff --git a/test/test_transcript_sequences.py b/test/test_transcript_sequences.py
@@ -5,7 +5,10 @@
 
 from __future__ import absolute_import
 from nose.tools import eq_
-from pyensembl import ensembl54
+from pyensembl import cached_release
+
+ensembl54 = cached_release(54)
+ensembl83 = cached_release(83)
 
 def test_transcript_sequence_ensembl54():
     seq = ensembl54.transcript_sequence("ENST00000321606")
@@ -22,3 +25,15 @@ def test_transcript_sequence_ensembl54():
     ]
     full_transcript_sequence = "".join(nucleotide_lines)
     eq_(str(seq), full_transcript_sequence)
+
+    # now get the same sequence via a Transcript object
+    eq_(ensembl54.transcript_by_id("ENST00000321606").sequence, seq)
+
+
+def test_transcript_sequence_ensembl83():
+    # extremely short TRD gene
+    seq = ensembl83.transcript_sequence("ENST00000448914")
+    expected = "ACTGGGGGATACG"
+    eq_(seq, expected)
+    # now try via a Transcript object
+    eq_(ensembl83.transcript_by_id("ENST00000448914").sequence, expected)