From efbd2ae1e07f18261211437c970e84f91371ed73 Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Tue, 23 Feb 2016 16:04:43 -0500 Subject: [PATCH 1/2] Transcript.coding_sequence didn't work for non-coding transcripts, should return None --- pyensembl/transcript.py | 20 ++++++++++++-------- setup.py | 2 +- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/pyensembl/transcript.py b/pyensembl/transcript.py index 007f018..eb7fdde 100644 --- a/pyensembl/transcript.py +++ b/pyensembl/transcript.py @@ -62,14 +62,14 @@ def __str__(self): " gene_name=%s," " biotype=%s," " location=%s:%d-%d)") % ( - self.id, - self.name, - self.gene.id, - self.gene.name, - self.biotype, - self.contig, - self.start, - self.end) + self.id, + self.name, + self.gene.id, + self.gene.name, + self.biotype, + self.contig, + self.start, + self.end) def __repr__(self): return str(self) @@ -411,8 +411,12 @@ def coding_sequence(self): cDNA coding sequence (from start codon to stop codon, without any introns) """ + if self.sequence is None: + return None + start = self.first_start_codon_spliced_offset end = self.last_stop_codon_spliced_offset + # If start codon is the at nucleotide offsets [3,4,5] and # stop codon is at nucleotide offsets [20,21,22] # then start = 3 and end = 22. 
diff --git a/setup.py b/setup.py index 7dc7b1f..6a91ba0 100644 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ if __name__ == '__main__': setup( name='pyensembl', - version="0.8.7", + version="0.8.8", description="Python interface to ensembl reference genome metadata", author="Alex Rubinsteyn", author_email="alex {dot} rubinsteyn {at} mssm {dot} edu", From 133af8548ccfdb697c020f5fc4a219fb80c22abb Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Tue, 23 Feb 2016 17:08:48 -0500 Subject: [PATCH 2/2] fix FASTA parsing for ensembl83 --- pyensembl/biotypes.py | 2 ++ pyensembl/sequence_data.py | 7 ++++++- test/test_transcript_sequences.py | 17 ++++++++++++++++- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/pyensembl/biotypes.py b/pyensembl/biotypes.py index 957abe4..3488f39 100644 --- a/pyensembl/biotypes.py +++ b/pyensembl/biotypes.py @@ -282,6 +282,8 @@ 'known_ncrna', # unspliced lncRNAs that are several kb in size. 'macro_lncRNA', + # seems to have been added around Ensembl 81 + 'bidirectional_promoter_lncrna', } mitochondrial = { diff --git a/pyensembl/sequence_data.py b/pyensembl/sequence_data.py index d70e2af..5226123 100644 --- a/pyensembl/sequence_data.py +++ b/pyensembl/sequence_data.py @@ -82,7 +82,12 @@ def _parse_fasta_dictionary(self): fasta_dictionary = {} sequence_type = self.sequence_type for seq_entry in read(self.fasta_path, format="fasta"): - seq_id = seq_entry.metadata["id"] + # annoyingly Ensembl83 reformatted the transcript IDs of its + # cDNA FASTA to include sequence version numbers + # e.g.
+ # "ENST00000448914.1" instead of "ENST00000448914" + # So now we have to parse out the identifier + seq_id = seq_entry.metadata["id"].split(".")[0] fasta_dictionary[seq_id] = sequence_type(seq_entry) return fasta_dictionary diff --git a/test/test_transcript_sequences.py b/test/test_transcript_sequences.py index d2caaed..dd25bfa 100644 --- a/test/test_transcript_sequences.py +++ b/test/test_transcript_sequences.py @@ -5,7 +5,10 @@ from __future__ import absolute_import from nose.tools import eq_ -from pyensembl import ensembl54 +from pyensembl import cached_release + +ensembl54 = cached_release(54) +ensembl83 = cached_release(83) def test_transcript_sequence_ensembl54(): seq = ensembl54.transcript_sequence("ENST00000321606") @@ -22,3 +25,15 @@ def test_transcript_sequence_ensembl54(): ] full_transcript_sequence = "".join(nucleotide_lines) eq_(str(seq), full_transcript_sequence) + + # now get the same sequence via a Transcript object + eq_(ensembl54.transcript_by_id("ENST00000321606").sequence, seq) + + +def test_transcript_sequence_ensembl83(): + # extremely short TRD gene + seq = ensembl83.transcript_sequence("ENST00000448914") + expected = "ACTGGGGGATACG" + eq_(seq, expected) + # now try via a Transcript object + eq_(ensembl83.transcript_by_id("ENST00000448914").sequence, expected)