diff --git a/.travis.yml b/.travis.yml index 5b835dd..4c16be9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -36,13 +36,14 @@ install: - pip install . - pip install coveralls script: - # human releases + # older human releases - pyensembl install --release 54 --species human - pyensembl install --release 75 --species human - pyensembl install --release 77 --species human - - pyensembl install --release 81 --species human - # mouse releases - - pyensembl install --release 81 --species mouse + # latest human release + - pyensembl install --release 83 --species human + # latest mouse release + - pyensembl install --release 83 --species mouse # run tests - nosetests test --with-coverage --cover-package=pyensembl && ./lint.sh after_success: diff --git a/pyensembl/__init__.py b/pyensembl/__init__.py index 1bd256f..f0ffc83 100644 --- a/pyensembl/__init__.py +++ b/pyensembl/__init__.py @@ -18,6 +18,7 @@ from .download_cache import DownloadCache from .ensembl_release import EnsemblRelease from .ensembl_release_versions import check_release_number, MAX_ENSEMBL_RELEASE +from .exon import Exon from .genome import Genome from .gene import Gene from .gtf import GTF @@ -57,12 +58,31 @@ def genome_for_reference_name(reference_name): ensembl_grch36 = ensembl54 = cached_release(54) # last release for GRCh36/hg18 ensembl_grch37 = ensembl75 = cached_release(75) # last release for GRCh37/hg19 +ensembl_grch38 = cached_release(MAX_ENSEMBL_RELEASE) # most recent for GRCh38 -ensembl77 = cached_release(77) -ensembl78 = cached_release(78) -ensembl79 = cached_release(79) -ensembl80 = cached_release(80) -ensembl81 = cached_release(81) -ensembl82 = cached_release(82) -ensembl83 = cached_release(83) -ensembl_grch38 = ensembl83 # most recent for GRCh38 + +__all__ = [ + "MemoryCache", + "DownloadCache", + "EnsemblRelease", + "MAX_ENSEMBL_RELEASE", + "cached_release", + "Gene", + "Transcript", + "Exon", + "SequenceData", + "find_nearest_locus", + "find_species_by_name", + 
"find_species_by_reference", + "which_reference", + "check_species_object", + "normalize_reference_name", + "normalize_species_name", + "Genome", + "GTF", + "Locus", + "genome_for_reference_name", + "ensembl_grch36", + "ensembl_grch37", + "ensembl_grch38", +] diff --git a/pyensembl/genome.py b/pyensembl/genome.py index 0695440..dad3d33 100644 --- a/pyensembl/genome.py +++ b/pyensembl/genome.py @@ -148,10 +148,10 @@ def _get_cached_path( def _get_gtf_path(self, download_if_missing=False, overwrite=False): return self._get_cached_path( - field_name="gtf", - path_or_url=self._gtf_path_or_url, - download_if_missing=download_if_missing, - overwrite=overwrite) + field_name="gtf", + path_or_url=self._gtf_path_or_url, + download_if_missing=download_if_missing, + overwrite=overwrite) def _get_transcript_fasta_path( self, @@ -465,12 +465,12 @@ def gene_ids_at_locus(self, contig, position, end=None, strand=None): def gene_names_at_locus(self, contig, position, end=None, strand=None): return self.db.distinct_column_values_at_locus( - column="gene_name", - feature="gene", - contig=contig, - position=position, - end=end, - strand=strand) + column="gene_name", + feature="gene", + contig=contig, + position=position, + end=end, + strand=strand) def exon_ids_at_locus(self, contig, position, end=None, strand=None): return self.db.distinct_column_values_at_locus( @@ -1040,4 +1040,4 @@ def protein_ids(self, contig=None, strand=None): strand=strand, distinct=True) # drop None values - return [protein_id for protein_id in protein_ids if protein_id] \ No newline at end of file + return [protein_id for protein_id in protein_ids if protein_id] diff --git a/setup.py b/setup.py index 865a4e1..7dc7b1f 100644 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ if __name__ == '__main__': setup( name='pyensembl', - version="0.8.6", + version="0.8.7", description="Python interface to ensembl reference genome metadata", author="Alex Rubinsteyn", author_email="alex {dot} rubinsteyn {at} mssm {dot} edu", diff --git 
a/test/common.py b/test/common.py index d256fd3..1cecaa4 100644 --- a/test/common.py +++ b/test/common.py @@ -3,10 +3,10 @@ import functools from pyensembl import ( - ensembl_grch36, ensembl_grch37, ensembl_grch38, - cached_release + cached_release, + MAX_ENSEMBL_RELEASE, ) from nose.tools import nottest @@ -23,9 +23,12 @@ def test_ensembl_releases(*versions): Run a unit test which takes an EnsemblRelease as an argument for multiple releases (most recent for each reference genome) """ + if len(versions) == 0: ensembl_releases = major_releases else: + if any(version > MAX_ENSEMBL_RELEASE for version in versions): + raise ValueError("Invalid ensembl release numbers: %s" % (versions,)) ensembl_releases = [cached_release(version) for version in versions] def decorator(test_fn): diff --git a/test/test_ensembl_object_properties.py b/test/test_ensembl_object_properties.py index f05e80b..ff90dcf 100644 --- a/test/test_ensembl_object_properties.py +++ b/test/test_ensembl_object_properties.py @@ -6,11 +6,11 @@ from __future__ import absolute_import from nose.tools import eq_ -from pyensembl import EnsemblRelease +from pyensembl import EnsemblRelease, MAX_ENSEMBL_RELEASE -def test_reference_name(): +def test_human_reference_name(): eq_(EnsemblRelease(release=54).reference_name, "NCBI36") eq_(EnsemblRelease(release=74).reference_name, "GRCh37") eq_(EnsemblRelease(release=75).reference_name, "GRCh37") - eq_(EnsemblRelease(release=78).reference_name, "GRCh38") - eq_(EnsemblRelease(release=79).reference_name, "GRCh38") \ No newline at end of file + for release in range(76, MAX_ENSEMBL_RELEASE + 1): # inclusive bound + eq_(EnsemblRelease(release=release).reference_name, "GRCh38") diff --git a/test/test_exon_id.py b/test/test_exon_id.py index 05a88a6..ceb145f 100644 --- a/test/test_exon_id.py +++ b/test/test_exon_id.py @@ -4,7 +4,9 @@ """ from __future__ import absolute_import -from pyensembl import ensembl_grch38 as ensembl +from pyensembl import cached_release + +ensembl = cached_release(77) # all 
exons associated with TP53 gene in Ensembl release 77 TP53_EXON_IDS_RELEASE_77 = [ @@ -85,9 +87,8 @@ def test_exon_ids_of_transcript_name(): len(TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77), len(exon_ids)) assert all( - exon_id in TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77 - for exon_id in exon_ids - ) + exon_id in TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77 + for exon_id in exon_ids) def exon_ids_of_transcript_id(): """ @@ -101,5 +102,5 @@ def exon_ids_of_transcript_id(): len(TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77), len(exon_ids)) assert all( - exon_id in TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77 - for exon_id in exon_ids) + exon_id in TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77 + for exon_id in exon_ids) diff --git a/test/test_exon_object.py b/test/test_exon_object.py index b0824cb..4587284 100644 --- a/test/test_exon_object.py +++ b/test/test_exon_object.py @@ -5,7 +5,9 @@ """ from __future__ import absolute_import -from pyensembl import ensembl_grch38 as ensembl +from pyensembl import cached_release + +ensembl = cached_release(77) def test_exon_object_by_id(): """ @@ -25,8 +27,8 @@ def test_exon_object_by_id(): def test_exon_object_by_id_on_negative_strand(): """ - test_exon_object_by_id : check properties of exon 1 from CXCR3 when looked - up by ID in Ensembl 77. + test_exon_object_by_id_on_negative_strand : check properties of exon 1 + from CXCR3 when looked up by ID in Ensembl 77. 
""" exon = ensembl.exon_by_id("ENSE00001817013") assert exon.gene_name == "CXCR3", \ @@ -69,3 +71,22 @@ def test_exon_object_at_locus_on_negative_strand(): assert exon.on_negative_strand assert exon.start <= 71618517, "Unexpected exon start: %s" % exon.start assert exon.end >= 71618517, "Unexpected exon end: %s" % exon.end + +def test_exon_basic_properties_str(): + exon = ensembl.exon_by_id("ENSE00001817013") + assert isinstance(str(exon), str) + assert isinstance(repr(exon), str) + # for now we're assuming that __repr__ and __str__ do the same thing, + # if we later change that assumption we should do so explicitly and + # change this test + assert str(exon) == repr(exon), "%s != %s" % (str(exon), repr(exon)) + +def test_exon_basic_properties_hash(): + exon = ensembl.exon_by_id("ENSE00001817013") + assert isinstance(hash(exon), int), \ + "Hash function returns %s instead of int" % ( + type(hash(exon),)) + assert hash(exon) == hash(exon), "Hash function is non-deterministic!" + other_exon = ensembl.exon_by_id("ENSE00003464041") + assert exon != other_exon + assert hash(exon) != hash(other_exon) diff --git a/test/test_gene_ids.py b/test/test_gene_ids.py index 4c0a3c5..b121b8b 100644 --- a/test/test_gene_ids.py +++ b/test/test_gene_ids.py @@ -7,10 +7,11 @@ from __future__ import absolute_import from nose.tools import assert_raises, ok_ -from pyensembl import ensembl_grch38, ensembl77 +from pyensembl import ensembl_grch38, cached_release from .common import test_ensembl_releases +ensembl77 = cached_release(77, "human") def test_gene_ids_grch38_hla_a(): # chr6:29,945,884 is a position for HLA-A diff --git a/test/test_gene_names.py b/test/test_gene_names.py index 08b66c8..1afdb0a 100644 --- a/test/test_gene_names.py +++ b/test/test_gene_names.py @@ -7,19 +7,19 @@ from .common import test_ensembl_releases -# make sure that familia KNOWN_GENE_NAMES = [ "TP53", "ERBB2", "SMAD4", "CTAG1A", + "HLA-A", ] @test_ensembl_releases() def test_all_gene_names(ensembl): """ 
test_all_gene_names : Make sure some known gene names such as - SMAD4, HSP90AA1, TP53, ERBB2 + SMAD4, TP53, ERBB2, &c """ gene_names = ensembl.gene_names() print(type(gene_names)) @@ -46,3 +46,15 @@ def test_gene_names_on_contig(ensembl): assert "SMAD4" in gene_names_chr18, \ "No SMAD4 in gene names on chr18 of %s, gene names: %s ... (%d)" % ( ensembl, list(gene_names_chr18[:4]), len(gene_names_chr18)) + + +def test_gene_name_of_HLA_gene_id(): + gene_ids = ensembl_grch38.gene_ids_of_gene_name("HLA-A") + gene_names = [ + ensembl_grch38.gene_name_of_gene_id(gene_id) + for gene_id in gene_ids + ] + unique_gene_names = list(set(gene_names)) + assert len(unique_gene_names) == 1, (len(unique_gene_names), unique_gene_names) + gene_name = unique_gene_names[0] + assert gene_name == "HLA-A", gene_name diff --git a/test/test_gtf_path.py b/test/test_gtf_path.py index d2eca4e..1f26bec 100644 --- a/test/test_gtf_path.py +++ b/test/test_gtf_path.py @@ -35,4 +35,4 @@ def test_gtf_creates_csv_files_in_cache_dir(): # GTF parsing and then saving the parsed results in a csv file gtf_object.dataframe() assert len(glob(search_pattern)) > 0, \ - "Expected GTF to save files in cache_dir" \ No newline at end of file + "Expected GTF to save files in cache_dir" diff --git a/test/test_id_length.py b/test/test_id_length.py index 595a2a0..f6ae8ef 100644 --- a/test/test_id_length.py +++ b/test/test_id_length.py @@ -12,7 +12,7 @@ def check_id_length(method_name): idents = method(contig="Y") assert len(idents) > 0, "No values returned by %s" % method_name assert all(len(ident) == 15 for ident in idents), \ - "Invalid IDs for %s: %s" % ( + "Invalid IDs for %s: %s" % ( method_name, [ident for ident in idents if len(ident) != 15]) diff --git a/test/test_release_versions.py b/test/test_release_versions.py index fdf65bf..c9847b7 100644 --- a/test/test_release_versions.py +++ b/test/test_release_versions.py @@ -1,6 +1,6 @@ from __future__ import absolute_import -from pyensembl import EnsemblRelease 
+from pyensembl import EnsemblRelease, MAX_ENSEMBL_RELEASE from nose.tools import raises @@ -21,10 +21,17 @@ def test_version_is_not_numeric(): def test_version_is_none(): EnsemblRelease(None) +def test_max_ensembl_release(): + assert isinstance(MAX_ENSEMBL_RELEASE, int), \ + "Unexpected type for MAX_ENSEMBL_RELEASE: %s" % ( + type(MAX_ENSEMBL_RELEASE),) + assert 83 <= MAX_ENSEMBL_RELEASE < 1000, \ + "Unexpected value for MAX_ENSEMBL_RELEASE: %d" % MAX_ENSEMBL_RELEASE + def test_int_version(): - for version in range(54, 81): + for version in range(54, MAX_ENSEMBL_RELEASE + 1): # inclusive bound EnsemblRelease(version) def test_str_version(): - for version in range(54, 81): + for version in range(54, MAX_ENSEMBL_RELEASE + 1): # inclusive bound EnsemblRelease(str(version)) diff --git a/test/test_transcript_objects.py b/test/test_transcript_objects.py index 6a6714d..d804a96 100644 --- a/test/test_transcript_objects.py +++ b/test/test_transcript_objects.py @@ -1,6 +1,6 @@ from __future__ import absolute_import -from pyensembl import Locus, ensembl_grch38 +from pyensembl import Locus, cached_release from nose.tools import eq_, assert_not_equal, assert_greater from .common import test_ensembl_releases @@ -17,12 +17,14 @@ TP53_gene_id, ) +ensembl77 = cached_release(77) + def test_transcript_start_codon(): """ test_transcript_start_codon : Check that fields Transcript (for transcript named CTNNBIP1-004) matches known values. """ - CTNNBIP1_004_transcript = ensembl_grch38.transcript_by_id( + CTNNBIP1_004_transcript = ensembl77.transcript_by_id( CTNNBIP1_004_transcript_id) assert Locus.__eq__(CTNNBIP1_004_locus, CTNNBIP1_004_transcript), \ "Expected locus %s but got %s" % ( @@ -48,7 +50,7 @@ def test_transcript_exons(): test_transcript_exons : Ensure that properties of CTTNBIP1-004's Exon objects match known values. 
""" - transcript = ensembl_grch38.transcript_by_id(CTNNBIP1_004_transcript_id) + transcript = ensembl77.transcript_by_id(CTNNBIP1_004_transcript_id) exons = transcript.exons assert isinstance(exons, list), \ "Expected list of Exon objects, got %s : %s" % (exons, type(exons)) @@ -119,7 +121,7 @@ def test_sequence_parts(ensembl): combined_string)) def test_transcript_utr5_sequence_CTNNIP1_004(): - transcript = ensembl_grch38.transcript_by_id(CTNNBIP1_004_transcript_id) + transcript = ensembl77.transcript_by_id(CTNNBIP1_004_transcript_id) utr5 = transcript.five_prime_utr_sequence expected_utr5_length = len(CTNNBIP1_004_UTR5) eq_(len(utr5), @@ -129,7 +131,7 @@ def test_transcript_utr5_sequence_CTNNIP1_004(): eq_(utr5, CTNNBIP1_004_UTR5) def test_transcript_utr3_sequence_CTNNIP1_004(): - transcript = ensembl_grch38.transcript_by_id(CTNNBIP1_004_transcript_id) + transcript = ensembl77.transcript_by_id(CTNNBIP1_004_transcript_id) utr3 = transcript.three_prime_utr_sequence expected_utr3_length = len(CTNNBIP1_004_UTR3) eq_(len(utr3), @@ -139,7 +141,7 @@ def test_transcript_utr3_sequence_CTNNIP1_004(): eq_(utr3, CTNNBIP1_004_UTR3) def test_transcript_cds_CTNNIP1_004(): - transcript = ensembl_grch38.transcript_by_id(CTNNBIP1_004_transcript_id) + transcript = ensembl77.transcript_by_id(CTNNBIP1_004_transcript_id) cds = transcript.coding_sequence expected_cds_length = len(CTNNBIP1_004_CDS) eq_( @@ -163,14 +165,14 @@ def test_not_equal_transcripts(release): assert_not_equal(t1, t2) def test_protein_id(): - transcript = ensembl_grch38.transcripts_by_name("EGFR-001")[0] + transcript = ensembl77.transcripts_by_name("EGFR-001")[0] eq_(transcript.protein_id, "ENSP00000275493") def test_protein_protein_sequence(): - transcript = ensembl_grch38.transcripts_by_name("EGFR-001")[0] + transcript = ensembl77.transcripts_by_name("EGFR-001")[0] eq_(transcript.protein_sequence, EGFR_001_protein_sequence) def test_transcript_gene_should_match_parent_gene(): - gene = 
ensembl_grch38.gene_by_id(TP53_gene_id) + gene = ensembl77.gene_by_id(TP53_gene_id) for transcript in gene.transcripts: eq_(transcript.gene, gene) diff --git a/test/test_ucsc_gtf.py b/test/test_ucsc_gtf.py index 3ee4375..3c6a9fc 100644 --- a/test/test_ucsc_gtf.py +++ b/test/test_ucsc_gtf.py @@ -103,7 +103,7 @@ def test_ucsc_refseq_genome(): transcripts = genome.transcripts() for transcript in transcripts: assert transcript.id, \ - "Transcript with missing ID in %s" % (genome.gtf.dataframe(),) + "Transcript with missing ID in %s" % (genome.gtf.dataframe(),) assert len(transcripts) == 2, \ "Expected 2 transcripts, got %d: %s" % ( len(transcripts), transcripts)