Skip to content

Commit

Permalink
Merge pull request #135 from hammerlab/fewer-default-ensembl-release-…
Browse files Browse the repository at this point in the history
…objects

Fix Ensembl release 83
  • Loading branch information
iskandr committed Feb 22, 2016
2 parents 7c610ad + 5be572a commit c70d32b
Show file tree
Hide file tree
Showing 15 changed files with 125 additions and 57 deletions.
9 changes: 5 additions & 4 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,14 @@ install:
- pip install .
- pip install coveralls
script:
# human releases
# older human releases
- pyensembl install --release 54 --species human
- pyensembl install --release 75 --species human
- pyensembl install --release 77 --species human
- pyensembl install --release 81 --species human
# mouse releases
- pyensembl install --release 81 --species mouse
# latest human release
- pyensembl install --release 83 --species human
# latest mouse release
- pyensembl install --release 83 --species mouse
# run tests
- nosetests test --with-coverage --cover-package=pyensembl && ./lint.sh
after_success:
Expand Down
36 changes: 28 additions & 8 deletions pyensembl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from .download_cache import DownloadCache
from .ensembl_release import EnsemblRelease
from .ensembl_release_versions import check_release_number, MAX_ENSEMBL_RELEASE
from .exon import Exon
from .genome import Genome
from .gene import Gene
from .gtf import GTF
Expand Down Expand Up @@ -57,12 +58,31 @@ def genome_for_reference_name(reference_name):

ensembl_grch36 = ensembl54 = cached_release(54) # last release for GRCh36/hg18
ensembl_grch37 = ensembl75 = cached_release(75) # last release for GRCh37/hg19
ensembl_grch38 = cached_release(MAX_ENSEMBL_RELEASE) # most recent for GRCh38

ensembl77 = cached_release(77)
ensembl78 = cached_release(78)
ensembl79 = cached_release(79)
ensembl80 = cached_release(80)
ensembl81 = cached_release(81)
ensembl82 = cached_release(82)
ensembl83 = cached_release(83)
ensembl_grch38 = ensembl83 # most recent for GRCh38

# Names exported by `from pyensembl import *`. Each name is listed exactly
# once; the previous list repeated "Exon".
__all__ = [
    "MemoryCache",
    "DownloadCache",
    "EnsemblRelease",
    "MAX_ENSEMBL_RELEASE",
    "cached_release",
    "Gene",
    "Transcript",
    "Exon",
    "SequenceData",
    "find_nearest_locus",
    "find_species_by_name",
    "find_species_by_reference",
    "which_reference",
    "check_species_object",
    "normalize_reference_name",
    "normalize_species_name",
    "Genome",
    "GTF",
    "Locus",
    "ensembl_grch36",
    "ensembl_grch37",
    "ensembl_grch38",
]
22 changes: 11 additions & 11 deletions pyensembl/genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,10 +148,10 @@ def _get_cached_path(

def _get_gtf_path(self, download_if_missing=False, overwrite=False):
return self._get_cached_path(
field_name="gtf",
path_or_url=self._gtf_path_or_url,
download_if_missing=download_if_missing,
overwrite=overwrite)
field_name="gtf",
path_or_url=self._gtf_path_or_url,
download_if_missing=download_if_missing,
overwrite=overwrite)

def _get_transcript_fasta_path(
self,
Expand Down Expand Up @@ -465,12 +465,12 @@ def gene_ids_at_locus(self, contig, position, end=None, strand=None):

def gene_names_at_locus(self, contig, position, end=None, strand=None):
return self.db.distinct_column_values_at_locus(
column="gene_name",
feature="gene",
contig=contig,
position=position,
end=end,
strand=strand)
column="gene_name",
feature="gene",
contig=contig,
position=position,
end=end,
strand=strand)

def exon_ids_at_locus(self, contig, position, end=None, strand=None):
return self.db.distinct_column_values_at_locus(
Expand Down Expand Up @@ -1040,4 +1040,4 @@ def protein_ids(self, contig=None, strand=None):
strand=strand,
distinct=True)
# drop None values
return [protein_id for protein_id in protein_ids if protein_id]
return [protein_id for protein_id in protein_ids if protein_id]
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
if __name__ == '__main__':
setup(
name='pyensembl',
version="0.8.6",
version="0.8.7",
description="Python interface to ensembl reference genome metadata",
author="Alex Rubinsteyn",
author_email="alex {dot} rubinsteyn {at} mssm {dot} edu",
Expand Down
7 changes: 5 additions & 2 deletions test/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
import functools

from pyensembl import (
ensembl_grch36,
ensembl_grch37,
ensembl_grch38,
cached_release
cached_release,
MAX_ENSEMBL_RELEASE,
)
from nose.tools import nottest

Expand All @@ -23,9 +23,12 @@ def test_ensembl_releases(*versions):
Run a unit test which takes an EnsemblRelease as an argument
for multiple releases (most recent for each reference genome)
"""

if len(versions) == 0:
ensembl_releases = major_releases
else:
if any(version > MAX_ENSEMBL_RELEASE for version in versions):
raise ValueError("Invalid ensembl release numbers: %s" % (versions,))
ensembl_releases = [cached_release(version) for version in versions]

def decorator(test_fn):
Expand Down
8 changes: 4 additions & 4 deletions test/test_ensembl_object_properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@
from __future__ import absolute_import

from nose.tools import eq_
from pyensembl import EnsemblRelease
from pyensembl import EnsemblRelease, MAX_ENSEMBL_RELEASE

def test_human_reference_name():
    """
    Check EnsemblRelease.reference_name across releases: "NCBI36" for
    release 54, "GRCh37" for releases 74 and 75, and "GRCh38" for every
    release from 76 up to and including MAX_ENSEMBL_RELEASE.
    """
    eq_(EnsemblRelease(release=54).reference_name, "NCBI36")
    eq_(EnsemblRelease(release=74).reference_name, "GRCh37")
    eq_(EnsemblRelease(release=75).reference_name, "GRCh37")
    # range() excludes its stop value, so add 1 to also check the newest
    # supported release rather than stopping one short of it.
    for release in range(76, MAX_ENSEMBL_RELEASE + 1):
        eq_(EnsemblRelease(release=release).reference_name, "GRCh38")
13 changes: 7 additions & 6 deletions test/test_exon_id.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
"""
from __future__ import absolute_import

from pyensembl import ensembl_grch38 as ensembl
from pyensembl import cached_release

ensembl = cached_release(77)

# all exons associated with TP53 gene in Ensembl release 77
TP53_EXON_IDS_RELEASE_77 = [
Expand Down Expand Up @@ -85,9 +87,8 @@ def test_exon_ids_of_transcript_name():
len(TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77),
len(exon_ids))
assert all(
exon_id in TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77
for exon_id in exon_ids
)
exon_id in TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77
for exon_id in exon_ids)

def exon_ids_of_transcript_id():
"""
Expand All @@ -101,5 +102,5 @@ def exon_ids_of_transcript_id():
len(TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77),
len(exon_ids))
assert all(
exon_id in TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77
for exon_id in exon_ids)
exon_id in TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77
for exon_id in exon_ids)
27 changes: 24 additions & 3 deletions test/test_exon_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
"""
from __future__ import absolute_import

from pyensembl import ensembl_grch38 as ensembl
from pyensembl import cached_release

ensembl = cached_release(77)

def test_exon_object_by_id():
"""
Expand All @@ -25,8 +27,8 @@ def test_exon_object_by_id():

def test_exon_object_by_id_on_negative_strand():
"""
test_exon_object_by_id : check properties of exon 1 from CXCR3 when looked
up by ID in Ensembl 77.
test_exon_object_by_id_on_negative_strand : check properties of exon 1
from CXCR3 when looked up by ID in Ensembl 77.
"""
exon = ensembl.exon_by_id("ENSE00001817013")
assert exon.gene_name == "CXCR3", \
Expand Down Expand Up @@ -69,3 +71,22 @@ def test_exon_object_at_locus_on_negative_strand():
assert exon.on_negative_strand
assert exon.start <= 71618517, "Unexpected exon start: %s" % exon.start
assert exon.end >= 71618517, "Unexpected exon end: %s" % exon.end

def test_exon_basic_properties_str():
    """
    str() and repr() of an exon looked up by ID must both be strings and
    must currently be equal to each other.
    """
    cxcr3_exon = ensembl.exon_by_id("ENSE00001817013")
    as_str = str(cxcr3_exon)
    as_repr = repr(cxcr3_exon)
    assert isinstance(as_str, str)
    assert isinstance(as_repr, str)
    # __str__ and __repr__ are assumed to be interchangeable for now; if
    # that ever stops being true, change it deliberately and update this
    # test along with it.
    assert as_str == as_repr, "%s != %s" % (as_str, as_repr)

def test_exon_basic_properties_hash():
    """
    hash() of an exon must be an int, must be stable across calls, and
    must differ between two distinct exons (which must also compare unequal).
    """
    exon = ensembl.exon_by_id("ENSE00001817013")
    # Build a real 1-tuple for the % operator; the original put the trailing
    # comma inside the type() call, so the format argument was a bare object.
    assert isinstance(hash(exon), int), \
        "Hash function returns %s instead of int" % (type(hash(exon)),)
    assert hash(exon) == hash(exon), "Hash function is non-deterministic!"
    other_exon = ensembl.exon_by_id("ENSE00003464041")
    assert exon != other_exon
    assert hash(exon) != hash(other_exon)
3 changes: 2 additions & 1 deletion test/test_gene_ids.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@
from __future__ import absolute_import

from nose.tools import assert_raises, ok_
from pyensembl import ensembl_grch38, ensembl77
from pyensembl import ensembl_grch38, cached_release

from .common import test_ensembl_releases

ensembl77 = cached_release(77, "human")

def test_gene_ids_grch38_hla_a():
# chr6:29,945,884 is a position for HLA-A
Expand Down
16 changes: 14 additions & 2 deletions test/test_gene_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,19 @@

from .common import test_ensembl_releases

# make sure that familia
KNOWN_GENE_NAMES = [
"TP53",
"ERBB2",
"SMAD4",
"CTAG1A",
"HLA-A",
]

@test_ensembl_releases()
def test_all_gene_names(ensembl):
"""
test_all_gene_names : Make sure some known gene names such as
SMAD4, HSP90AA1, TP53, ERBB2
SMAD4, TP53, ERBB2, &c
"""
gene_names = ensembl.gene_names()
print(type(gene_names))
Expand All @@ -46,3 +46,15 @@ def test_gene_names_on_contig(ensembl):
assert "SMAD4" in gene_names_chr18, \
"No SMAD4 in gene names on chr18 of %s, gene names: %s ... (%d)" % (
ensembl, list(gene_names_chr18[:4]), len(gene_names_chr18))


def test_gene_name_of_HLA_gene_id():
    """
    Every gene ID returned for the name "HLA-A" must map back to the single
    gene name "HLA-A".
    """
    hla_gene_ids = ensembl_grch38.gene_ids_of_gene_name("HLA-A")
    # collapse the round-tripped names to their distinct values
    unique_gene_names = list({
        ensembl_grch38.gene_name_of_gene_id(hla_gene_id)
        for hla_gene_id in hla_gene_ids
    })
    n_unique = len(unique_gene_names)
    assert n_unique == 1, (n_unique, unique_gene_names)
    only_name = unique_gene_names[0]
    assert only_name == "HLA-A", only_name
2 changes: 1 addition & 1 deletion test/test_gtf_path.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,4 @@ def test_gtf_creates_csv_files_in_cache_dir():
# GTF parsing and then saving the parsed results in a csv file
gtf_object.dataframe()
assert len(glob(search_pattern)) > 0, \
"Expected GTF to save files in cache_dir"
"Expected GTF to save files in cache_dir"
2 changes: 1 addition & 1 deletion test/test_id_length.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def check_id_length(method_name):
idents = method(contig="Y")
assert len(idents) > 0, "No values returned by %s" % method_name
assert all(len(ident) == 15 for ident in idents), \
"Invalid IDs for %s: %s" % (
"Invalid IDs for %s: %s" % (
method_name,
[ident for ident in idents if len(ident) != 15])

Expand Down
13 changes: 10 additions & 3 deletions test/test_release_versions.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from __future__ import absolute_import

from pyensembl import EnsemblRelease
from pyensembl import EnsemblRelease, MAX_ENSEMBL_RELEASE

from nose.tools import raises

Expand All @@ -21,10 +21,17 @@ def test_version_is_not_numeric():
def test_version_is_none():
EnsemblRelease(None)

def test_max_ensembl_release():
    """
    MAX_ENSEMBL_RELEASE must be an int in the range [83, 1000).
    """
    newest = MAX_ENSEMBL_RELEASE
    assert isinstance(newest, int), (
        "Unexpected type for MAX_ENSEMBL_RELEASE: %s" % (type(newest),))
    assert newest >= 83 and newest < 1000, (
        "Unexpected value for MAX_ENSEMBL_RELEASE: %d" % newest)

def test_int_version():
    """
    EnsemblRelease must accept every integer release number from 54 up to
    and including MAX_ENSEMBL_RELEASE.
    """
    # range() excludes its stop value; + 1 makes sure the newest release is
    # constructed as well.
    for version in range(54, MAX_ENSEMBL_RELEASE + 1):
        EnsemblRelease(version)

def test_str_version():
    """
    EnsemblRelease must accept every release number from 54 up to and
    including MAX_ENSEMBL_RELEASE when it is passed as a string.
    """
    # range() excludes its stop value; + 1 makes sure the newest release is
    # constructed as well.
    for version in range(54, MAX_ENSEMBL_RELEASE + 1):
        EnsemblRelease(str(version))
Loading

0 comments on commit c70d32b

Please sign in to comment.