Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Ensembl release 83 #135

Merged
merged 1 commit into from
Feb 22, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,14 @@ install:
- pip install .
- pip install coveralls
script:
# human releases
# older human releases
- pyensembl install --release 54 --species human
- pyensembl install --release 75 --species human
- pyensembl install --release 77 --species human
- pyensembl install --release 81 --species human
# mouse releases
- pyensembl install --release 81 --species mouse
# latest human release
- pyensembl install --release 83 --species human
# latest mouse release
- pyensembl install --release 83 --species mouse
# run tests
- nosetests test --with-coverage --cover-package=pyensembl && ./lint.sh
after_success:
Expand Down
36 changes: 28 additions & 8 deletions pyensembl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from .download_cache import DownloadCache
from .ensembl_release import EnsemblRelease
from .ensembl_release_versions import check_release_number, MAX_ENSEMBL_RELEASE
from .exon import Exon
from .genome import Genome
from .gene import Gene
from .gtf import GTF
Expand Down Expand Up @@ -57,12 +58,31 @@ def genome_for_reference_name(reference_name):

ensembl_grch36 = ensembl54 = cached_release(54) # last release for GRCh36/hg18
ensembl_grch37 = ensembl75 = cached_release(75) # last release for GRCh37/hg19
ensembl_grch38 = cached_release(MAX_ENSEMBL_RELEASE) # most recent for GRCh38

ensembl77 = cached_release(77)
ensembl78 = cached_release(78)
ensembl79 = cached_release(79)
ensembl80 = cached_release(80)
ensembl81 = cached_release(81)
ensembl82 = cached_release(82)
ensembl83 = cached_release(83)
ensembl_grch38 = ensembl83 # most recent for GRCh38

__all__ = [
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know much about __all__ other than some brief Googling; any particular reason you decided to add this now?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought it might be slightly better package etiquette to restrict which things get imported as '*'. I'm not totally sure if it's necessary though.

https://docs.python.org/2/tutorial/modules.html#importing-from-a-package

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

"MemoryCache",
"DownloadCache",
"EnsemblRelease",
"MAX_ENSEMBL_RELEASE",
"cached_release",
"Gene",
"Transcript",
"Exon",
"SequenceData",
"find_nearest_locus",
"find_species_by_name",
"find_species_by_reference",
"which_reference",
"check_species_object",
"normalize_reference_name",
"normalize_species_name",
"Genome",
"GTF",
"Locus",
"Exon",
"ensembl_grch36",
"ensembl_grch37",
"ensembl_grch38",
]
22 changes: 11 additions & 11 deletions pyensembl/genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,10 +148,10 @@ def _get_cached_path(

def _get_gtf_path(self, download_if_missing=False, overwrite=False):
return self._get_cached_path(
field_name="gtf",
path_or_url=self._gtf_path_or_url,
download_if_missing=download_if_missing,
overwrite=overwrite)
field_name="gtf",
path_or_url=self._gtf_path_or_url,
download_if_missing=download_if_missing,
overwrite=overwrite)

def _get_transcript_fasta_path(
self,
Expand Down Expand Up @@ -465,12 +465,12 @@ def gene_ids_at_locus(self, contig, position, end=None, strand=None):

def gene_names_at_locus(self, contig, position, end=None, strand=None):
return self.db.distinct_column_values_at_locus(
column="gene_name",
feature="gene",
contig=contig,
position=position,
end=end,
strand=strand)
column="gene_name",
feature="gene",
contig=contig,
position=position,
end=end,
strand=strand)

def exon_ids_at_locus(self, contig, position, end=None, strand=None):
return self.db.distinct_column_values_at_locus(
Expand Down Expand Up @@ -1040,4 +1040,4 @@ def protein_ids(self, contig=None, strand=None):
strand=strand,
distinct=True)
# drop None values
return [protein_id for protein_id in protein_ids if protein_id]
return [protein_id for protein_id in protein_ids if protein_id]
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
if __name__ == '__main__':
setup(
name='pyensembl',
version="0.8.6",
version="0.8.7",
description="Python interface to ensembl reference genome metadata",
author="Alex Rubinsteyn",
author_email="alex {dot} rubinsteyn {at} mssm {dot} edu",
Expand Down
7 changes: 5 additions & 2 deletions test/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
import functools

from pyensembl import (
ensembl_grch36,
ensembl_grch37,
ensembl_grch38,
cached_release
cached_release,
MAX_ENSEMBL_RELEASE,
)
from nose.tools import nottest

Expand All @@ -23,9 +23,12 @@ def test_ensembl_releases(*versions):
Run a unit test which takes an EnsemblRelease as an argument
for multiple releases (most recent for each reference genome)
"""

if len(versions) == 0:
ensembl_releases = major_releases
else:
if any(version > MAX_ENSEMBL_RELEASE for version in versions):
raise ValueError("Invalid ensembl release numbers: %s" % (versions,))
ensembl_releases = [cached_release(version) for version in versions]

def decorator(test_fn):
Expand Down
8 changes: 4 additions & 4 deletions test/test_ensembl_object_properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@
from __future__ import absolute_import

from nose.tools import eq_
from pyensembl import EnsemblRelease
from pyensembl import EnsemblRelease, MAX_ENSEMBL_RELEASE

def test_human_reference_name():
    """
    test_human_reference_name : check the reference_name reported by
    EnsemblRelease for releases 54, 74, 75 and every release from 76
    through MAX_ENSEMBL_RELEASE.
    """
    eq_(EnsemblRelease(release=54).reference_name, "NCBI36")
    eq_(EnsemblRelease(release=74).reference_name, "GRCh37")
    eq_(EnsemblRelease(release=75).reference_name, "GRCh37")
    # range() excludes its stop value, so add 1 to make sure the newest
    # supported release is checked as well
    for release in range(76, MAX_ENSEMBL_RELEASE + 1):
        eq_(EnsemblRelease(release=release).reference_name, "GRCh38")
13 changes: 7 additions & 6 deletions test/test_exon_id.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
"""
from __future__ import absolute_import

from pyensembl import ensembl_grch38 as ensembl
from pyensembl import cached_release

ensembl = cached_release(77)

# all exons associated with TP53 gene in Ensembl release 77
TP53_EXON_IDS_RELEASE_77 = [
Expand Down Expand Up @@ -85,9 +87,8 @@ def test_exon_ids_of_transcript_name():
len(TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77),
len(exon_ids))
assert all(
exon_id in TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77
for exon_id in exon_ids
)
exon_id in TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77
for exon_id in exon_ids)

def exon_ids_of_transcript_id():
"""
Expand All @@ -101,5 +102,5 @@ def exon_ids_of_transcript_id():
len(TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77),
len(exon_ids))
assert all(
exon_id in TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77
for exon_id in exon_ids)
exon_id in TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77
for exon_id in exon_ids)
27 changes: 24 additions & 3 deletions test/test_exon_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
"""
from __future__ import absolute_import

from pyensembl import ensembl_grch38 as ensembl
from pyensembl import cached_release

ensembl = cached_release(77)

def test_exon_object_by_id():
"""
Expand All @@ -25,8 +27,8 @@ def test_exon_object_by_id():

def test_exon_object_by_id_on_negative_strand():
"""
test_exon_object_by_id : check properties of exon 1 from CXCR3 when looked
up by ID in Ensembl 77.
test_exon_object_by_id_on_negative_strand : check properties of exon 1
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Curious why you're just repeating the name of the test?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's to combat something annoying that nose does when you're running tests. If a test has a docstring then the name nose shows gets replaced by the docstring, which then obscures which test it's running.

from CXCR3 when looked up by ID in Ensembl 77.
"""
exon = ensembl.exon_by_id("ENSE00001817013")
assert exon.gene_name == "CXCR3", \
Expand Down Expand Up @@ -69,3 +71,22 @@ def test_exon_object_at_locus_on_negative_strand():
assert exon.on_negative_strand
assert exon.start <= 71618517, "Unexpected exon start: %s" % exon.start
assert exon.end >= 71618517, "Unexpected exon end: %s" % exon.end

def test_exon_basic_properties_str():
    """
    test_exon_basic_properties_str : str() and repr() of an exon both
    return strings, and those strings are equal.
    """
    exon = ensembl.exon_by_id("ENSE00001817013")
    text = str(exon)
    assert isinstance(text, str)
    representation = repr(exon)
    assert isinstance(representation, str)
    # __str__ and __repr__ are assumed to be interchangeable for now; if
    # that assumption is ever dropped, do so deliberately and update this
    # test along with it
    assert text == representation, "%s != %s" % (text, representation)

def test_exon_basic_properties_hash():
    """
    test_exon_basic_properties_hash : hash() of an exon is an int, gives
    the same value on repeated calls, and differs between two exons that
    compare unequal.
    """
    exon = ensembl.exon_by_id("ENSE00001817013")
    # the trailing comma sits outside the type(...) call so the format
    # argument really is a one-element tuple
    assert isinstance(hash(exon), int), \
        "Hash function returns %s instead of int" % (
            type(hash(exon)),)
    # hash() is deliberately called twice here to check it is repeatable
    assert hash(exon) == hash(exon), "Hash function is non-deterministic!"
    other_exon = ensembl.exon_by_id("ENSE00003464041")
    assert exon != other_exon
    assert hash(exon) != hash(other_exon)
3 changes: 2 additions & 1 deletion test/test_gene_ids.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@
from __future__ import absolute_import

from nose.tools import assert_raises, ok_
from pyensembl import ensembl_grch38, ensembl77
from pyensembl import ensembl_grch38, cached_release

from .common import test_ensembl_releases

ensembl77 = cached_release(77, "human")

def test_gene_ids_grch38_hla_a():
# chr6:29,945,884 is a position for HLA-A
Expand Down
16 changes: 14 additions & 2 deletions test/test_gene_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,19 @@

from .common import test_ensembl_releases

# make sure that familia
KNOWN_GENE_NAMES = [
"TP53",
"ERBB2",
"SMAD4",
"CTAG1A",
"HLA-A",
]

@test_ensembl_releases()
def test_all_gene_names(ensembl):
"""
test_all_gene_names : Make sure some known gene names such as
SMAD4, HSP90AA1, TP53, ERBB2
SMAD4, TP53, ERBB2, &c
"""
gene_names = ensembl.gene_names()
print(type(gene_names))
Expand All @@ -46,3 +46,15 @@ def test_gene_names_on_contig(ensembl):
assert "SMAD4" in gene_names_chr18, \
"No SMAD4 in gene names on chr18 of %s, gene names: %s ... (%d)" % (
ensembl, list(gene_names_chr18[:4]), len(gene_names_chr18))


def test_gene_name_of_HLA_gene_id():
    """
    test_gene_name_of_HLA_gene_id : every gene ID that ensembl_grch38
    returns for the name HLA-A maps back to the single gene name HLA-A.
    """
    names_seen = set()
    for gene_id in ensembl_grch38.gene_ids_of_gene_name("HLA-A"):
        names_seen.add(ensembl_grch38.gene_name_of_gene_id(gene_id))
    unique_gene_names = list(names_seen)
    n_unique = len(unique_gene_names)
    # on failure, report how many distinct names came back and what they were
    assert n_unique == 1, (n_unique, unique_gene_names)
    gene_name = unique_gene_names[0]
    assert gene_name == "HLA-A", gene_name
2 changes: 1 addition & 1 deletion test/test_gtf_path.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,4 @@ def test_gtf_creates_csv_files_in_cache_dir():
# GTF parsing and then saving the parsed results in a csv file
gtf_object.dataframe()
assert len(glob(search_pattern)) > 0, \
"Expected GTF to save files in cache_dir"
"Expected GTF to save files in cache_dir"
2 changes: 1 addition & 1 deletion test/test_id_length.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def check_id_length(method_name):
idents = method(contig="Y")
assert len(idents) > 0, "No values returned by %s" % method_name
assert all(len(ident) == 15 for ident in idents), \
"Invalid IDs for %s: %s" % (
"Invalid IDs for %s: %s" % (
method_name,
[ident for ident in idents if len(ident) != 15])

Expand Down
13 changes: 10 additions & 3 deletions test/test_release_versions.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from __future__ import absolute_import

from pyensembl import EnsemblRelease
from pyensembl import EnsemblRelease, MAX_ENSEMBL_RELEASE

from nose.tools import raises

Expand All @@ -21,10 +21,17 @@ def test_version_is_not_numeric():
def test_version_is_none():
EnsemblRelease(None)

def test_max_ensembl_release():
    """
    test_max_ensembl_release : MAX_ENSEMBL_RELEASE is an int that is at
    least 83 and below 1000.
    """
    release_type = type(MAX_ENSEMBL_RELEASE)
    assert isinstance(MAX_ENSEMBL_RELEASE, int), \
        "Unexpected type for MAX_ENSEMBL_RELEASE: %s" % (release_type,)
    # the type check above runs first, so the %d below only ever sees an int
    in_expected_range = (
        MAX_ENSEMBL_RELEASE >= 83 and MAX_ENSEMBL_RELEASE < 1000)
    assert in_expected_range, \
        "Unexpected value for MAX_ENSEMBL_RELEASE: %d" % MAX_ENSEMBL_RELEASE

def test_int_version():
    """
    test_int_version : EnsemblRelease can be constructed from every int
    release number from 54 through MAX_ENSEMBL_RELEASE.
    """
    # range() stops before its upper bound; + 1 keeps the newest release
    # inside the loop
    for version in range(54, MAX_ENSEMBL_RELEASE + 1):
        EnsemblRelease(version)

def test_str_version():
    """
    test_str_version : EnsemblRelease can be constructed from the string
    form of every release number from 54 through MAX_ENSEMBL_RELEASE.
    """
    # range() stops before its upper bound; + 1 keeps the newest release
    # inside the loop
    for version in range(54, MAX_ENSEMBL_RELEASE + 1):
        EnsemblRelease(str(version))
Loading