Skip to content

Commit

Permalink
Merge pull request #163 from hammerlab/use-cached-ensembl-release-obj…
Browse files Browse the repository at this point in the history
…ects-after-pickling

Deserialization should construct cached EnsemblRelease objects
  • Loading branch information
iskandr committed Sep 14, 2016
2 parents 423af3c + 0fa961d commit 6fbf8ad
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 26 deletions.
25 changes: 10 additions & 15 deletions pyensembl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from .memory_cache import MemoryCache
from .download_cache import DownloadCache
from .ensembl_release import EnsemblRelease
from .ensembl_release_versions import check_release_number, MAX_ENSEMBL_RELEASE
from .ensembl_release_versions import MAX_ENSEMBL_RELEASE
from .exon import Exon
from .genome import Genome
from .gene import Gene
Expand All @@ -35,28 +35,23 @@
)
from .transcript import Transcript

__version__ = '0.9.6'
__version__ = '0.9.7'

_cache = {}
def cached_release(release, species="human"):
"""
Create an EnsemblRelease instance only if it hasn't already been made,
otherwise return the old instance.
# NOTE(review): this span appears to interleave two versions of the same
# function as rendered by the diff view -- a body that memoizes instances in
# the module-level `_cache` dict, and a single-line delegation to
# `EnsemblRelease.cached`. Confirm against the real file before relying on it.
def cached_release(version, species="human"):
"""Cached construction of EnsemblRelease objects. It's desirable to reuse
the same EnsemblRelease object since each one will store a lot of cached
annotation data in-memory.
Keeping this function for backwards compatibility but this functionality
has been moving into the cached method of EnsemblRelease.
"""
# Dict-based variant: normalize both arguments, then build the instance only
# when the (version, species) key has not been seen before.
version = check_release_number(version)
species = check_species_object(species)
key = (version, species)
if key not in _cache:
ensembl = EnsemblRelease(version, species=species)
_cache[key] = ensembl
return _cache[key]
# Delegating variant: uses the name `release`, which is not a parameter of the
# `def` line directly above -- presumably it pairs with the
# `cached_release(release, ...)` header shown earlier on the page.
return EnsemblRelease.cached(release=release, species=species)

# Map a reference assembly name to a genome object: the name is normalized,
# the species owning that reference is looked up, and the second element of
# the pair stored under that name in `species.reference_assemblies` (bound
# here as `max_ensembl_release`) is passed to `cached_release`.
# NOTE(review): the two `return` lines differ only in passing the release
# positionally vs. as `release=`; they look like the before/after lines of
# the rendered diff rather than one function body -- confirm against the
# actual file.
def genome_for_reference_name(reference_name):
reference_name = normalize_reference_name(reference_name)
species = find_species_by_reference(reference_name)
(_, max_ensembl_release) = species.reference_assemblies[reference_name]
return cached_release(max_ensembl_release, species=species)
return cached_release(release=max_ensembl_release, species=species)

ensembl_grch36 = ensembl54 = cached_release(54) # last release for GRCh36/hg18
ensembl_grch37 = ensembl75 = cached_release(75) # last release for GRCh37/hg19
Expand Down
63 changes: 53 additions & 10 deletions pyensembl/ensembl_release.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
Contains the EnsemblRelease class, which extends the Genome class
to be specific to (a particular release of) Ensembl.
"""
from weakref import WeakValueDictionary

from .genome import Genome
from .ensembl_release_versions import check_release_number, MAX_ENSEMBL_RELEASE
Expand All @@ -32,18 +33,53 @@ class EnsemblRelease(Genome):
Bundles together the genomic annotation and sequence data associated with
a particular release of the Ensembl database.
"""
def __init__(self,
release=MAX_ENSEMBL_RELEASE,
species=human,
server=ENSEMBL_FTP_SERVER):
self.release = check_release_number(release)
self.species = check_species_object(species)
self.server = server

@classmethod
def normalize_init_values(cls, release, species, server):
    """
    Canonicalize the arguments that together identify one EnsemblRelease.

    `release` is passed through `check_release_number` and `species`
    through `check_species_object`; `server` is returned unchanged.

    Returns the tuple (release, species, server).
    """
    # Tuple elements are evaluated left to right, so the release is still
    # validated before the species.
    return (
        check_release_number(release),
        check_species_object(species),
        server,
    )

# Class-level cache of EnsemblRelease instances, keyed by the normalized
# (release, species, server) tuple built in `cached`.
# A WeakValueDictionary is used instead of an ordinary dict to prevent a
# memory leak in cases where we test many different releases in sequence:
# when all other references to a particular EnsemblRelease die, that genome
# is also removed from this cache.
_genome_cache = WeakValueDictionary()

@classmethod
def cached(
        cls,
        release=MAX_ENSEMBL_RELEASE,
        species=human,
        server=ENSEMBL_FTP_SERVER):
    """
    Construct EnsemblRelease if no live instance exists for these
    arguments, otherwise return the existing instance.

    The arguments are normalized with `normalize_init_values` and the
    resulting (release, species, server) tuple is used both as the cache
    key and as the positional arguments for constructing a new instance.

    Returns the cached or newly constructed instance of `cls`.
    """
    init_args_tuple = cls.normalize_init_values(release, species, server)
    # Use one `.get()` lookup rather than an `in` test followed by indexing:
    # `_genome_cache` holds its values weakly, so an entry can be discarded
    # between a membership check and the subsequent lookup, which would
    # surface as a KeyError.
    genome = cls._genome_cache.get(init_args_tuple)
    if genome is None:
        genome = cls(*init_args_tuple)
        cls._genome_cache[init_args_tuple] = genome
    return genome

def __init__(
self,
release=MAX_ENSEMBL_RELEASE,
species=human,
server=ENSEMBL_FTP_SERVER):
self.release, self.species, self.server = self.normalize_init_values(
release=release, species=species, server=server)

self.gtf_url = make_gtf_url(
ensembl_release=self.release,
species=species,
server=server)
species=self.species,
server=self.server)

self.transcript_fasta_url = make_fasta_url(
ensembl_release=self.release,
species=self.species.latin_name,
Expand All @@ -53,7 +89,7 @@ def __init__(self,
ensembl_release=self.release,
species=self.species.latin_name,
sequence_type="pep",
server=server)
server=self.server)

self.reference_name = self.species.which_reference(self.release)

Expand Down Expand Up @@ -92,3 +128,10 @@ def to_dict(self):
"species": self.species,
"server": self.server
}

@classmethod
def from_dict(cls, state_dict):
    """
    Rebuild an EnsemblRelease from a dictionary of keyword arguments.

    The dictionary is expanded into keyword arguments for `cls.cached`, so
    deserialization goes through the instance cache rather than creating
    a duplicate object.
    """
    genome = cls.cached(**state_dict)
    return genome
8 changes: 7 additions & 1 deletion test/test_serialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
setup_init_custom_mouse_genome
)


@test_ensembl_releases()
def test_pickle_ensembl_gene(ensembl_genome):
gene = ensembl_genome.gene_by_id(TP53_gene_id)
Expand Down Expand Up @@ -112,3 +111,10 @@ def test_species_to_json():

def test_species_to_pickle():
eq_(human, pickle.loads(pickle.dumps(human)))


@test_ensembl_releases()
def test_unique_memory_address_of_unpickled_genomes(ensembl_genome):
    """
    A pickle round trip of a genome must return the very same object, not
    merely an equal copy.
    """
    serialized = pickle.dumps(ensembl_genome)
    round_tripped = pickle.loads(serialized)
    # The message is built inside the assert so it is only formatted on failure.
    assert round_tripped is ensembl_genome, (
        "Expected same object for %s but got two different instances" % (
            round_tripped,))

0 comments on commit 6fbf8ad

Please sign in to comment.