From cc543a35740efda39fc9a6a0594c870f0bcdd88e Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Wed, 14 Sep 2016 17:58:35 -0400 Subject: [PATCH 1/5] moved cached_release onto EnsemblRelease object, using cached constructor for deserialization, added test for uniqueness of instances --- pyensembl/__init__.py | 23 +++++++--------- pyensembl/ensembl_release.py | 52 +++++++++++++++++++++++++++++++----- test/test_serialization.py | 8 +++++- 3 files changed, 62 insertions(+), 21 deletions(-) diff --git a/pyensembl/__init__.py b/pyensembl/__init__.py index 71a1ec5..f4d110d 100644 --- a/pyensembl/__init__.py +++ b/pyensembl/__init__.py @@ -17,7 +17,7 @@ from .memory_cache import MemoryCache from .download_cache import DownloadCache from .ensembl_release import EnsemblRelease -from .ensembl_release_versions import check_release_number, MAX_ENSEMBL_RELEASE +from .ensembl_release_versions import MAX_ENSEMBL_RELEASE from .exon import Exon from .genome import Genome from .gene import Gene @@ -37,26 +37,21 @@ __version__ = '0.9.6' -_cache = {} +def cached_release(release, species="human"): + """ + Create an EnsemblRelease instance only if it's hasn't already been made, + otherwise returns the old instance. -def cached_release(version, species="human"): - """Cached construction of EnsemblRelease objects. It's desirable to reuse - the same EnsemblRelease object since each one will store a lot of cached - annotation data in-memory. + Keeping this function for backwards compatibility but this functionality + has been moving onto the EnsemblRelease object itself by overriding __new__. """ - version = check_release_number(version) - species = check_species_object(species) - key = (version, species) - if key not in _cache: - ensembl = EnsemblRelease(version, species=species) - _cache[key] = ensembl - return _cache[key] + return EnsemblRelease.cached(release=release, species=species) def genome_for_reference_name(reference_name): reference_name = normalize_reference_name(reference_name) species = find_species_by_reference(reference_name) (_, max_ensembl_release) = species.reference_assemblies[reference_name] - return cached_release(max_ensembl_release, species=species) + return cached_release(release=max_ensembl_release, species=species) ensembl_grch36 = ensembl54 = cached_release(54) # last release for GRCh36/hg18 ensembl_grch37 = ensembl75 = cached_release(75) # last release for GRCh37/hg19 diff --git a/pyensembl/ensembl_release.py b/pyensembl/ensembl_release.py index c6fe5f9..c68d43c 100644 --- a/pyensembl/ensembl_release.py +++ b/pyensembl/ensembl_release.py @@ -16,6 +16,7 @@ Contains the EnsemblRelease class, which extends the Genome class to be specific to (a particular release of) Ensembl. """ +from weakref import WeakValueDictionary from .genome import Genome from .ensembl_release_versions import check_release_number, MAX_ENSEMBL_RELEASE @@ -32,12 +33,44 @@ class EnsemblRelease(Genome): Bundles together the genomic annotation and sequence data associated with a particular release of the Ensembl database. """ - def __init__(self, - release=MAX_ENSEMBL_RELEASE, - species=human, - server=ENSEMBL_FTP_SERVER): - self.release = check_release_number(release) - self.species = check_species_object(species) + + @classmethod + def normalize_init_values(cls, release, species, server): + """ + Normalizes the arguments which uniquely specify an EnsemblRelease + genome. + """ + release = check_release_number(release) + species = check_species_object(species) + return (release, species, server) + + _genome_cache = WeakValueDictionary() + + @classmethod + def cached( + cls, + release=MAX_ENSEMBL_RELEASE, + species=human, + server=ENSEMBL_FTP_SERVER): + """ + Construct EnsemblRelease if it's never been made before, otherwise + return an old instance. + """ + init_args_tuple = cls.normalize_init_values(release, species, server) + if init_args_tuple in cls._genome_cache: + genome = cls._genome_cache[init_args_tuple] + else: + genome = cls._genome_cache[init_args_tuple] = cls(*init_args_tuple) + return genome + + def __init__( + self, + release=MAX_ENSEMBL_RELEASE, + species=human, + server=ENSEMBL_FTP_SERVER): + self.release, self.species, self.server = self.normalize_init_values( + release=release, species=species, server=server) + self.species = species self.server = server self.gtf_url = make_gtf_url( @@ -92,3 +125,10 @@ def to_dict(self): "species": self.species, "server": self.server } + + @classmethod + def from_dict(cls, state_dict): + """ + Deserialize EnsemblRelease without creating duplicate instances. + """ + return cls.cached(**state_dict) diff --git a/test/test_serialization.py b/test/test_serialization.py index 8251fc8..a2bd13d 100644 --- a/test/test_serialization.py +++ b/test/test_serialization.py @@ -25,7 +25,6 @@ setup_init_custom_mouse_genome ) - @test_ensembl_releases() def test_pickle_ensembl_gene(ensembl_genome): gene = ensembl_genome.gene_by_id(TP53_gene_id) @@ -112,3 +111,10 @@ def test_species_to_json(): def test_species_to_pickle(): eq_(human, pickle.loads(pickle.dumps(human))) + + +@test_ensembl_releases() +def test_unique_memory_address_of_unpickled_genomes(ensembl_genome): + unpickled = pickle.loads(pickle.dumps(ensembl_genome)) + assert ensembl_genome is unpickled, \ + "Expected same object for %s but got two different instances" % (unpickled,) From e99a0c5bc9d920b5d1780e81d7ed8d61f0737384 Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Wed, 14 Sep 2016 18:00:02 -0400 Subject: [PATCH 2/5] added comment explaining use of WeakValueDictionary --- pyensembl/ensembl_release.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyensembl/ensembl_release.py b/pyensembl/ensembl_release.py index c68d43c..35bd985 100644 --- a/pyensembl/ensembl_release.py +++ b/pyensembl/ensembl_release.py @@ -44,6 +44,10 @@ def normalize_init_values(cls, release, species, server): species = check_species_object(species) return (release, species, server) + # Using a WeakValueDictionary instead of an ordinary dict to prevent a + # memory leak in cases where we test many different releases in sequence. + # When all the references to a particular EnsemblRelease die then that + # genome should also be removed from this cache. _genome_cache = WeakValueDictionary() @classmethod From 9f426926a0db8229cb735725cb4d41959d24890d Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Wed, 14 Sep 2016 18:07:47 -0400 Subject: [PATCH 3/5] always use self.species after normalization --- pyensembl/ensembl_release.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pyensembl/ensembl_release.py b/pyensembl/ensembl_release.py index 35bd985..20a8967 100644 --- a/pyensembl/ensembl_release.py +++ b/pyensembl/ensembl_release.py @@ -74,13 +74,12 @@ def __init__( server=ENSEMBL_FTP_SERVER): self.release, self.species, self.server = self.normalize_init_values( release=release, species=species, server=server) - self.species = species - self.server = server self.gtf_url = make_gtf_url( ensembl_release=self.release, - species=species, - server=server) + species=self.species, + server=self.server) + self.transcript_fasta_url = make_fasta_url( ensembl_release=self.release, species=self.species.latin_name, @@ -90,7 +89,7 @@ def __init__( ensembl_release=self.release, species=self.species.latin_name, sequence_type="pep", - server=server) + server=self.server) self.reference_name = self.species.which_reference(self.release) From d697ece0ab89c93b23f790d6838b69a9496b2bff Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Wed, 14 Sep 2016 18:11:02 -0400 Subject: [PATCH 4/5] version bump --- pyensembl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyensembl/__init__.py b/pyensembl/__init__.py index f4d110d..14e5245 100644 --- a/pyensembl/__init__.py +++ b/pyensembl/__init__.py @@ -35,7 +35,7 @@ ) from .transcript import Transcript -__version__ = '0.9.6' +__version__ = '0.9.7' def cached_release(release, species="human"): """ From 0fa961d6e413ed30f1b9506bb8415e25683401fb Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Wed, 14 Sep 2016 18:28:11 -0400 Subject: [PATCH 5/5] fixed comment, not touching __new__ --- pyensembl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyensembl/__init__.py b/pyensembl/__init__.py index 14e5245..c019683 100644 --- a/pyensembl/__init__.py +++ b/pyensembl/__init__.py @@ -43,7 +43,7 @@ def cached_release(release, species="human"): otherwise returns the old instance. Keeping this function for backwards compatibility but this functionality - has been moving onto the EnsemblRelease object itself by overriding __new__. + has been moving into the cached method of EnsemblRelease. """ return EnsemblRelease.cached(release=release, species=species)