Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improving performance #10

Merged
merged 5 commits into from
Dec 1, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
18 changes: 18 additions & 0 deletions mhcgnomes/allele.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,24 @@ def __init__(
self.annotations = tuple(annotations)
self.mutations = tuple(mutations)

def __hash__(self):
    """
    Hash the allele from the tuple of its gene, allele fields,
    annotations and mutations (the attributes __eq__ compares).
    """
    identity = (
        self.gene,
        self.allele_fields,
        self.annotations,
        self.mutations,
    )
    return hash(identity)

def __eq__(self, other):
    """
    Return True only when `other` is exactly an Allele (subclasses are
    rejected by the identity check on the type) and its gene, allele
    fields, annotations and mutations all compare equal to ours.
    """
    if type(other) is not Allele:
        return False
    # Compare in the same order as before, stopping at the first mismatch.
    for attr in ("gene", "allele_fields", "annotations", "mutations"):
        if not (getattr(self, attr) == getattr(other, attr)):
            return False
    return True

@classmethod
def tuple_field_names(cls):
return (
Expand Down
74 changes: 43 additions & 31 deletions mhcgnomes/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import re
from typing import Iterable

def unique(xs : Iterable):
Expand All @@ -28,15 +29,28 @@ def unique(xs : Iterable):
unique_set.add(xi)
return result

def arg_to_cache_key(x, _primitive_types={bool, int, str, float}):
    """
    Convert an argument value into a key usable in a memoization dict.

    - None is returned unchanged.
    - Exact `str` values are returned unchanged (fast path); a string can
      never compare equal to a value of another type, so this is safe.
    - Lists, tuples and dicts are converted recursively into tuples.
    - Every non-str value is wrapped as (type name, value). The tag matters
      for numbers: 1 == 1.0 == True and all three hash alike, so returning
      them untagged would make a cached function called with 1 hand back
      its result for 1.0 or True.

    `_primitive_types` is unused; it is kept only so the signature stays
    compatible with existing callers.
    """
    if x is None:
        return None

    t = type(x)
    if t is str:
        return x

    if t is list or t is tuple:
        value = tuple([arg_to_cache_key(xi) for xi in x])
    elif t is dict:
        value = tuple([
            (arg_to_cache_key(k), arg_to_cache_key(v))
            for (k, v) in x.items()])
    else:
        value = x
    return (t.__name__, value)

def cache(fn):
"""
Expand All @@ -46,44 +60,42 @@ def cache(fn):
"""
cache = {}
def cached_fn(*args, **kwargs):
args_key = arg_to_cache_key(args)
kwargs_key = arg_to_cache_key(kwargs)
if not args:
args_key = ()
else:
args_key = arg_to_cache_key(args)
if not kwargs:
kwargs_key = ()
else:
kwargs_key = arg_to_cache_key(kwargs)
key = (args_key, kwargs_key)
if key not in cache:

result = fn(*args, **kwargs)
cache[key] = result
return cache[key]
return cached_fn

def normalize_string(name, _cache={}):
    """
    Return uppercase string without any surrounding whitespace and
    without any of the characters '-', '_', ':' or the single quote.

    Behavior by input type:
    - exact `str`: normalized as described above; results are memoized
      in `_cache` (a deliberately shared default dict).
    - exact `int` or `float`: returned as str(name), not memoized.
    - anything else (None, bool, lists, ...): returned unchanged.

    Only strings are used as cache keys. Looking up arbitrary values would
    raise TypeError for unhashable inputs, and would confuse 1, 1.0 and
    True, which are equal and share a hash.
    """
    t = type(name)
    if t is str:
        result = _cache.get(name)
        # A normalized string is never None, so None reliably means "miss".
        if result is None:
            result = (
                name.replace("-", "").replace("_", "")
                .replace("'", "").replace(":", "")
                .strip().upper()
            )
            _cache[name] = result
        return result
    if t is float or t is int:
        return str(name)
    return name
12 changes: 11 additions & 1 deletion mhcgnomes/gene.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,21 @@ def __init__(
raw_string=raw_string)
self.name = name

def __hash__(self):
    """
    Hash the gene from its (species, name) pair, the same two
    attributes that __eq__ compares.
    """
    key = (self.species, self.name)
    return hash(key)

def __eq__(self, other):
    """
    Return True only when `other` is exactly a Gene (subclasses are
    rejected) whose species and name both compare equal to ours.
    """
    if type(other) is not Gene:
        return False
    if not (self.species == other.species):
        return False
    return self.name == other.name

@property
def raw_string_was_alias(self):
    """
    Whether the raw string stored on this object is contained in
    the `gene_aliases` collection of its species.
    """
    raw = self.raw_string
    return raw in self.species.gene_aliases


@property
def gene_name(self):
    """Alias for the `name` attribute."""
    return self.name
Expand Down
73 changes: 38 additions & 35 deletions mhcgnomes/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,11 @@ def __init__(
self.gene_seps = gene_seps
self.verbose = verbose

# technically we could just wrap the transform method with @cache
# but since it's called a lot it's faster to make a dedicated cache
# for a single input argument
self._transform_cache = {}

def parse_species_from_prefix(self, name: str):
"""
Returns tuple with two elements:
Expand Down Expand Up @@ -204,27 +209,6 @@ def get_serotype(
alleles=alleles,
raw_string=serotype_name)

def get_haplotype_with_class2_locus(
        self,
        species: Union[Species, str],
        locus_string: str,
        haplotype_string: str):
    """
    Build a haplotype restricted to a single Class II locus.

    Looks up the locus with Class2Locus.get(species, locus_string) and the
    haplotype with self.get_haplotype(species, haplotype_string). If either
    lookup yields None the result is None; otherwise the result is whatever
    haplotype.restrict_class2_locus(..., raise_on_error=False) returns.
    """
    class2_locus = Class2Locus.get(species, locus_string)
    if class2_locus is not None:
        unrestricted = self.get_haplotype(
            species,
            haplotype_string)
        if unrestricted is not None:
            return unrestricted.restrict_class2_locus(
                class2_locus=class2_locus,
                raise_on_error=False)
    return None

def parse_haplotype_with_class2_locus_from_any_string_split(
self,
species: Union[Species, str],
Expand All @@ -233,15 +217,24 @@ def parse_haplotype_with_class2_locus_from_any_string_split(
Try parsing a string like "IAk" into the 'k' mouse haplotype restricted
at the A locus
"""
for locus_length in range(1, len(locus_and_haplotype)):
haplotype_string = self.strip_extra_chars(
locus_and_haplotype[locus_length:])
for locus_length in range(
1,
len(locus_and_haplotype)):
locus_string = self.strip_extra_chars(
locus_and_haplotype[:locus_length])
haplotype = self.get_haplotype_with_class2_locus(
species=species,
locus_string=locus_string,
haplotype_string=haplotype_string)
locus = Class2Locus.get(species, locus_string)
if locus is None:
continue
haplotype_string = self.strip_extra_chars(
locus_and_haplotype[locus_length:])
haplotype = self.get_haplotype(
species,
haplotype_string)
if haplotype is None:
continue
haplotype = haplotype.restrict_class2_locus(
class2_locus=locus,
raise_on_error=False)
if haplotype:
return haplotype
return None
Expand Down Expand Up @@ -361,7 +354,7 @@ def parse_allele_from_allele_fields(
gene: Gene,
allele_fields: Union[str, Sequence[str], None],
functional_annotations: Union[str, Sequence[str], None] = None,
raw_string: Union[str, None] = None) -> Union[Allele, None]:
raw_string: Union[str, None] = None) -> Union[Gene, Allele, None]:
if allele_fields is None:
return None

Expand Down Expand Up @@ -876,14 +869,17 @@ def adjust_raw_strings(
results.append(parse_candidate)
return results

@cache
def transform_parse_candidate(self, parse_candidate : Result):
def transform_parse_candidate(
self,
parse_candidate: Result):
"""
Perform optional transformations on Result objects such as collapsing
singleton serotypes and haplotypes.
"""
if parse_candidate is None:
return None
if parse_candidate in self._transform_cache:
return self._transform_cache[parse_candidate]
t = type(parse_candidate)
transformed = None
if t in (Serotype, Haplotype):
Expand Down Expand Up @@ -965,9 +961,11 @@ def transform_parse_candidate(self, parse_candidate : Result):
print("In: %s" % parse_candidate)
print("Out: %s" % transformed)
if transformed is not None:
return transformed
result = transformed
else:
return parse_candidate
result = parse_candidate
self._transform_cache[parse_candidate] = result
return result

def transform_parse_candidates(
self,
Expand Down Expand Up @@ -1062,6 +1060,7 @@ def parse_single_token_to_multiple_candidates(
print("=== Functions without required species argument ===")
for fn in fns_without_species:
result = fn(seq, default_species=default_species)

if self.verbose:
print("%s('%s', default_species=%s) = %s" % (
fn.__qualname__,
Expand All @@ -1070,6 +1069,8 @@ def parse_single_token_to_multiple_candidates(
else default_species),
('%s' % result if type(result) is str else result)
))
if result is None:
continue
if type(result) in (list, tuple):
parse_candidates.extend(result)
elif isinstance(result, Result):
Expand Down Expand Up @@ -1110,7 +1111,7 @@ def parse_single_token_to_multiple_candidates(
seq,
"None" if not result else '%s' % result
))
if not result:
if result is None:
continue
if type(result) in (list, tuple):
parse_candidates.extend(result)
Expand Down Expand Up @@ -1545,5 +1546,7 @@ def parse(
if infer_class2_pairing:
result = infer_class2_alpha_chain(result)

return result.copy(raw_string=name)
if result.raw_string != name:
result = result.copy(raw_string=name)

return result
20 changes: 10 additions & 10 deletions mhcgnomes/parsing_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,20 @@
# limitations under the License.


def strip_char(s : str, char_to_remove : str):
    """
    Remove every leading and trailing occurrence of `char_to_remove`
    from `s` and return the result.

    An empty `char_to_remove` returns `s` unchanged: every string starts
    with "", so the loops below would otherwise never terminate. Slices
    advance by len(char_to_remove), so a multi-character token is removed
    whole rather than one character at a time.
    """
    if not char_to_remove:
        return s
    width = len(char_to_remove)
    while s.startswith(char_to_remove):
        s = s[width:]
    while s.endswith(char_to_remove):
        s = s[:-width]
    return s

def strip_chars(s : str, chars_to_remove, _cache={}):
    """
    Strip each character of `chars_to_remove`, in order, from both ends
    of `s` and return the result.

    Characters are processed one after another, not as a set: with "- "
    all dashes are stripped from both ends first, then all spaces.

    Results are memoized in `_cache` (a deliberately shared default dict)
    under the key (s, chars_to_remove). Keying on `s` alone would return a
    stale result when the same string is later stripped with a different
    set of characters.
    """
    if not s:
        # Nothing to strip; also lets falsy non-strings such as None pass
        # through unchanged.
        return s
    if isinstance(chars_to_remove, str):
        chars = chars_to_remove
    else:
        # Lists and other iterables are unhashable or single-use; freeze.
        chars = tuple(chars_to_remove)
    key = (s, chars)
    cached = _cache.get(key)
    if cached is not None:
        return cached
    result = s
    for c in chars:
        # Only single characters are ever stripped; longer or empty
        # entries are ignored.
        if len(c) == 1:
            result = result.strip(c)
    _cache[key] = result
    return result

def strip_whitespace_and_dashes(s : str):
    """
    Strip dashes and then whitespace from both ends of `s`.

    Dashes are removed from both ends first, then all surrounding
    whitespace (spaces, tabs, newlines, ...). Stripping only '-' and ' '
    would leave tabs and newlines in place. Falsy input (empty string or
    None) is returned unchanged.
    """
    if not s:
        return s
    return s.strip("-").strip()

def smart_split(seq : str, sep : str):
"""
Expand Down
5 changes: 4 additions & 1 deletion mhcgnomes/result.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,10 @@ def __eq__(self, other):
return True

def __hash__(self):
    """
    Hash the object as the integer sum of the hashes of every attribute
    named by self.hash_field_names().
    """
    combined = 0
    for field_name in self.hash_field_names():
        field_value = getattr(self, field_name)
        combined = combined + hash(field_value)
    return combined

def to_record(self):
raise NotImplementedError(
Expand Down