Skip to content

Commit

Permalink
Merge pull request #10 from til-unc/improving-performance
Browse files Browse the repository at this point in the history
Improving performance
  • Loading branch information
iskandr committed Dec 1, 2020
2 parents 8b4021f + 8817d4f commit 7878e67
Show file tree
Hide file tree
Showing 9 changed files with 186 additions and 98 deletions.
18 changes: 18 additions & 0 deletions mhcgnomes/allele.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,24 @@ def __init__(
self.annotations = tuple(annotations)
self.mutations = tuple(mutations)

def __hash__(self):
    """
    Hash this object on its gene, allele fields, annotations and
    mutations, the same four attributes that __eq__ compares.
    """
    identity = (
        self.gene,
        self.allele_fields,
        self.annotations,
        self.mutations,
    )
    return hash(identity)

def __eq__(self, other):
    """
    Two objects are equal only when `other` is exactly an Allele (not a
    subclass) and gene, allele_fields, annotations and mutations all match.
    """
    if type(other) is not Allele:
        return False
    # Compare field by field in the same order, stopping at the first
    # mismatch.
    for field in ("gene", "allele_fields", "annotations", "mutations"):
        if not (getattr(self, field) == getattr(other, field)):
            return False
    return True

@classmethod
def tuple_field_names(cls):
return (
Expand Down
74 changes: 43 additions & 31 deletions mhcgnomes/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import re
from typing import Iterable

def unique(xs : Iterable):
Expand All @@ -28,15 +29,28 @@ def unique(xs : Iterable):
unique_set.add(xi)
return result

def arg_to_cache_key(x, _primitive_types={bool, int, str, float}):
    """
    Convert an argument value into a hashable, type-aware cache key.

    Parameters
    ----------
    x : any value
        None, str and int are returned unchanged (fast path). Lists,
        tuples and dicts are converted recursively. Everything else
        (including bool and float) is returned as-is inside a
        (type name, value) pair.

    _primitive_types : set
        Unused; retained only so the signature stays compatible.

    Returns
    -------
    None, a str, an int, or a tuple (type_name, value).
    """
    if x is None:
        return None

    t = type(x)
    # Only str and int may be returned bare. In Python 1 == True == 1.0 and
    # all three share a hash, so bool and float must carry their type name
    # or a cached result for f(1) would be served for f(True) or f(1.0).
    if t is str or t is int:
        return x

    if t is list or t is tuple:
        value = tuple([arg_to_cache_key(xi) for xi in x])
    elif t is dict:
        value = tuple([
            (arg_to_cache_key(k), arg_to_cache_key(v))
            for (k, v) in x.items()])
    else:
        value = x
    return (t.__name__, value)

def cache(fn):
"""
Expand All @@ -46,44 +60,42 @@ def cache(fn):
"""
cache = {}
def cached_fn(*args, **kwargs):
args_key = arg_to_cache_key(args)
kwargs_key = arg_to_cache_key(kwargs)
if not args:
args_key = ()
else:
args_key = arg_to_cache_key(args)
if not kwargs:
kwargs_key = ()
else:
kwargs_key = arg_to_cache_key(kwargs)
key = (args_key, kwargs_key)
if key not in cache:

result = fn(*args, **kwargs)
cache[key] = result
return cache[key]
return cached_fn

def normalize_string(name, _cache={}):
    """
    Return an uppercase copy of `name` with surrounding whitespace removed
    and every '-', '_', "'" and ':' character deleted.

    None is returned unchanged, exact int/float values are converted with
    str(), and any other non-str value is returned as-is.

    `_cache` is an intentionally shared memo dictionary. Only exact str
    inputs are looked up in or written to it: keying it on arbitrary
    values would let True or 1.0 retrieve the entry stored for 1 (they
    compare and hash equal) and would raise TypeError on unhashable input.
    """
    if name is None:
        return None

    t = type(name)
    if t is not str:
        if t is float or t is int:
            return str(name)
        return name

    cached = _cache.get(name)
    if cached is not None:
        return cached

    result = (
        name.replace("-", "").replace("_", "")
        .replace("'", "").replace(":", "")
        .strip().upper()
    )
    _cache[name] = result
    return result


def normalize_dict_key(key):
    """
    Normalize a dictionary key with normalize_string, converting lists and
    tuples (recursively) into tuples of normalized elements.
    """
    if type(key) in (list, tuple):
        return tuple([
            normalize_dict_key(sub_key)
            for sub_key in key
        ])
    else:
        return normalize_string(key)
12 changes: 11 additions & 1 deletion mhcgnomes/gene.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,21 @@ def __init__(
raw_string=raw_string)
self.name = name

def __hash__(self):
    """Hash on (species, name), the two attributes __eq__ compares."""
    identity = (self.species, self.name)
    return hash(identity)

def __eq__(self, other):
    """
    Equal only when `other` is exactly a Gene (not a subclass) with the
    same species and the same name.
    """
    if type(other) is not Gene:
        return False
    # Species is compared first, then name.
    if not (self.species == other.species):
        return False
    return self.name == other.name

@property
def raw_string_was_alias(self):
    """Whether this object's raw_string appears in species.gene_aliases."""
    raw = self.raw_string
    known_aliases = self.species.gene_aliases
    return raw in known_aliases


@property
def gene_name(self):
    """Read-only alias that returns this object's `name` attribute."""
    return self.name
Expand Down
73 changes: 38 additions & 35 deletions mhcgnomes/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,11 @@ def __init__(
self.gene_seps = gene_seps
self.verbose = verbose

# technically we could just wrap the transform method with @cache
# but since it's called a lot it's faster to make a dedicated cache
# for a single input argument
self._transform_cache = {}

def parse_species_from_prefix(self, name: str):
"""
Returns tuple with two elements:
Expand Down Expand Up @@ -204,27 +209,6 @@ def get_serotype(
alleles=alleles,
raw_string=serotype_name)

def get_haplotype_with_class2_locus(
        self,
        species: Union[Species, str],
        locus_string: str,
        haplotype_string: str):
    """
    Build a haplotype restricted to a single Class II locus.

    Looks up the locus with Class2Locus.get and the haplotype with
    self.get_haplotype; if either lookup yields None the result is None.
    Otherwise returns haplotype.restrict_class2_locus(...) called with
    raise_on_error=False.
    """
    class2_locus = Class2Locus.get(species, locus_string)
    if class2_locus is not None:
        unrestricted = self.get_haplotype(species, haplotype_string)
        if unrestricted is not None:
            return unrestricted.restrict_class2_locus(
                class2_locus=class2_locus,
                raise_on_error=False)
    return None

def parse_haplotype_with_class2_locus_from_any_string_split(
self,
species: Union[Species, str],
Expand All @@ -233,15 +217,24 @@ def parse_haplotype_with_class2_locus_from_any_string_split(
Try parsing a string like "IAk" into the 'k' mouse haplotype restricted
at the A locus
"""
for locus_length in range(1, len(locus_and_haplotype)):
haplotype_string = self.strip_extra_chars(
locus_and_haplotype[locus_length:])
for locus_length in range(
1,
len(locus_and_haplotype)):
locus_string = self.strip_extra_chars(
locus_and_haplotype[:locus_length])
haplotype = self.get_haplotype_with_class2_locus(
species=species,
locus_string=locus_string,
haplotype_string=haplotype_string)
locus = Class2Locus.get(species, locus_string)
if locus is None:
continue
haplotype_string = self.strip_extra_chars(
locus_and_haplotype[locus_length:])
haplotype = self.get_haplotype(
species,
haplotype_string)
if haplotype is None:
continue
haplotype = haplotype.restrict_class2_locus(
class2_locus=locus,
raise_on_error=False)
if haplotype:
return haplotype
return None
Expand Down Expand Up @@ -361,7 +354,7 @@ def parse_allele_from_allele_fields(
gene: Gene,
allele_fields: Union[str, Sequence[str], None],
functional_annotations: Union[str, Sequence[str], None] = None,
raw_string: Union[str, None] = None) -> Union[Allele, None]:
raw_string: Union[str, None] = None) -> Union[Gene, Allele, None]:
if allele_fields is None:
return None

Expand Down Expand Up @@ -876,14 +869,17 @@ def adjust_raw_strings(
results.append(parse_candidate)
return results

@cache
def transform_parse_candidate(self, parse_candidate : Result):
def transform_parse_candidate(
self,
parse_candidate: Result):
"""
Perform optional transformations on Result objects such as collapsing
singleton serotypes and haplotypes.
"""
if parse_candidate is None:
return None
if parse_candidate in self._transform_cache:
return self._transform_cache[parse_candidate]
t = type(parse_candidate)
transformed = None
if t in (Serotype, Haplotype):
Expand Down Expand Up @@ -965,9 +961,11 @@ def transform_parse_candidate(self, parse_candidate : Result):
print("In: %s" % parse_candidate)
print("Out: %s" % transformed)
if transformed is not None:
return transformed
result = transformed
else:
return parse_candidate
result = parse_candidate
self._transform_cache[parse_candidate] = result
return result

def transform_parse_candidates(
self,
Expand Down Expand Up @@ -1062,6 +1060,7 @@ def parse_single_token_to_multiple_candidates(
print("=== Functions without required species argument ===")
for fn in fns_without_species:
result = fn(seq, default_species=default_species)

if self.verbose:
print("%s('%s', default_species=%s) = %s" % (
fn.__qualname__,
Expand All @@ -1070,6 +1069,8 @@ def parse_single_token_to_multiple_candidates(
else default_species),
('%s' % result if type(result) is str else result)
))
if result is None:
continue
if type(result) in (list, tuple):
parse_candidates.extend(result)
elif isinstance(result, Result):
Expand Down Expand Up @@ -1110,7 +1111,7 @@ def parse_single_token_to_multiple_candidates(
seq,
"None" if not result else '%s' % result
))
if not result:
if result is None:
continue
if type(result) in (list, tuple):
parse_candidates.extend(result)
Expand Down Expand Up @@ -1545,5 +1546,7 @@ def parse(
if infer_class2_pairing:
result = infer_class2_alpha_chain(result)

return result.copy(raw_string=name)
if result.raw_string != name:
result = result.copy(raw_string=name)

return result
20 changes: 10 additions & 10 deletions mhcgnomes/parsing_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,20 @@
# limitations under the License.


def strip_char(s : str, char_to_remove : str):
    """
    Remove every leading and trailing occurrence of `char_to_remove`
    from `s` and return the result.

    `char_to_remove` may be longer than one character; whole occurrences
    are removed. An empty `char_to_remove` leaves `s` unchanged.
    """
    # "".startswith("") is always True, so an empty pattern would never
    # let the loops below terminate.
    if not char_to_remove:
        return s
    # Slice by the pattern length so a multi-character pattern is removed
    # as a unit rather than one character at a time.
    n = len(char_to_remove)
    while s.startswith(char_to_remove):
        s = s[n:]
    while s.endswith(char_to_remove):
        s = s[:-n]
    return s

def strip_chars(s : str, chars_to_remove, _cache={}):
    """
    Strip each character in `chars_to_remove` from both ends of `s`.

    Characters are processed one at a time, in the order given: for each
    one, all leading and then all trailing occurrences are removed before
    moving on to the next character.

    `_cache` is an intentionally shared memo dictionary. Entries are keyed
    on both the input string and the characters to remove, so calls with
    different `chars_to_remove` never share a cached result.
    """
    # Materialize once: gives a hashable key and allows any iterable.
    chars = tuple(chars_to_remove)
    key = (s, chars)
    if key in _cache:
        return _cache[key]
    stripped = s
    for c in chars:
        while stripped and stripped[0] == c:
            stripped = stripped[1:]
        while stripped and stripped[-1] == c:
            stripped = stripped[:-1]
    _cache[key] = stripped
    return stripped

def strip_whitespace_and_dashes(s : str):
    """
    Strip leading/trailing dashes and spaces with strip_chars, then remove
    any remaining surrounding whitespace with str.strip().
    """
    # A single return: the trailing .strip() handles tabs and newlines,
    # which strip_chars(s, "- ") alone leaves in place.
    return strip_chars(s, "- ").strip()

def smart_split(seq : str, sep : str):
"""
Expand Down
5 changes: 4 additions & 1 deletion mhcgnomes/result.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,10 @@ def __eq__(self, other):
return True

def __hash__(self):
    """
    Return the sum of the hashes of every attribute named by
    hash_field_names(). Being a sum, the value does not depend on the
    order of the fields.
    """
    field_hashes = [
        hash(getattr(self, name))
        for name in self.hash_field_names()
    ]
    return sum(field_hashes)

def to_record(self):
raise NotImplementedError(
Expand Down

0 comments on commit 7878e67

Please sign in to comment.