pirl-unc · iskandr · Dec 1, 2020 · Dec 1, 2020 · Dec 1, 2020 · Dec 1, 2020
diff --git a/mhcgnomes/allele.py b/mhcgnomes/allele.py
@@ -38,6 +38,24 @@ def __init__(
         self.annotations = tuple(annotations)
         self.mutations = tuple(mutations)
 
+    def __hash__(self):
+        """Hash the same four fields that __eq__ compares."""
+        identity = (
+            self.gene, self.allele_fields,
+            self.annotations, self.mutations,
+        )
+        return hash(identity)
+
+    def __eq__(self, other):
+        """Equal only to an exact Allele with identical defining fields."""
+        if type(other) is Allele:
+            return (
+                self.gene == other.gene
+                and self.allele_fields == other.allele_fields
+                and self.annotations == other.annotations
+                and self.mutations == other.mutations)
+        return False
+
     @classmethod
     def tuple_field_names(cls):
         return (

diff --git a/mhcgnomes/common.py b/mhcgnomes/common.py
@@ -10,6 +10,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import re
 from typing import Iterable
 
 def unique(xs : Iterable):
@@ -28,15 +29,28 @@ def unique(xs : Iterable):
         unique_set.add(xi)
     return result
 
-def arg_to_cache_key(x):
-    if type(x) in {list, tuple}:
-        value = tuple([arg_to_cache_key(xi) for xi in x])
-    elif type(x) is dict:
+def arg_to_cache_key(x, _primitive_types=frozenset((bool, int, str, float))):
+    """
+    Build a cache key from an argument value (used by `cache`).
+
+    None and str are returned as-is; everything else is tagged with its
+    type name, since True == 1 == 1.0 would otherwise share a cache entry.
+    Lists, tuples and dicts are converted recursively.
+    """
+    if x is None:
+        return None
+    t = type(x)
+    if t in _primitive_types:
+        return x if t is str else (t.__name__, x)
+    if t is list or t is tuple:
+        value = tuple([arg_to_cache_key(xi) for xi in x])
+    elif t is dict:
         value = tuple([
-            (arg_to_cache_key(k), arg_to_cache_key(v)) for (k, v) in x.items()])
+            (arg_to_cache_key(k), arg_to_cache_key(v))
+            for (k, v) in x.items()])
     else:
         value = x
-    return (type(x).__name__, value)
+    return (t.__name__, value)
 
 def cache(fn):
     """
@@ -46,44 +60,42 @@ def cache(fn):
     """
     cache = {}
     def cached_fn(*args, **kwargs):
-        args_key = arg_to_cache_key(args)
-        kwargs_key = arg_to_cache_key(kwargs)
+        if not args:
+            args_key = ()
+        else:
+            args_key = arg_to_cache_key(args)
+        if not kwargs:
+            kwargs_key = ()
+        else:
+            kwargs_key = arg_to_cache_key(kwargs)
         key = (args_key, kwargs_key)
         if key not in cache:
-
             result = fn(*args, **kwargs)
             cache[key] = result
         return cache[key]
     return cached_fn
 
-def normalize_string(name, chars_to_remove="-_':"):
+def normalize_string(name, _cache={}):
     """
     Return uppercase string without any surrounding whitespace and
     without any characters such as '-', '_' ':' or "'"
     """
     if name is None:
         return None
-    if type(name) in (float, int):
-        name = str(name)
 
+    # Only str inputs are memoized: 1, 1.0 and True would share one dict
+    # slot, and unhashable inputs cannot be used as keys at all.
 
-    if not isinstance(name, str):
-        return name
-
-    if " " in name:
-        name = name.strip()
-    name = name.upper()
-    for char in chars_to_remove:
-        if char in name:
-            name = name.replace(char, "")
-    return name
-
-
-def normalize_dict_key(key):
-    if type(key) in (list, tuple):
-        return tuple([
-            normalize_dict_key(sub_key)
-            for sub_key in key
-        ])
+    t = type(name)
+    if t is float or t is int:
+        return str(name)
+    elif t is not str:
+        return name
     else:
-        return normalize_string(key)
+        result = _cache.get(name)
+        if result is None:
+            result = _cache[name] = (
+                name.replace("-", "").replace("_", "")
+                    .replace("'", "").replace(":", "")
+                    .strip().upper())
+        return result
diff --git a/mhcgnomes/gene.py b/mhcgnomes/gene.py
@@ -32,11 +32,21 @@ def __init__(
             raw_string=raw_string)
         self.name = name
 
+    def __hash__(self):
+        return hash((self.species, self.name))  # same fields __eq__ compares
+
+    def __eq__(self, other):
+        """Equal only to an exact Gene with the same species and name."""
+        if type(other) is Gene:
+            return (
+                self.species == other.species
+                and self.name == other.name)
+        return False
+
     @property
     def raw_string_was_alias(self):
         return self.raw_string in self.species.gene_aliases
 
-
     @property
     def gene_name(self):
         return self.name

diff --git a/mhcgnomes/parser.py b/mhcgnomes/parser.py
@@ -92,6 +92,11 @@ def __init__(
         self.gene_seps = gene_seps
         self.verbose = verbose
 
+        # technically we could just wrap the transform method with @cache
+        # but since it's called a lot it's faster to make a dedicated cache
+        # for a single input argument
+        self._transform_cache = {}
+
     def parse_species_from_prefix(self, name: str):
         """
         Returns tuple with two elements:
@@ -204,27 +209,6 @@ def get_serotype(
             alleles=alleles,
             raw_string=serotype_name)
 
-    def get_haplotype_with_class2_locus(
-            self,
-            species: Union[Species, str],
-            locus_string: str,
-            haplotype_string: str):
-        """
-        Construct a haplotype limited at a specific Class II locus
-        Returns Haplotype or None
-        """
-        locus = Class2Locus.get(species, locus_string)
-        if locus is None:
-            return None
-        haplotype = self.get_haplotype(
-            species,
-            haplotype_string)
-        if haplotype is None:
-            return None
-        return haplotype.restrict_class2_locus(
-            class2_locus=locus,
-            raise_on_error=False)
-
     def parse_haplotype_with_class2_locus_from_any_string_split(
             self,
             species: Union[Species, str],
@@ -233,15 +217,24 @@ def parse_haplotype_with_class2_locus_from_any_string_split(
          Try parsing a string like "IAk" into the 'k' mouse haplotype restricted
          at the A locus
          """
-        for locus_length in range(1, len(locus_and_haplotype)):
-            haplotype_string = self.strip_extra_chars(
-                locus_and_haplotype[locus_length:])
+        for locus_length in range(
+                1,
+                len(locus_and_haplotype)):
             locus_string = self.strip_extra_chars(
                 locus_and_haplotype[:locus_length])
-            haplotype = self.get_haplotype_with_class2_locus(
-                species=species,
-                locus_string=locus_string,
-                haplotype_string=haplotype_string)
+            locus = Class2Locus.get(species, locus_string)
+            if locus is None:
+                continue
+            haplotype_string = self.strip_extra_chars(
+                locus_and_haplotype[locus_length:])
+            haplotype = self.get_haplotype(
+                species,
+                haplotype_string)
+            if haplotype is None:
+                continue
+            haplotype = haplotype.restrict_class2_locus(
+                class2_locus=locus,
+                raise_on_error=False)
             if haplotype:
                 return haplotype
         return None
@@ -361,7 +354,7 @@ def parse_allele_from_allele_fields(
             gene: Gene,
             allele_fields: Union[str, Sequence[str], None],
             functional_annotations: Union[str, Sequence[str], None] = None,
-            raw_string: Union[str, None] = None) -> Union[Allele, None]:
+            raw_string: Union[str, None] = None) -> Union[Gene, Allele, None]:
         if allele_fields is None:
             return None
 
@@ -876,14 +869,17 @@ def adjust_raw_strings(
             results.append(parse_candidate)
         return results
 
-    @cache
-    def transform_parse_candidate(self, parse_candidate : Result):
+    def transform_parse_candidate(
+            self,
+            parse_candidate: Result):
         """
         Perform optional transformations on Result objects such as collapsing
         singleton serotypes and haplotypes.
         """
         if parse_candidate is None:
             return None
+        if parse_candidate in self._transform_cache:
+            return self._transform_cache[parse_candidate]
         t = type(parse_candidate)
         transformed = None
         if t in (Serotype, Haplotype):
@@ -965,9 +961,11 @@ def transform_parse_candidate(self, parse_candidate : Result):
             print("In:  %s" % parse_candidate)
             print("Out: %s" % transformed)
         if transformed is not None:
-            return transformed
+            result = transformed
         else:
-            return parse_candidate
+            result = parse_candidate
+        self._transform_cache[parse_candidate] = result
+        return result
 
     def transform_parse_candidates(
             self,
@@ -1062,6 +1060,7 @@ def parse_single_token_to_multiple_candidates(
             print("=== Functions without required species argument ===")
         for fn in fns_without_species:
             result = fn(seq, default_species=default_species)
+
             if self.verbose:
                 print("%s('%s', default_species=%s) = %s" % (
                     fn.__qualname__,
@@ -1070,6 +1069,8 @@ def parse_single_token_to_multiple_candidates(
                         else default_species),
                     ('%s' % result if type(result) is str else result)
                 ))
+            if result is None:
+                continue
             if type(result) in (list, tuple):
                 parse_candidates.extend(result)
             elif isinstance(result, Result):
@@ -1110,7 +1111,7 @@ def parse_single_token_to_multiple_candidates(
                             seq,
                             "None" if not result else '%s' % result
                         ))
-                    if not result:
+                    if result is None:
                         continue
                     if type(result) in (list, tuple):
                         parse_candidates.extend(result)
@@ -1545,5 +1546,7 @@ def parse(
         if infer_class2_pairing:
             result = infer_class2_alpha_chain(result)
 
-        return result.copy(raw_string=name)
+        if result.raw_string != name:
+            result = result.copy(raw_string=name)
 
+        return result
diff --git a/mhcgnomes/parsing_helpers.py b/mhcgnomes/parsing_helpers.py
@@ -11,20 +11,20 @@
 # limitations under the License.
 
 
-def strip_char(s : str, char_to_remove : str):
-    while s.startswith(char_to_remove):
-        s = s[1:]
-    while s.endswith(char_to_remove):
-        s = s[:-1]
-    return s
-
-def strip_chars(s : str, chars_to_remove):
+def strip_chars(s : str, chars_to_remove, _cache={}):
+    key = (s, tuple(chars_to_remove))  # key on both inputs (str or sequence)
+    if key in _cache:
+        return _cache[key]
     for c in chars_to_remove:
-        s = strip_char(s, c)
+        while s and s[0] == c:
+            s = s[1:]
+        while s and s[-1] == c:
+            s = s[:-1]
+    _cache[key] = s  # memoized for the life of the process (unbounded)
     return s
 
 def strip_whitespace_and_dashes(s : str):
-    return strip_chars(s, "- ").strip()
+    return s.strip("- \t\n\r\x0b\x0c")  # dashes + ASCII whitespace, any order
 
 def smart_split(seq : str, sep : str):
     """

diff --git a/mhcgnomes/result.py b/mhcgnomes/result.py
@@ -142,7 +142,10 @@ def __eq__(self, other):
         return True
 
     def __hash__(self):
-        return sum(hash(getattr(self, field)) for field in self.hash_field_names())
+        # Order-insensitive: add up the hash of every attribute named by
+        # hash_field_names().
+        hashes = [hash(getattr(self, f)) for f in self.hash_field_names()]
+        return sum(hashes)
 
     def to_record(self):
         raise NotImplementedError(