BUG: Reduce memory overhead in simstring db

Closes #52
medkit-lib · Jun 12, 2024 · b59d565 · b59d565
1 parent 2d4becb
commit b59d565
Showing 1 changed file with 14 additions and 23 deletions.
diff --git a/medkit/text/ner/_base_simstring_matcher.py b/medkit/text/ner/_base_simstring_matcher.py
@@ -7,6 +7,7 @@
     "build_simstring_matcher_databases",
 ]
 
+import collections
 import dataclasses
 import math
 import re
@@ -388,33 +389,23 @@ def build_simstring_matcher_databases(
     """
     # the params passed to simstring.writer are copy/pasted from QuickUMLS
     # cf https://github.com/Georgetown-IR-Lab/QuickUMLS/blob/a3ba0b3559da2574a907f4d41aa0f2c1c0d5ce0a/quickumls/toolbox.py#L173
-    simstring_db_writer = simstring.writer(
-        str(simstring_db_file),
-        3,  # unit of character n-grams
-        False,  # represent begin and end of strings in n-grams
-        True,  # use unicode mode
-    )
-
-    # writeback=True needed because we are updating the values in the mapping,
-    # not just writing
-    rules_db = shelve.open(str(rules_db_file), flag="n", writeback=True)  # noqa: S301
-
-    # add rules to databases
+    simstring_db_writer = simstring.writer(str(simstring_db_file), n=3, be=False, unicode=True)
+
+    # Group rules by their normalized term (lowercased, then passed through anyascii),
+    # so that each shelf entry is written in one go: term -> list of rules
+    rules_mapping = collections.defaultdict(list)
     for rule in rules:
-        term_to_match = rule.term
+        term = anyascii(rule.term.lower())
+        rules_mapping[term].append(rule)
 
-        # apply preprocessing
-        term_to_match = anyascii(term_to_match.lower())
+    # Persist the rules mapping in a new shelf (flag="n" always creates an empty db).
+    with shelve.open(str(rules_db_file), flag="n") as rules_db:  # noqa: S301
+        rules_db.update(rules_mapping)
 
-        # add to simstring db
-        simstring_db_writer.insert(term_to_match)
-        # add to rules db
-        if term_to_match not in rules_db:
-            rules_db[term_to_match] = []
-        rules_db[term_to_match].append(rule)
+    # Insert each unique term of the rules mapping into the simstring db, once per term.
+    for term in rules_mapping:
+        simstring_db_writer.insert(term)
     simstring_db_writer.close()
-    rules_db.sync()
-    rules_db.close()
 
 
 _TOKENIZATION_PATTERN = re.compile(r"[\w]+|[^\w ]")