Skip to content

Commit

Permalink
BUG: Reduce memory overhead in simstring db
Browse files: browse the repository at this point in the history
Closes #52
  • Loading branch information
ghisvail committed Jun 12, 2024
1 parent 2d4becb commit b59d565
Showing 1 changed file with 14 additions and 23 deletions.
37 changes: 14 additions & 23 deletions medkit/text/ner/_base_simstring_matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"build_simstring_matcher_databases",
]

import collections
import dataclasses
import math
import re
Expand Down Expand Up @@ -388,33 +389,23 @@ def build_simstring_matcher_databases(
"""
# the params passed to simstring.writer are copy/pasted from QuickUMLS
# cf https://github.com/Georgetown-IR-Lab/QuickUMLS/blob/a3ba0b3559da2574a907f4d41aa0f2c1c0d5ce0a/quickumls/toolbox.py#L173
simstring_db_writer = simstring.writer(
str(simstring_db_file),
3, # unit of character n-grams
False, # represent begin and end of strings in n-grams
True, # use unicode mode
)

# writeback=True needed because we are updating the values in the mapping,
# not just writing
rules_db = shelve.open(str(rules_db_file), flag="n", writeback=True) # noqa: S301

# add rules to databases
simstring_db_writer = simstring.writer(str(simstring_db_file), n=3, be=False, unicode=True)

# Prepare rules mapping for persistence, as:
# term -> list of rules
rules_mapping = collections.defaultdict(list)
for rule in rules:
term_to_match = rule.term
term = anyascii(rule.term.lower())
rules_mapping[term].append(rule)

# apply preprocessing
term_to_match = anyascii(term_to_match.lower())
# Persist rules mapping in new shelf.
with shelve.open(str(rules_db_file), flag="n") as rules_db: # noqa: S301
rules_db.update(rules_mapping)

# add to simstring db
simstring_db_writer.insert(term_to_match)
# add to rules db
if term_to_match not in rules_db:
rules_db[term_to_match] = []
rules_db[term_to_match].append(rule)
# Update simstring db with terms in rules mapping.
for term in rules_mapping:
simstring_db_writer.insert(term)
simstring_db_writer.close()
rules_db.sync()
rules_db.close()


_TOKENIZATION_PATTERN = re.compile(r"[\w]+|[^\w ]")
Expand Down

0 comments on commit b59d565

Please sign in to comment.