Skip to content

Commit

Permalink
Merge pull request #178 from nielstron/fix_capitalization_common_words
Browse files Browse the repository at this point in the history
Fix recognition of weirdly capitalized common words
  • Loading branch information
nielstron committed Oct 20, 2020
2 parents 39da285 + d937008 commit 963ccc2
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 14 deletions.
3 changes: 3 additions & 0 deletions quantulum3/_lang/en_US/common-units.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# A list containing correct unit combinations that are often mistakenly marked as common words
# It is supposed to be continuously improved
js
7 changes: 6 additions & 1 deletion quantulum3/_lang/en_US/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ def number_to_words(number):
###############################################################################
def build_common_words():
# Read raw 4 letter file
path = os.path.join(TOPDIR, "common-units.txt")
with open(path, "r", encoding="utf-8") as file:
common_units = {line.strip() for line in file if not line.startswith("#")}
path = os.path.join(TOPDIR, "common-words.txt")
words = defaultdict(list) # Collect words based on length
with open(path, "r", encoding="utf-8") as file:
Expand All @@ -39,14 +42,16 @@ def build_common_words():
continue
line = line.rstrip()
if (
line not in load.units(lang).surfaces_all
line not in load.units(lang).surfaces_lower
and line not in load.units(lang).symbols
and line not in common_units
):
words[len(line)].append(line)
plural = load.pluralize(line)
if (
plural not in load.units(lang).surfaces_all
and plural not in load.units(lang).symbols
and plural not in common_units
):
words[len(plural)].append(plural)
return words
Expand Down
15 changes: 2 additions & 13 deletions quantulum3/_lang/en_US/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,19 +272,8 @@ def build_quantity(orig_text, text, item, values, unit, surface, span, uncert):
# Combination has to be at least one letter
if len(combination) < 1:
continue
# Combination has to be all lower or capitalized in the first
# or all letters
if not (
combination.islower()
or (
len(combination) > 2
and (
(combination[0].isupper() and combination[1:].islower())
or combination.isupper()
)
)
):
continue
# Combination may have any capitalization due to possible common names
# e.g. PayPal, iPhone, LaTeX
# Combination has to be inside the surface
if combination not in surface:
continue
Expand Down
4 changes: 4 additions & 0 deletions quantulum3/_lang/en_US/tests/quantities.json
Original file line number Diff line number Diff line change
Expand Up @@ -1347,5 +1347,9 @@
"uncertainty": null
}
]
},
{
"req": "I sent some money using a PayPal account",
"res": []
}
]

0 comments on commit 963ccc2

Please sign in to comment.