Skip to content

Commit

Permalink
Merge 281e869 into 963ccc2
Browse files Browse the repository at this point in the history
  • Loading branch information
nielstron committed Oct 24, 2020
2 parents 963ccc2 + 281e869 commit 99278b4
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 15 deletions.
17 changes: 9 additions & 8 deletions quantulum3/_lang/en_US/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,26 +62,27 @@ def extract_spellout_values(text):
"""

values = []
try:
for item in reg.text_pattern_reg(lang).finditer(text):
for item in reg.text_pattern_reg(lang).finditer(text):
try:
surface, span = clean_surface(item.group(0), item.span())
if not surface or surface.lower() in reg.scales(lang):
continue
curr = result = 0.0
for word in surface.split():
for word in surface.lower().split():
try:
scale, increment = (
1,
float(
re.sub(
r"(-$|[%s])" % reg.grouping_operators_regex(lang),
"",
word.lower(),
word,
)
),
)
except ValueError:
scale, increment = reg.numberwords(lang)[word.lower()]
match = re.search(reg.numberwords_regex(), word)
scale, increment = reg.numberwords(lang)[match.group(0)]
curr = curr * scale + increment
if scale > 100:
result += curr
Expand All @@ -93,9 +94,9 @@ def extract_spellout_values(text):
"new_surface": str(result + curr),
}
)
except KeyError:
# just ignore the match if an error occurred
values = []
except (KeyError, AttributeError):
# just ignore the match if an error occurred
values = []

return sorted(values, key=lambda x: x["old_span"][0])

Expand Down
9 changes: 3 additions & 6 deletions quantulum3/_lang/en_US/regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@
"ninth": 1 / 9,
}

# Connector/article tokens that may appear inside a spelled-out number.
# Values look like the (scale, increment) pairs the spell-out parser unpacks
# (NOTE(review): confirm against numberwords()). "&" is accepted as a synonym
# for "and", e.g. "two hundred & forty".
MISCNUM = {"&": (1, 0), "and": (1, 0), "a": (1, 1), "an": (1, 1)}

###############################################################################

Expand All @@ -73,11 +73,8 @@
(?<![a-zA-Z0-9+.-]) # lookbehind, avoid "Area51"
{number_pattern_no_groups}
)?
[ -]?(?:{numberwords_regex})
[ -]?(?:{numberwords_regex})?
[ -]?(?:{numberwords_regex})?[ -]?(?:{numberwords_regex})?
[ -]?(?:{numberwords_regex})?[ -]?(?:{numberwords_regex})?
[ -]?(?:{numberwords_regex})?
[, ]?(?:{numberwords_regex})
(?:[, -]*(?:{numberwords_regex}))*
(?!\s?{number_pattern_no_groups}) # Disallow being followed by only a
# number
"""
Expand Down
10 changes: 10 additions & 0 deletions quantulum3/_lang/en_US/tests/quantities.json
Original file line number Diff line number Diff line change
Expand Up @@ -1351,5 +1351,15 @@
{
"req": "I sent some money using a PayPal account",
"res": []
},
{
"req": "three million, two hundred & forty, you say?",
"res": [
{
"value": 3000240,
"unit": "dimensionless",
"surface": "three million, two hundred & forty"
}
]
}
]
4 changes: 3 additions & 1 deletion quantulum3/regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,9 @@ def numberwords(lang="en_US"):

@cached
def numberwords_regex(lang="en_US"):
    """Return a regex alternation matching any number word of ``lang``.

    Every non-empty key of ``numberwords(lang)`` becomes one alternative;
    the alternatives are joined with ``|``.

    :param lang: language code passed through to ``numberwords``.
    :return: the pattern source as a string (not compiled).
    """
    # Each word must be delimited by a non-word character or by the start/end
    # of the string. Lookarounds are used rather than \b because \b needs a
    # word character on one side, so it cannot delimit a key made only of
    # non-word characters (presumably the reason: the "&" number word).
    # NOTE(review): keys are interpolated unescaped, so a key containing regex
    # metacharacters would be read as pattern syntax.
    return r"|".join(
        r"((?<=\W)|^)%s((?=\W)|$)" % word for word in numberwords(lang) if word
    )


Expand Down

0 comments on commit 99278b4

Please sign in to comment.