Skip to content

Commit

Permalink
Experimenting with more tokenizing logic
Browse files Browse the repository at this point in the history
  • Loading branch information
mtlynch committed May 1, 2018
1 parent 807613a commit 31165cf
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 6 deletions.
24 changes: 18 additions & 6 deletions ingredient_phrase_tagger/training/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,22 +8,34 @@ def tokenize(s):
Tokenize on parenthesis, punctuation, spaces and American units followed by a slash.
We sometimes give American units and metric units for baking recipes. For example:
* 2 tablespoons/30 mililiters milk or cream
* 2 tablespoons/30 milliliters milk or cream
* 2 1/2 cups/300 grams all-purpose flour
The recipe database only allows for one unit, and we want to use the American one.
But we must split the text on "cups/" etc. in order to pick it up.
"""

# handle abbreviation like "100g" by treating it as "100 grams"
s = re.sub(r'(\d+)g', r'\1 grams', s)
s = re.sub(r'(\d+)oz', r'\1 ounces', s)
s = _expand_unit_abbreviations(s)
s = _normalize_us_uk_split(s)

return filter(None, re.split(r'([,\(\)])?\s*', utils.clumpFractions(s)))


def _expand_unit_abbreviations(s):
    """Expand unit abbreviations attached to a number into full unit names.

    For example, "100g" becomes "100 grams" and "16oz." becomes "16 ounces".
    An optional period directly after the abbreviation is dropped, and the
    expanded name is always the plural form regardless of the quantity.

    The abbreviation must end at a word boundary, so a number followed by an
    ordinary word that merely starts with the same letters (e.g. "2garlic"
    or "100grams") is left untouched instead of being mangled.

    Args:
        s: Ingredient phrase to process.

    Returns:
        The phrase with every recognized abbreviation expanded.
    """
    # (abbreviation regex fragment, expanded unit name), applied in order.
    expansions = (
        (r'g', 'grams'),
        (r'oz', 'ounces'),
        (r'lbs?', 'pounds'),
        (r'ml', 'milliliters'),
        (r'tsp', 'teaspoons'),
        (r'tbsp', 'tablespoons'),
    )
    for abbreviation, unit_name in expansions:
        # \b keeps the match from firing on the first letters of a longer
        # word; the trailing \.? swallows the period of forms like "oz.".
        pattern = r'(\d+)' + abbreviation + r'\b\.?'
        s = re.sub(pattern, r'\1 ' + unit_name, s)
    return s


# Turn the slash that directly follows an American unit name into a space
# (e.g. "cups/300 grams" -> "cups 300 grams"), for both the singular and the
# plural spelling of each listed unit. Takes the ingredient string `s` and
# returns the rewritten string.
def _normalize_us_uk_split(s):
american_units = [
'cup', 'tablespoon', 'teaspoon', 'pound', 'ounce', 'quart', 'pint'
]
for unit in american_units:
# Plain substring replacement: singular form first, then the plural form.
s = s.replace(unit + '/', unit + ' ')
s = s.replace(unit + 's/', unit + 's ')

# NOTE(review): the next line looks like a removed line from the diff view (the
# same statement appears as the return of tokenize() above) rather than part of
# this helper, whose own return is the final `return s` -- confirm against the
# actual file.
return filter(None, re.split(r'([,\(\)])?\s*', utils.clumpFractions(s)))
return s
6 changes: 6 additions & 0 deletions tests/test_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@ def test_tokenizer_expands_unit_abbreviations(self):
pairs = [
('100g melted chocolate', ['100', 'grams', 'melted', 'chocolate']),
('8oz diet coke', ['8', 'ounces', 'diet', 'coke']),
('16oz. of coconut oil', ['16', 'ounces', 'of', 'coconut', 'oil']),
('5lbs yellow butter', ['5', 'pounds', 'yellow', 'butter']),
('15lb. chicken', ['15', 'pounds', 'chicken']),
('5ml corn sugar', ['5', 'milliliters', 'corn', 'sugar']),
('4tsp sugar', ['4', 'teaspoons', 'sugar']),
('2tbsp cinnamon', ['2', 'tablespoons', 'cinnamon']),
]
for ingredient, tokens_expected in pairs:
tokens_actual = tokenizer.tokenize(ingredient)
Expand Down

0 comments on commit 31165cf

Please sign in to comment.