forked from nytimes/ingredient-phrase-tagger
-
Notifications
You must be signed in to change notification settings - Fork 76
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Refactoring tokenizer and adding unit tests (#31)
- Loading branch information
Showing 4 changed files with 58 additions and 28 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
import re | ||
|
||
import utils | ||
|
||
|
||
def tokenize(s):
    """
    Tokenize on parentheses, punctuation, spaces and American units followed by a slash.

    We sometimes give American units and metric units for baking recipes. For example:
        * 2 tablespoons/30 milliliters milk or cream
        * 2 1/2 cups/300 grams all-purpose flour

    The recipe database only allows for one unit, and we want to use the American one.
    But we must split the text on "cups/" etc. in order to pick it up.

    Args:
        s: The ingredient phrase to tokenize.

    Returns:
        A list of non-empty token strings (commas and parentheses are kept
        as their own tokens).
    """
    # Handle abbreviations like "100g" by treating them as "100 grams".
    # The \b word boundary keeps the rule from rewriting the inside of a
    # longer word (e.g. "2go", "100gram").
    s = re.sub(r'(\d+)g\b', r'\1 grams', s)
    s = re.sub(r'(\d+)oz\b', r'\1 ounces', s)

    american_units = [
        'cup', 'tablespoon', 'teaspoon', 'pound', 'ounce', 'quart', 'pint'
    ]
    # Turn "cups/300 grams" into "cups 300 grams" so the American unit
    # survives as its own token and the metric part becomes ordinary tokens.
    for unit in american_units:
        s = s.replace(unit + '/', unit + ' ')
        s = s.replace(unit + 's/', unit + 's ')

    # Split on runs of whitespace, or on a comma/parenthesis (captured so
    # it is kept as a token).  The previous pattern, r'([,\(\)])?\s*',
    # could match the empty string; on Python 3.7+ re.split splits on
    # empty matches, which shattered the input into single characters.
    # The alternation below can never match empty.  We also return a real
    # list rather than a lazy filter object so callers (and the unit
    # tests) can index and compare the result directly on Python 3.
    return [token
            for token in re.split(r'([,()])|\s+', utils.clumpFractions(s))
            if token]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
import unittest | ||
|
||
from ingredient_phrase_tagger.training import tokenizer | ||
|
||
|
||
class TokenizerTest(unittest.TestCase):
    """Unit tests for tokenizer.tokenize."""

    def test_tokenizer_splits_slash_separated_alternatives(self):
        # Phrases giving both American and metric units: the American unit
        # before the slash must survive as its own token.
        cases = [
            ('2 tablespoons/30 milliliters milk or cream',
             ['2', 'tablespoons', '30', 'milliliters', 'milk', 'or',
              'cream']),
            ('2 1/2 cups/300 grams all-purpose flour',
             ['2$1/2', 'cups', '300', 'grams', 'all-purpose', 'flour']),
        ]
        for phrase, expected in cases:
            actual = tokenizer.tokenize(phrase)
            self.assertEqual(expected, actual)

    def test_tokenizer_expands_unit_abbreviations(self):
        # "100g" should be treated as "100 grams", "8oz" as "8 ounces".
        cases = [
            ('100g melted chocolate',
             ['100', 'grams', 'melted', 'chocolate']),
            ('8oz diet coke',
             ['8', 'ounces', 'diet', 'coke']),
        ]
        for phrase, expected in cases:
            actual = tokenizer.tokenize(phrase)
            self.assertEqual(expected, actual)