Experimenting with more tokenizing logic

mtlynch · May 1, 2018 · 31165cf
1 parent 807613a
commit 31165cf
Show file tree

Hide file tree

Showing 2 changed files with 24 additions and 6 deletions.
diff --git a/ingredient_phrase_tagger/training/tokenizer.py b/ingredient_phrase_tagger/training/tokenizer.py
@@ -8,22 +8,34 @@ def tokenize(s):
     Tokenize on parenthesis, punctuation, spaces and American units followed by a slash.
 
     We sometimes give American units and metric units for baking recipes. For example:
-        * 2 tablespoons/30 mililiters milk or cream
+        * 2 tablespoons/30 milliliters milk or cream
         * 2 1/2 cups/300 grams all-purpose flour
 
     The recipe database only allows for one unit, and we want to use the American one.
     But we must split the text on "cups/" etc. in order to pick it up.
     """
 
-    # handle abbreviation like "100g" by treating it as "100 grams"
-    s = re.sub(r'(\d+)g', r'\1 grams', s)
-    s = re.sub(r'(\d+)oz', r'\1 ounces', s)
+    s = _expand_unit_abbreviations(s)
+    s = _normalize_us_uk_split(s)
 
+    return filter(None, re.split(r'([,\(\)])?\s*', utils.clumpFractions(s)))
+
+
+def _expand_unit_abbreviations(s):
+    s = re.sub(r'(\d+)g\.?', r'\1 grams', s)
+    s = re.sub(r'(\d+)oz\.?', r'\1 ounces', s)
+    s = re.sub(r'(\d+)lbs?\.?', r'\1 pounds', s)
+    s = re.sub(r'(\d+)ml\.?', r'\1 milliliters', s)
+    s = re.sub(r'(\d+)tsp\.?', r'\1 teaspoons', s)
+    s = re.sub(r'(\d+)tbsp\.?', r'\1 tablespoons', s)
+    return s
+
+
+def _normalize_us_uk_split(s):
     american_units = [
         'cup', 'tablespoon', 'teaspoon', 'pound', 'ounce', 'quart', 'pint'
     ]
     for unit in american_units:
         s = s.replace(unit + '/', unit + ' ')
         s = s.replace(unit + 's/', unit + 's ')
-
-    return filter(None, re.split(r'([,\(\)])?\s*', utils.clumpFractions(s)))
+    return s
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
@@ -18,6 +18,12 @@ def test_tokenizer_expands_unit_abbreviations(self):
         pairs = [
             ('100g melted chocolate', ['100', 'grams', 'melted', 'chocolate']),
             ('8oz diet coke', ['8', 'ounces', 'diet', 'coke']),
+            ('16oz. of coconut oil', ['16', 'ounces', 'of', 'coconut', 'oil']),
+            ('5lbs  yellow butter', ['5', 'pounds', 'yellow', 'butter']),
+            ('15lb. chicken', ['15', 'pounds', 'chicken']),
+            ('5ml corn sugar', ['5', 'milliliters', 'corn', 'sugar']),
+            ('4tsp sugar', ['4', 'teaspoons', 'sugar']),
+            ('2tbsp cinnamon', ['2', 'tablespoons', 'cinnamon']),
         ]
         for ingredient, tokens_expected in pairs:
             tokens_actual = tokenizer.tokenize(ingredient)