Refactoring tokenizer and adding unit tests (#31)
mtlynch committed May 1, 2018
1 parent d3d8c20 commit 81b71a5
Showing 4 changed files with 58 additions and 28 deletions.
29 changes: 29 additions & 0 deletions ingredient_phrase_tagger/training/tokenizer.py
@@ -0,0 +1,29 @@
import re

import utils


def tokenize(s):
    """
    Tokenize on parentheses, punctuation, spaces, and American units followed by a slash.
    We sometimes give American units and metric units for baking recipes. For example:
        * 2 tablespoons/30 milliliters milk or cream
        * 2 1/2 cups/300 grams all-purpose flour
    The recipe database only allows for one unit, and we want to use the American one.
    But we must split the text on "cups/" etc. in order to pick it up.
    """

    # Handle abbreviations like "100g" by treating them as "100 grams".
    s = re.sub(r'(\d+)g', r'\1 grams', s)
    s = re.sub(r'(\d+)oz', r'\1 ounces', s)

    american_units = [
        'cup', 'tablespoon', 'teaspoon', 'pound', 'ounce', 'quart', 'pint'
    ]
    for unit in american_units:
        s = s.replace(unit + '/', unit + ' ')
        s = s.replace(unit + 's/', unit + 's ')

    return filter(None, re.split(r'([,\(\)])?\s*', utils.clumpFractions(s)))
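
For reference, here is how the new function behaves on the docstring's examples. The expected outputs below are taken from the unit tests added in this commit; note that utils.clumpFractions joins mixed fractions with a "$", and that this codebase targets Python 2, where filter() returns a list:

>>> from ingredient_phrase_tagger.training import tokenizer
>>> tokenizer.tokenize('2 tablespoons/30 milliliters milk or cream')
['2', 'tablespoons', '30', 'milliliters', 'milk', 'or', 'cream']
>>> tokenizer.tokenize('2 1/2 cups/300 grams all-purpose flour')
['2$1/2', 'cups', '300', 'grams', 'all-purpose', 'flour']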
5 changes: 3 additions & 2 deletions ingredient_phrase_tagger/training/translator.py
@@ -1,6 +1,7 @@
import decimal
import re

+import tokenizer
import utils


@@ -19,7 +20,7 @@ def translate_row(row):
"""
# extract the display name
display_input = utils.cleanUnicodeFractions(row['input'])
tokens = utils.tokenize(display_input)
tokens = tokenizer.tokenize(display_input)

labels = _row_to_labels(row)
label_data = _addPrefixes([(t, _matchUp(t, labels)) for t in tokens])
@@ -97,7 +98,7 @@ def _matchUp(token, labels):
    for label_key in ['name', 'unit', 'qty', 'comment', 'range_end']:
        label_value = labels[label_key]
        if isinstance(label_value, basestring):
-            for n, vt in enumerate(utils.tokenize(label_value)):
+            for n, vt in enumerate(tokenizer.tokenize(label_value)):
                if utils.normalizeToken(vt) == token:
                    ret.append(label_key.upper())

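To make the retagged call concrete, a hypothetical walk-through of the fragment of _matchUp visible above. The shape of the labels dict, the return value, and normalizeToken leaving plain words unchanged are all assumptions here, since only part of the function appears in this hunk:

labels = {
    'name': 'milk or cream',   # string values are tokenized and compared
    'unit': 'tablespoon',
    'qty': 2.0,                # non-string: skipped by the basestring guard
    'comment': '',
    'range_end': '',
}
_matchUp('milk', labels)        # -> ['NAME']
_matchUp('tablespoon', labels)  # -> ['UNIT']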
28 changes: 2 additions & 26 deletions ingredient_phrase_tagger/training/utils.py
@@ -1,31 +1,7 @@
#!/usr/bin/env python
import re


-def tokenize(s):
-    """
-    Tokenize on parentheses, punctuation, spaces, and American units followed by a slash.
-    We sometimes give American units and metric units for baking recipes. For example:
-        * 2 tablespoons/30 milliliters milk or cream
-        * 2 1/2 cups/300 grams all-purpose flour
-    The recipe database only allows for one unit, and we want to use the American one.
-    But we must split the text on "cups/" etc. in order to pick it up.
-    """
-
-    # Handle abbreviations like "100g" by treating them as "100 grams".
-    s = re.sub(r'(\d+)g', r'\1 grams', s)
-    s = re.sub(r'(\d+)oz', r'\1 ounces', s)
-
-    american_units = [
-        'cup', 'tablespoon', 'teaspoon', 'pound', 'ounce', 'quart', 'pint'
-    ]
-    for unit in american_units:
-        s = s.replace(unit + '/', unit + ' ')
-        s = s.replace(unit + 's/', unit + 's ')
-
-    return filter(None, re.split(r'([,\(\)])?\s*', clumpFractions(s)))
+import tokenizer


def joinLine(columns):
@@ -313,7 +289,7 @@ def export_data(lines):
    output = []
    for line in lines:
        line_clean = re.sub('<[^<]+?>', '', line)
-        tokens = tokenize(line_clean)
+        tokens = tokenizer.tokenize(line_clean)

        for i, token in enumerate(tokens):
            features = getFeatures(token, i + 1, tokens)
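The regex pass that precedes tokenization here strips inline markup from each training line. A quick illustration of what it does (the <name> tag is a made-up example, not taken from this diff):

>>> import re
>>> re.sub('<[^<]+?>', '', '<name>baby arugula</name>, for serving')
'baby arugula, for serving'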
24 changes: 24 additions & 0 deletions tests/test_tokenizer.py
@@ -0,0 +1,24 @@
import unittest

from ingredient_phrase_tagger.training import tokenizer


class TokenizerTest(unittest.TestCase):

    def test_tokenizer_splits_slash_separated_alternatives(self):
        pairs = [
            ('2 tablespoons/30 milliliters milk or cream',
             ['2', 'tablespoons', '30', 'milliliters', 'milk', 'or', 'cream']),
            ('2 1/2 cups/300 grams all-purpose flour',
             ['2$1/2', 'cups', '300', 'grams', 'all-purpose', 'flour']),
        ]
        for ingredient, tokens_expected in pairs:
            tokens_actual = tokenizer.tokenize(ingredient)
            self.assertEqual(tokens_expected, tokens_actual)

    def test_tokenizer_expands_unit_abbreviations(self):
        pairs = [
            ('100g melted chocolate', ['100', 'grams', 'melted', 'chocolate']),
            ('8oz diet coke', ['8', 'ounces', 'diet', 'coke']),
        ]
        for ingredient, tokens_expected in pairs:
            tokens_actual = tokenizer.tokenize(ingredient)
            self.assertEqual(tokens_expected, tokens_actual)
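
To run the new suite, a standard unittest invocation should work; this assumes the tests/ directory is importable as a package and that you run it from the repository root under Python 2, since the code relies on basestring and a list-returning filter():

python -m unittest tests.test_tokenizer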
