Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Removing dependency on pandas #23

Merged
merged 1 commit into from
May 1, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM mtlynch/crfpp:pandas
FROM mtlynch/crfpp
LABEL maintainer="Michael Lynch <michael@mtlynch.io>"

ARG BUILD_DATE
Expand Down
41 changes: 31 additions & 10 deletions ingredient_phrase_tagger/training/cli.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import csv
import optparse
import pandas as pd

import translator

Expand All @@ -18,20 +18,23 @@ def generate_data(self, count, offset):
Generates training data in the CRF++ format for the ingredient
tagging task
"""
df = pd.read_csv(self.opts.data_path)
df = df.fillna("")

start = int(offset)
end = int(offset) + int(count)

df_slice = df.iloc[start:end]
with open(self.opts.data_path) as csv_file:
csv_reader = csv.DictReader(csv_file)
for index, row in enumerate(csv_reader):
if index < start or index >= end:
continue

for index, row in df_slice.iterrows():
try:
print translator.translate_row(row)
# ToDo: deal with this
except UnicodeDecodeError:
print ''
_coerce_values_to_numbers(row)

try:
print translator.translate_row(row)
# ToDo: deal with this
except UnicodeDecodeError:
print ''

def _parse_args(self, argv):
"""
Expand All @@ -49,3 +52,21 @@ def _parse_args(self, argv):

(options, args) = opts.parse_args(argv)
return options


def _coerce_values_to_numbers(row):
"""Converts string values in a row to numbers where possible.

Args:
row: A row of labelled ingredient data. This is modified in place so
that any of its values that contain a number (e.g. "6.4") are
converted to floats and the 'index' value is converted to an int.
"""
for key in row:
if key == 'index':
row[key] = int(row[key])
else:
try:
row[key] = float(row[key])
except ValueError:
pass
7 changes: 6 additions & 1 deletion ingredient_phrase_tagger/training/translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,12 @@ def _matchUp(token, ingredientRow):
token = utils.normalizeToken(token)
decimalToken = _parseNumbers(token)

for key, val in ingredientRow.iteritems():
# Note: We iterate in this specific order to preserve parity with the
# legacy implementation. The legacy implementation is likely incorrect and
# shouldn't actually include 'index', but we will revisit when we're ready
# to change behavior.
for key in ['index', 'name', 'qty', 'range_end', 'unit', 'comment']:
val = ingredientRow[key]
if isinstance(val, basestring):

for n, vt in enumerate(utils.tokenize(val)):
Expand Down
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@
Unidecode==1.0.22
pandas==0.22.0
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import ingredient_phrase_tagger

requires, extra = ['Unidecode==1.0.22', 'pandas==0.22.0'], {}
requires, extra = ['Unidecode==1.0.22'], {}
if sys.version_info >= (3,):
extra['use_2to3'] = True

Expand Down
43 changes: 43 additions & 0 deletions tests/test_translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,5 +124,48 @@ def test_translates_complex_row(self):
squash\tI15\tL20\tNoCAP\tNoPAREN\tB-NAME
,\tI16\tL20\tNoCAP\tNoPAREN\tOTHER
defrosted\tI17\tL20\tNoCAP\tNoPAREN\tI-COMMENT
""".strip(),
translator.translate_row(row).strip())

def test_translates_row_with_multiple_ingredients(self):
    """Pins translate_row output for a phrase naming two ingredients.

    The labelled row's 'name' field spans both the lime juice and the
    lime slices, and the expected CRF++ lines below record the tags the
    translator currently emits for each token of the 'input' phrase.
    """
    labelled_row = {
        'index': 16096,
        'input': ('4 to 6 tablespoons fresh lime juice, as needed, plus '
                  '4 to 6 slices of lime, for garnish'),
        'name': ('fresh lime juice, as needed, plus 4 to 6 slices of '
                 'lime, for garnish'),
        'qty': 4.0,
        'range_end': 6.0,
        'unit': 'tablespoon',
        'comment': '',
    }

    # The expected lines must stay at column 0: only the outer whitespace
    # is stripped before the comparison.
    expected_crf = """
4\tI1\tLX\tNoCAP\tNoPAREN\tB-NAME
to\tI2\tLX\tNoCAP\tNoPAREN\tI-NAME
6\tI3\tLX\tNoCAP\tNoPAREN\tI-NAME
tablespoons\tI4\tLX\tNoCAP\tNoPAREN\tB-UNIT
fresh\tI5\tLX\tNoCAP\tNoPAREN\tB-NAME
lime\tI6\tLX\tNoCAP\tNoPAREN\tI-NAME
juice\tI7\tLX\tNoCAP\tNoPAREN\tI-NAME
,\tI8\tLX\tNoCAP\tNoPAREN\tI-NAME
as\tI9\tLX\tNoCAP\tNoPAREN\tI-NAME
needed\tI10\tLX\tNoCAP\tNoPAREN\tI-NAME
,\tI11\tLX\tNoCAP\tNoPAREN\tI-NAME
plus\tI12\tLX\tNoCAP\tNoPAREN\tI-NAME
4\tI13\tLX\tNoCAP\tNoPAREN\tI-NAME
to\tI14\tLX\tNoCAP\tNoPAREN\tI-NAME
6\tI15\tLX\tNoCAP\tNoPAREN\tI-NAME
slices\tI16\tLX\tNoCAP\tNoPAREN\tI-NAME
of\tI17\tLX\tNoCAP\tNoPAREN\tI-NAME
lime\tI18\tLX\tNoCAP\tNoPAREN\tI-NAME
,\tI19\tLX\tNoCAP\tNoPAREN\tI-NAME
for\tI20\tLX\tNoCAP\tNoPAREN\tI-NAME
garnish\tI21\tLX\tNoCAP\tNoPAREN\tI-NAME
""".strip()
    actual_crf = translator.translate_row(labelled_row).strip()

    self.assertMultiLineEqual(expected_crf, actual_crf)