Skip to content

Commit

Permalink
Removing dependency on pandas (#23)
Browse files Browse the repository at this point in the history
It was only used to read CSVs, but it's a very heavyweight library, and the
built-in csv module achieves the same thing.
  • Loading branch information
mtlynch committed May 1, 2018
1 parent bd90834 commit 14d9b81
Show file tree
Hide file tree
Showing 6 changed files with 82 additions and 14 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM mtlynch/crfpp:pandas
FROM mtlynch/crfpp
LABEL maintainer="Michael Lynch <michael@mtlynch.io>"

ARG BUILD_DATE
Expand Down
41 changes: 31 additions & 10 deletions ingredient_phrase_tagger/training/cli.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import csv
import optparse
import pandas as pd

import translator

Expand All @@ -18,20 +18,23 @@ def generate_data(self, count, offset):
Generates training data in the CRF++ format for the ingredient
tagging task
"""
df = pd.read_csv(self.opts.data_path)
df = df.fillna("")

start = int(offset)
end = int(offset) + int(count)

df_slice = df.iloc[start:end]
with open(self.opts.data_path) as csv_file:
csv_reader = csv.DictReader(csv_file)
for index, row in enumerate(csv_reader):
if index < start or index >= end:
continue

for index, row in df_slice.iterrows():
try:
print translator.translate_row(row)
# ToDo: deal with this
except UnicodeDecodeError:
print ''
_coerce_values_to_numbers(row)

try:
print translator.translate_row(row)
# ToDo: deal with this
except UnicodeDecodeError:
print ''

def _parse_args(self, argv):
"""
Expand All @@ -49,3 +52,21 @@ def _parse_args(self, argv):

(options, args) = opts.parse_args(argv)
return options


def _coerce_values_to_numbers(row):
"""Converts string values in a row to numbers where possible.
Args:
row: A row of labelled ingredient data. This is modified in place so
that any of its values that contain a number (e.g. "6.4") are
converted to floats and the 'index' value is converted to an int.
"""
for key in row:
if key == 'index':
row[key] = int(row[key])
else:
try:
row[key] = float(row[key])
except ValueError:
pass
7 changes: 6 additions & 1 deletion ingredient_phrase_tagger/training/translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,12 @@ def _matchUp(token, ingredientRow):
token = utils.normalizeToken(token)
decimalToken = _parseNumbers(token)

for key, val in ingredientRow.iteritems():
# Note: We iterate in this specific order to preserve parity with the
# legacy implementation. The legacy implementation is likely incorrect and
# shouldn't actually include 'index', but we will revisit when we're ready
# to change behavior.
for key in ['index', 'name', 'qty', 'range_end', 'unit', 'comment']:
val = ingredientRow[key]
if isinstance(val, basestring):

for n, vt in enumerate(utils.tokenize(val)):
Expand Down
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@
Unidecode==1.0.22
pandas==0.22.0
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import ingredient_phrase_tagger

requires, extra = ['Unidecode==1.0.22', 'pandas==0.22.0'], {}
requires, extra = ['Unidecode==1.0.22'], {}
if sys.version_info >= (3,):
extra['use_2to3'] = True

Expand Down
43 changes: 43 additions & 0 deletions tests/test_translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,5 +124,48 @@ def test_translates_complex_row(self):
squash\tI15\tL20\tNoCAP\tNoPAREN\tB-NAME
,\tI16\tL20\tNoCAP\tNoPAREN\tOTHER
defrosted\tI17\tL20\tNoCAP\tNoPAREN\tI-COMMENT
""".strip(),
translator.translate_row(row).strip())

def test_translates_row_with_multiple_ingredients(self):
    """Checks translation of a phrase that contains two quantity ranges.

    The input mentions "4 to 6" twice (once for the lime juice and once for
    the lime slices). The expected output below tags the leading "4 to 6"
    tokens as NAME tokens and uses the "LX" length bucket on every line.
    """
    ingredient_row = {
        'index': 16096,
        'input': ('4 to 6 tablespoons fresh lime juice, as needed, plus '
                  '4 to 6 slices of lime, for garnish'),
        'name': ('fresh lime juice, as needed, plus 4 to 6 slices of '
                 'lime, for garnish'),
        'qty': 4.0,
        'range_end': 6.0,
        'unit': 'tablespoon',
        'comment': '',
    }

    # One line per token: token, index, length bucket, capitalisation flag,
    # parenthesis flag, and the label.
    expected = """
4\tI1\tLX\tNoCAP\tNoPAREN\tB-NAME
to\tI2\tLX\tNoCAP\tNoPAREN\tI-NAME
6\tI3\tLX\tNoCAP\tNoPAREN\tI-NAME
tablespoons\tI4\tLX\tNoCAP\tNoPAREN\tB-UNIT
fresh\tI5\tLX\tNoCAP\tNoPAREN\tB-NAME
lime\tI6\tLX\tNoCAP\tNoPAREN\tI-NAME
juice\tI7\tLX\tNoCAP\tNoPAREN\tI-NAME
,\tI8\tLX\tNoCAP\tNoPAREN\tI-NAME
as\tI9\tLX\tNoCAP\tNoPAREN\tI-NAME
needed\tI10\tLX\tNoCAP\tNoPAREN\tI-NAME
,\tI11\tLX\tNoCAP\tNoPAREN\tI-NAME
plus\tI12\tLX\tNoCAP\tNoPAREN\tI-NAME
4\tI13\tLX\tNoCAP\tNoPAREN\tI-NAME
to\tI14\tLX\tNoCAP\tNoPAREN\tI-NAME
6\tI15\tLX\tNoCAP\tNoPAREN\tI-NAME
slices\tI16\tLX\tNoCAP\tNoPAREN\tI-NAME
of\tI17\tLX\tNoCAP\tNoPAREN\tI-NAME
lime\tI18\tLX\tNoCAP\tNoPAREN\tI-NAME
,\tI19\tLX\tNoCAP\tNoPAREN\tI-NAME
for\tI20\tLX\tNoCAP\tNoPAREN\tI-NAME
garnish\tI21\tLX\tNoCAP\tNoPAREN\tI-NAME
""".strip()

    # Surrounding whitespace is stripped on both sides before comparing.
    actual = translator.translate_row(ingredient_row).strip()

    self.assertMultiLineEqual(expected, actual)

0 comments on commit 14d9b81

Please sign in to comment.