Removing dependency on pandas (#23)

It was only used to read CSVs, but it's a very heavyweight library, and the built-in csv module achieves the same thing.
mtlynch · May 1, 2018 · 14d9b81 · 14d9b81
1 parent bd90834
commit 14d9b81
Show file tree

Hide file tree

Showing 6 changed files with 82 additions and 14 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -1,4 +1,4 @@
-FROM mtlynch/crfpp:pandas
+FROM mtlynch/crfpp
 LABEL maintainer="Michael Lynch <michael@mtlynch.io>"
 
 ARG BUILD_DATE

diff --git a/ingredient_phrase_tagger/training/cli.py b/ingredient_phrase_tagger/training/cli.py
@@ -1,5 +1,5 @@
+import csv
 import optparse
-import pandas as pd
 
 import translator
 
@@ -18,20 +18,23 @@ def generate_data(self, count, offset):
         Generates training data in the CRF++ format for the ingredient
         tagging task
         """
-        df = pd.read_csv(self.opts.data_path)
-        df = df.fillna("")
 
         start = int(offset)
         end = int(offset) + int(count)
 
-        df_slice = df.iloc[start:end]
+        with open(self.opts.data_path) as csv_file:
+            csv_reader = csv.DictReader(csv_file)
+            for index, row in enumerate(csv_reader):
+                if index < start or index >= end:
+                    continue
 
-        for index, row in df_slice.iterrows():
-            try:
-                print translator.translate_row(row)
-            # ToDo: deal with this
-            except UnicodeDecodeError:
-                print ''
+                _coerce_values_to_numbers(row)
+
+                try:
+                    print translator.translate_row(row)
+                # ToDo: deal with this
+                except UnicodeDecodeError:
+                    print ''
 
     def _parse_args(self, argv):
         """
@@ -49,3 +52,21 @@ def _parse_args(self, argv):
 
         (options, args) = opts.parse_args(argv)
         return options
+
+
+def _coerce_values_to_numbers(row):
+    """Converts string values in a row to numbers where possible.
+
+    Args:
+        row: A row of labelled ingredient data. This is modified in place:
+            'index' becomes an int, and every other value that float() can
+            parse (e.g. "6.4") becomes a float; the rest are left unchanged.
+    """
+    for key in row:
+        if key == 'index':
+            row[key] = int(row[key])
+        else:
+            try:
+                row[key] = float(row[key])
+            except (TypeError, ValueError):  # TypeError: None (short row)
+                pass
diff --git a/ingredient_phrase_tagger/training/translator.py b/ingredient_phrase_tagger/training/translator.py
@@ -77,7 +77,12 @@ def _matchUp(token, ingredientRow):
     token = utils.normalizeToken(token)
     decimalToken = _parseNumbers(token)
 
-    for key, val in ingredientRow.iteritems():
+    # Note: We iterate in this specific order to preserve parity with the
+    # legacy implementation. The legacy implementation is likely incorrect and
+    # shouldn't actually include 'index', but we will revisit when we're ready
+    # to change behavior.
+    for key in ['index', 'name', 'qty', 'range_end', 'unit', 'comment']:
+        val = ingredientRow[key]
         if isinstance(val, basestring):
 
             for n, vt in enumerate(utils.tokenize(val)):

diff --git a/requirements.txt b/requirements.txt
@@ -1,2 +1 @@
 Unidecode==1.0.22
-pandas==0.22.0
diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 import ingredient_phrase_tagger
 
-requires, extra = ['Unidecode==1.0.22', 'pandas==0.22.0'], {}
+requires, extra = ['Unidecode==1.0.22'], {}
 if sys.version_info >= (3,):
     extra['use_2to3'] = True
 

diff --git a/tests/test_translator.py b/tests/test_translator.py
@@ -124,5 +124,48 @@ def test_translates_complex_row(self):
 squash\tI15\tL20\tNoCAP\tNoPAREN\tB-NAME
 ,\tI16\tL20\tNoCAP\tNoPAREN\tOTHER
 defrosted\tI17\tL20\tNoCAP\tNoPAREN\tI-COMMENT
+""".strip(),
+                                  translator.translate_row(row).strip())
+
+    def test_translates_row_with_multiple_ingredients(self):
+        """Translates a phrase whose labelled name repeats its quantity range.
+
+        Pins current behavior: the leading "4 to 6" is tagged as name tokens.
+        """
+        phrase = ('4 to 6 tablespoons fresh lime juice, as needed, plus '
+                  '4 to 6 slices of lime, for garnish')
+        labelled_name = ('fresh lime juice, as needed, plus 4 to 6 slices of '
+                         'lime, for garnish')
+        row = dict(
+            index=16096,
+            input=phrase,
+            name=labelled_name,
+            qty=4.0,
+            range_end=6.0,
+            unit='tablespoon',
+            comment='')
+
+        self.assertMultiLineEqual("""
+4\tI1\tLX\tNoCAP\tNoPAREN\tB-NAME
+to\tI2\tLX\tNoCAP\tNoPAREN\tI-NAME
+6\tI3\tLX\tNoCAP\tNoPAREN\tI-NAME
+tablespoons\tI4\tLX\tNoCAP\tNoPAREN\tB-UNIT
+fresh\tI5\tLX\tNoCAP\tNoPAREN\tB-NAME
+lime\tI6\tLX\tNoCAP\tNoPAREN\tI-NAME
+juice\tI7\tLX\tNoCAP\tNoPAREN\tI-NAME
+,\tI8\tLX\tNoCAP\tNoPAREN\tI-NAME
+as\tI9\tLX\tNoCAP\tNoPAREN\tI-NAME
+needed\tI10\tLX\tNoCAP\tNoPAREN\tI-NAME
+,\tI11\tLX\tNoCAP\tNoPAREN\tI-NAME
+plus\tI12\tLX\tNoCAP\tNoPAREN\tI-NAME
+4\tI13\tLX\tNoCAP\tNoPAREN\tI-NAME
+to\tI14\tLX\tNoCAP\tNoPAREN\tI-NAME
+6\tI15\tLX\tNoCAP\tNoPAREN\tI-NAME
+slices\tI16\tLX\tNoCAP\tNoPAREN\tI-NAME
+of\tI17\tLX\tNoCAP\tNoPAREN\tI-NAME
+lime\tI18\tLX\tNoCAP\tNoPAREN\tI-NAME
+,\tI19\tLX\tNoCAP\tNoPAREN\tI-NAME
+for\tI20\tLX\tNoCAP\tNoPAREN\tI-NAME
+garnish\tI21\tLX\tNoCAP\tNoPAREN\tI-NAME
 """.strip(),
                                   translator.translate_row(row).strip())