diff --git a/README.md b/README.md
index 83065d9..ed42588 100644
--- a/README.md
+++ b/README.md
@@ -78,68 +78,48 @@ docker pull mtlynch/ingredient-phrase-tagger
## Quick Start
-The most common usage is to train the model with a subset of our data, test the
-model against a different subset, then visualize the results. We provide a shell
-script to do this, at:
+To begin, you must train a model:
- ./roundtrip.sh
-
-You can edit this script to specify the size of your training and testing set.
-The default is 20k training examples and 2k test examples.
-
-
-## Usage
-
-### Training
-
-To train the model, we must first convert our input data into a format which
-`crf_learn` can accept:
-
- bin/generate_data --data-path=input.csv --count=1000 --offset=0 > tmp/train_file
-
-The `count` argument specifies the number of training examples (i.e. ingredient
-lines) to read, and `offset` specifies which line to start with. There are
-roughly 180k examples in our snapshot of the New York Times cooking database
-(which we include in this repo), so it is useful to run against a subset.
-
-The output of this step looks something like:
-
- 1 I1 L8 NoCAP NoPAREN B-QTY
- cup I2 L8 NoCAP NoPAREN B-UNIT
- white I3 L8 NoCAP NoPAREN B-NAME
- wine I4 L8 NoCAP NoPAREN I-NAME
-
- 1/2 I1 L4 NoCAP NoPAREN B-QTY
- cup I2 L4 NoCAP NoPAREN B-UNIT
- sugar I3 L4 NoCAP NoPAREN B-NAME
-
- 2 I1 L8 NoCAP NoPAREN B-QTY
- tablespoons I2 L8 NoCAP NoPAREN B-UNIT
- dry I3 L8 NoCAP NoPAREN B-NAME
- white I4 L8 NoCAP NoPAREN I-NAME
- wine I5 L8 NoCAP NoPAREN I-NAME
-
-Next, we pass this file to `crf_learn`, to generate a model file:
-
- crf_learn template_file tmp/train_file tmp/model_file
-
-
-### Testing
-
-To use the model to tag your own arbitrary ingredient lines (stored here in
-`input.txt`), you must first convert it into the CRF++ format, then run against
-the model file which we generated above. We provide another helper script to do
-this:
-
- python bin/parse-ingredients.py input.txt > results.txt
-
-The output is also in CRF++ format, which isn't terribly helpful to us. To
-convert it into JSON:
+```bash
+MODEL_DIR=$(mktemp -d)
+./docker_train_prod_model $MODEL_DIR
+MODEL_FILE=$(find $MODEL_DIR -name '*.crfmodel')
+```
- python bin/convert-to-json.py results.txt > results.json
+Once the model is trained, you can parse ingredient lines by piping them into the script's stdin, one ingredient per line:
-See the top of this README for an example of the expected output.
+```bash
+echo '
+2 tablespoons honey
+1/2 cup flour
+Black pepper, to taste' | bin/parse-ingredients.py --model-file $MODEL_FILE
+```
+```text
+[
+ {
+ "display": "2tablespoonshoney",
+ "input": "2 tablespoons honey",
+ "name": "honey",
+ "qty": "2",
+ "unit": "tablespoon"
+ },
+ {
+ "display": "1/2cupflour",
+ "input": "1/2 cup flour",
+ "name": "flour",
+ "qty": "1/2",
+ "unit": "cup"
+ },
+ {
+ "comment": "to taste",
+ "display": "Black pepper,",
+ "input": "Black pepper, to taste",
+ "name": "Black pepper",
+ "other": ","
+ }
+]
+```
## Authors
diff --git a/bin/convert-to-json.py b/bin/convert-to-json.py
deleted file mode 100755
index 9ac77ca..0000000
--- a/bin/convert-to-json.py
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/usr/bin/env python
-from __future__ import print_function
-
-import sys
-import json
-
-from ingredient_phrase_tagger.training import utils
-
-if len(sys.argv) < 2:
- sys.stderr.write('Usage: convert-to-json.py FILENAME')
- sys.exit(1)
-
-print(json.dumps(utils.import_data(open(sys.argv[1])), indent=4))
diff --git a/bin/parse-ingredients.py b/bin/parse-ingredients.py
index 92a641b..9cdafca 100755
--- a/bin/parse-ingredients.py
+++ b/bin/parse-ingredients.py
@@ -1,23 +1,36 @@
#!/usr/bin/env python
-from __future__ import print_function
+import argparse
+import json
import sys
-import os
+import subprocess
import tempfile
from ingredient_phrase_tagger.training import utils
-if len(sys.argv) < 2:
- sys.stderr.write('Usage: parse-ingredients.py FILENAME')
- sys.exit(1)
-FILENAME = str(sys.argv[1])
-_, tmpFile = tempfile.mkstemp()
+def _exec_crf_test(input_text, model_path):
+    """Run ``crf_test`` on the given input and return its decoded output.
+
+    The input is serialized with ``utils.export_data`` into a temporary
+    file, whose name is handed to the ``crf_test`` command together with
+    the model path.
+
+    Args:
+        input_text: Value passed straight to ``utils.export_data``.
+        model_path: Path supplied to ``crf_test`` via ``--model``.
+
+    Returns:
+        The standard output of ``crf_test``, decoded as UTF-8.
+    """
+    with tempfile.NamedTemporaryFile() as crf_input:
+        serialized = utils.export_data(input_text)
+        crf_input.write(serialized)
+        # Flush so crf_test sees the data when it opens the file by name.
+        crf_input.flush()
+        command = [
+            'crf_test', '--verbose=1', '--model', model_path, crf_input.name]
+        raw_output = subprocess.check_output(command)
+        return raw_output.decode('utf-8')
-with open(FILENAME) as infile, open(tmpFile, 'w') as outfile:
- outfile.write(utils.export_data(infile.readlines()))
-tmpFilePath = "../tmp/model_file"
-modelFilename = os.path.join(os.path.dirname(__file__), tmpFilePath)
-os.system("crf_test -v 1 -m %s %s" % (modelFilename, tmpFile))
-os.system("rm %s" % tmpFile)
+def _convert_crf_output_to_json(crf_output):
+    """Serialize parsed CRF output as pretty-printed JSON.
+
+    Args:
+        crf_output: Value passed to ``utils.import_data``.
+
+    Returns:
+        A JSON string with two-space indentation and sorted keys.
+    """
+    parsed = utils.import_data(crf_output)
+    return json.dumps(parsed, sort_keys=True, indent=2)
+
+
+def main(args):
+    """Tag ingredient lines read from stdin and print the result as JSON.
+
+    Args:
+        args: Parsed command-line arguments; only ``args.model_file`` is
+            read, and it is passed to ``_exec_crf_test`` as the model path.
+    """
+    # Lines returned by readlines() keep their trailing newline, so a bare
+    # truthiness test never filters anything; strip before testing so that
+    # blank lines are actually skipped.
+    raw_ingredient_lines = [
+        line for line in sys.stdin.readlines() if line.strip()]
+    crf_output = _exec_crf_test(raw_ingredient_lines, args.model_file)
+    # The single-argument call form prints the same text as the Python 2
+    # print statement and is also valid Python 3 syntax.
+    print(_convert_crf_output_to_json(crf_output.split('\n')))
+
+
+if __name__ == '__main__':
+    # Build the command-line interface; the model file path is mandatory.
+    arg_parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        prog='Ingredient Phrase Tagger')
+    arg_parser.add_argument('-m', '--model-file', required=True)
+    parsed_args = arg_parser.parse_args()
+    main(parsed_args)