diff --git a/README.md b/README.md index 83065d9..ed42588 100644 --- a/README.md +++ b/README.md @@ -78,68 +78,48 @@ docker pull mtlynch/ingredient-phrase-tagger ## Quick Start -The most common usage is to train the model with a subset of our data, test the -model against a different subset, then visualize the results. We provide a shell -script to do this, at: +To begin, you must train a model: - ./roundtrip.sh - -You can edit this script to specify the size of your training and testing set. -The default is 20k training examples and 2k test examples. - - -## Usage - -### Training - -To train the model, we must first convert our input data into a format which -`crf_learn` can accept: - - bin/generate_data --data-path=input.csv --count=1000 --offset=0 > tmp/train_file - -The `count` argument specifies the number of training examples (i.e. ingredient -lines) to read, and `offset` specifies which line to start with. There are -roughly 180k examples in our snapshot of the New York Times cooking database -(which we include in this repo), so it is useful to run against a subset. - -The output of this step looks something like: - - 1 I1 L8 NoCAP NoPAREN B-QTY - cup I2 L8 NoCAP NoPAREN B-UNIT - white I3 L8 NoCAP NoPAREN B-NAME - wine I4 L8 NoCAP NoPAREN I-NAME - - 1/2 I1 L4 NoCAP NoPAREN B-QTY - cup I2 L4 NoCAP NoPAREN B-UNIT - sugar I3 L4 NoCAP NoPAREN B-NAME - - 2 I1 L8 NoCAP NoPAREN B-QTY - tablespoons I2 L8 NoCAP NoPAREN B-UNIT - dry I3 L8 NoCAP NoPAREN B-NAME - white I4 L8 NoCAP NoPAREN I-NAME - wine I5 L8 NoCAP NoPAREN I-NAME - -Next, we pass this file to `crf_learn`, to generate a model file: - - crf_learn template_file tmp/train_file tmp/model_file - - -### Testing - -To use the model to tag your own arbitrary ingredient lines (stored here in -`input.txt`), you must first convert it into the CRF++ format, then run against -the model file which we generated above. We provide another helper script to do -this: - - python bin/parse-ingredients.py input.txt > results.txt - -The output is also in CRF++ format, which isn't terribly helpful to us. To -convert it into JSON: +```bash +MODEL_DIR=$(mktemp -d) +./docker_train_prod_model $MODEL_DIR +MODEL_FILE=$(find $MODEL_DIR -name '*.crfmodel') +``` - python bin/convert-to-json.py results.txt > results.json +From there, you can convert ingredients by piping them into stdin: -See the top of this README for an example of the expected output. +```bash +echo ' +2 tablespoons honey +1/2 cup flour +Black pepper, to taste' | bin/parse-ingredients.py --model-file $MODEL_FILE +``` +```text +[ + { + "display": "2tablespoonshoney", + "input": "2 tablespoons honey", + "name": "honey", + "qty": "2", + "unit": "tablespoon" + }, + { + "display": "1/2cupflour", + "input": "1/2 cup flour", + "name": "flour", + "qty": "1/2", + "unit": "cup" + }, + { + "comment": "to taste", + "display": "Black pepper,to taste", + "input": "Black pepper, to taste", + "name": "Black pepper", + "other": "," + } +] +``` ## Authors diff --git a/bin/convert-to-json.py b/bin/convert-to-json.py deleted file mode 100755 index 9ac77ca..0000000 --- a/bin/convert-to-json.py +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python -from __future__ import print_function - -import sys -import json - -from ingredient_phrase_tagger.training import utils - -if len(sys.argv) < 2: - sys.stderr.write('Usage: convert-to-json.py FILENAME') - sys.exit(1) - -print(json.dumps(utils.import_data(open(sys.argv[1])), indent=4)) diff --git a/bin/parse-ingredients.py b/bin/parse-ingredients.py index 92a641b..9cdafca 100755 --- a/bin/parse-ingredients.py +++ b/bin/parse-ingredients.py @@ -1,23 +1,36 @@ #!/usr/bin/env python -from __future__ import print_function +import argparse +import json import sys -import os +import subprocess import tempfile from ingredient_phrase_tagger.training import utils -if len(sys.argv) < 2: - sys.stderr.write('Usage: parse-ingredients.py FILENAME') - sys.exit(1) -FILENAME = str(sys.argv[1]) -_, tmpFile = tempfile.mkstemp() +def _exec_crf_test(input_text, model_path): + with tempfile.NamedTemporaryFile() as input_file: + input_file.write(utils.export_data(input_text)) + input_file.flush() + return subprocess.check_output( + ['crf_test', '--verbose=1', '--model', model_path, + input_file.name]).decode('utf-8') -with open(FILENAME) as infile, open(tmpFile, 'w') as outfile: - outfile.write(utils.export_data(infile.readlines())) -tmpFilePath = "../tmp/model_file" -modelFilename = os.path.join(os.path.dirname(__file__), tmpFilePath) -os.system("crf_test -v 1 -m %s %s" % (modelFilename, tmpFile)) -os.system("rm %s" % tmpFile) +def _convert_crf_output_to_json(crf_output): + return json.dumps(utils.import_data(crf_output), indent=2, sort_keys=True) + + +def main(args): + raw_ingredient_lines = [x for x in sys.stdin.readlines() if x] + crf_output = _exec_crf_test(raw_ingredient_lines, args.model_file) + print _convert_crf_output_to_json(crf_output.split('\n')) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + prog='Ingredient Phrase Tagger', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('-m', '--model-file', required=True) + main(parser.parse_args())