Skip to content

Commit

Permalink
Merge 36de17b into f759d07
Browse files Browse the repository at this point in the history
  • Loading branch information
mtlynch committed May 2, 2018
2 parents f759d07 + 36de17b commit 89ca41d
Show file tree
Hide file tree
Showing 4 changed files with 116 additions and 44 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ LABEL org.label-schema.build-date=$BUILD_DATE \
org.label-schema.schema-version="1.0.0-rc1"

RUN apt-get update -y && \
apt-get install -y python2.7 python-pip
apt-get install -y git python2.7 python-pip

ADD . /ingredient-phrase-tagger
WORKDIR /ingredient-phrase-tagger
Expand Down
64 changes: 64 additions & 0 deletions bin/train-model
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#!/bin/bash

# Trains a new CRF model given a set of labelled data.
#
# Required environment variables:
#   LABELLED_DATA_FILE      File of labelled examples (read by bin/generate_data).
#   LABELLED_EXAMPLE_COUNT  Number of labelled examples to split into
#                           training and testing data.
#   OUTPUT_DIR              Existing directory that receives all generated files.
#
# Optional environment variables:
#   TRAINING_DATA_PERCENT   Fraction of the examples used for training
#                           (default: 0.9).
#   CRF_TRAINING_THREADS    Value passed to crf_learn --thread (default: 2).
#
# Files written to OUTPUT_DIR:
#   training_data.crf, testing_data.crf, testing_output, eval_output and
#   <timestamp>-<data version>-<git revision>.crfmodel

# Exit build script on first failure
set -e

# Echo commands as they run (bash writes the trace to stderr).
set -x

# Check required vars. ${!name} reads the variable whose name is in $name.
for required_var in LABELLED_DATA_FILE LABELLED_EXAMPLE_COUNT OUTPUT_DIR; do
  if [[ -z "${!required_var}" ]]; then
    echo "${required_var} must be set" >&2
    exit 1
  fi
done

# Choose defaults for optional vars.
TRAINING_DATA_PERCENT="${TRAINING_DATA_PERCENT:-0.9}"
CRF_TRAINING_THREADS="${CRF_TRAINING_THREADS:-2}"

# Split sizes. print(...) with parentheses runs under both Python 2 and 3.
# NOTE(review): each count is truncated independently, so the two can add up
# to one less than LABELLED_EXAMPLE_COUNT. Left as-is because changing it
# changes the generated testing data.
COUNT_TRAIN=$(python -c "print(int($TRAINING_DATA_PERCENT * $LABELLED_EXAMPLE_COUNT))")
COUNT_TEST=$(python -c "print(int((1.0 - $TRAINING_DATA_PERCENT) * $LABELLED_EXAMPLE_COUNT))")

CRF_TRAINING_FILE="${OUTPUT_DIR}/training_data.crf"
CRF_TESTING_FILE="${OUTPUT_DIR}/testing_data.crf"

CRF_LEARN_TEMPLATE=template_file

MODEL_TIMESTAMP=$(date +%Y%m%d_%H%M)
# Data version is the data file's base name without a trailing ".csv".
# Dropping any directory part keeps the model file directly under OUTPUT_DIR.
DATA_VERSION="${LABELLED_DATA_FILE##*/}"
DATA_VERSION="${DATA_VERSION%.csv}"
CODE_VERSION=$(git rev-parse --short HEAD)

CRF_MODEL_FILE="${OUTPUT_DIR}/${MODEL_TIMESTAMP}-${DATA_VERSION}-${CODE_VERSION}.crfmodel"
TESTING_OUTPUT_FILE="${OUTPUT_DIR}/testing_output"
EVAL_OUTPUT_FILE="${OUTPUT_DIR}/eval_output"

# Training split: the first COUNT_TRAIN examples.
bin/generate_data \
  --data-path="$LABELLED_DATA_FILE" \
  --count="$COUNT_TRAIN" \
  --offset=0 > "$CRF_TRAINING_FILE"

# Testing split: the COUNT_TEST examples that follow the training split.
bin/generate_data \
  --data-path="$LABELLED_DATA_FILE" \
  --count="$COUNT_TEST" \
  --offset="$COUNT_TRAIN" > "$CRF_TESTING_FILE"

# Learn the model from the training split only, so that the testing split
# used by crf_test below is data the model has not seen.
crf_learn \
  --thread="$CRF_TRAINING_THREADS" \
  "$CRF_LEARN_TEMPLATE" "$CRF_TRAINING_FILE" "$CRF_MODEL_FILE"

# Tag the testing split with the new model.
crf_test \
  --model="$CRF_MODEL_FILE" \
  "$CRF_TESTING_FILE" > "$TESTING_OUTPUT_FILE"

# Run the evaluation script on the tagged testing split.
python bin/evaluate.py "$TESTING_OUTPUT_FILE" > "$EVAL_OUTPUT_FILE"
31 changes: 31 additions & 0 deletions build_prod_model
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/bin/bash

# Builds a new model using production settings.
#
# Usage: ./build_prod_model output_dir
#
# Example
#   $ mkdir ~/model
#   $ ./build_prod_model ~/model
#
# Runs bin/train-model on the labelled data snapshot, prints the evaluation
# output and reports the path of the model file written to output_dir.

# Exit build script on first failure
set -e

# Echo commands as they run (bash writes the trace to stderr).
set -x

# The output directory is mandatory; stop here rather than continuing with an
# empty OUTPUT_DIR.
if [[ -z "$1" ]]; then
  echo "Usage: ./build_prod_model output_dir" >&2
  exit 1
fi

# Settings read by bin/train-model; exported so the child process sees them.
export LABELLED_DATA_FILE=nyt-ingredients-snapshot-2015.csv
export LABELLED_EXAMPLE_COUNT=179207
export TRAINING_DATA_PERCENT=0.9
export OUTPUT_DIR="$1"

# Train a new model.
bin/train-model
cat "${OUTPUT_DIR}/eval_output"

# Locate the model with a glob instead of parsing `ls`, so that paths
# containing spaces survive. An unmatched glob stays as the literal pattern,
# which the -e test below rejects.
MODEL_FILES=("${OUTPUT_DIR}"/*.crfmodel)
if [[ ! -e "${MODEL_FILES[0]}" ]]; then
  echo "No .crfmodel file found in ${OUTPUT_DIR}" >&2
  exit 1
fi
# One path per line if the directory holds more than one model.
MODEL_FILE="$(printf '%s\n' "${MODEL_FILES[@]}")"
echo "Created new model: $MODEL_FILE"
63 changes: 20 additions & 43 deletions test_e2e
Original file line number Diff line number Diff line change
@@ -1,63 +1,40 @@
#!/bin/bash

# NOTE(review): this listing appears to interleave the removed and the added
# lines of a diff without +/- markers, so several settings and steps occur
# twice below (as a plain assignment or inline step, and again as an export or
# through bin/train-model). Confirm against the real file before running it.

# End-to-end test for ingredient-phrase-tagger. Uses generate_data to generate
# training and test data, then verifies that the generated files match up with
# the golden version of pre-generated data.
# End-to-end test for ingredient-phrase-tagger.
#
# Trains a new model using a known set of labelled data then verifies that all
# generated outputs match the golden outputs.

# Exit build script on first failure
set -e

# Echo commands as they run (bash writes the trace to stderr).
set -x

LABELLED_DATA_FILE=nyt-ingredients-snapshot-2015.csv
LABELLED_EXAMPLE_COUNT=22000
TRAINING_DATA_PERCENT=0.9
# Split sizes: the training share and the remaining share of the examples,
# each truncated to an integer by int().
COUNT_TRAIN=$(python -c "print int($TRAINING_DATA_PERCENT * $LABELLED_EXAMPLE_COUNT)")
COUNT_TEST=$(python -c "print int((1.0 - $TRAINING_DATA_PERCENT) * $LABELLED_EXAMPLE_COUNT)")
# Scratch directory for the files generated by this run.
OUTPUT_DIR=$(mktemp -d)
CRF_TRAINING_FILE="${OUTPUT_DIR}/training_data.crf"
CRF_TESTING_FILE="${OUTPUT_DIR}/testing_data.crf"

CRF_LEARN_TEMPLATE=template_file
# Exported so that the bin/train-model child process run below sees them.
export LABELLED_DATA_FILE=nyt-ingredients-snapshot-2015.csv
export LABELLED_EXAMPLE_COUNT=22000
export TRAINING_DATA_PERCENT=0.9
# This needs to be explicit so that there is consistent training between
# different machines.
CRF_TRAINING_THREADS=2
CRF_MODEL_FILE="${OUTPUT_DIR}/crf_model"
export CRF_TRAINING_THREADS=2

# Fresh scratch directory; exported for bin/train-model.
export OUTPUT_DIR=$(mktemp -d)
# Generated files that are compared with the golden copies.
ACTUAL_TRAINING_FILE="${OUTPUT_DIR}/training_data.crf"
ACTUAL_TESTING_FILE="${OUTPUT_DIR}/testing_data.crf"
ACTUAL_EVAL_OUTPUT_FILE="${OUTPUT_DIR}/eval_output"

TESTING_OUTPUT_FILE="${OUTPUT_DIR}/testing_output"
EVAL_OUTPUT_FILE="${OUTPUT_DIR}/eval_output"
# Train a new model.
bin/train-model
cat "$ACTUAL_EVAL_OUTPUT_FILE"

# Check against golden output.
GOLDEN_DIR=tests/golden
GOLDEN_CRF_TRAINING_FILE="${GOLDEN_DIR}/training_data.crf"
GOLDEN_CRF_TESTING_FILE="${GOLDEN_DIR}/testing_data.crf"
GOLDEN_EVAL_OUTPUT_FILE="${GOLDEN_DIR}/eval_output"

# Generate the training data and compare it with the golden copy.
bin/generate_data \
--data-path="$LABELLED_DATA_FILE" \
--count="$COUNT_TRAIN" \
--offset=0 > "$CRF_TRAINING_FILE"

diff --context=2 "$GOLDEN_CRF_TRAINING_FILE" "$CRF_TRAINING_FILE"

# Generate the testing data, starting at offset COUNT_TRAIN, and compare it
# with the golden copy.
bin/generate_data \
--data-path="$LABELLED_DATA_FILE" \
--count="$COUNT_TEST" \
--offset=$COUNT_TRAIN > "$CRF_TESTING_FILE"

diff --context=2 "$GOLDEN_CRF_TESTING_FILE" "$CRF_TESTING_FILE"

# NOTE(review): the model is learned from CRF_TESTING_FILE, the same file that
# crf_test evaluates below, rather than CRF_TRAINING_FILE. Confirm that this
# is intended.
crf_learn \
--thread="$CRF_TRAINING_THREADS" \
"$CRF_LEARN_TEMPLATE" "$CRF_TESTING_FILE" "$CRF_MODEL_FILE"

crf_test \
--model="$CRF_MODEL_FILE" \
"$CRF_TESTING_FILE" > "$TESTING_OUTPUT_FILE"

python bin/evaluate.py "$TESTING_OUTPUT_FILE" > "$EVAL_OUTPUT_FILE"
cat "$EVAL_OUTPUT_FILE"

diff "$GOLDEN_EVAL_OUTPUT_FILE" "$EVAL_OUTPUT_FILE"
# Compare each generated file with its golden copy. diff exits non-zero when
# the files differ, which aborts the script because of `set -e` above.
diff --context=2 "$GOLDEN_CRF_TRAINING_FILE" "$ACTUAL_TRAINING_FILE"
diff --context=2 "$GOLDEN_CRF_TESTING_FILE" "$ACTUAL_TESTING_FILE"
diff "$GOLDEN_EVAL_OUTPUT_FILE" "$ACTUAL_EVAL_OUTPUT_FILE"

# Remove the scratch directory. Quoting keeps a path containing whitespace or
# glob characters as a single argument, and `--` stops it being read as an
# option.
rm -rf -- "$OUTPUT_DIR"

0 comments on commit 89ca41d

Please sign in to comment.