In [99]:
import nltk
import pycrfsuite
import numpy as np

import random
from string import punctuation

import re

from itertools import chain

from nltk.tokenize import PunktSentenceTokenizer
import utils

This code is a reimplementation of the original NYT Ingredient Phrase Tagger (https://github.com/NYTimes/ingredient-phrase-tagger).

As you will see, the NYT implementation uses the CRF++ extractor to extract its training data (see the README in that repository).

I used the same extractor to produce the training data in the format accepted by CRF++.
But here, pycrfsuite accepts data in the form of a continuous list.
So we have to convert that input data into a format suitable for pycrfsuite.

In [100]:
# See the README in the link above for how to generate this text file.
# Each line is split on tabs; only rows with exactly six columns are kept.
with open('train_file') as fname:
    lines = fname.readlines()
    split_rows = (raw.strip('\n').split('\t') for raw in lines)
    items = [row for row in split_rows if len(row) == 6]

In [101]:
# PyCRFSuite expects a list of input sequences, so the flat list of token
# rows read from the train file is bucketed into sentences here.
#
# Column 1 of every row is the token's position feature ("I1", "I2", ...),
# so a row whose column 1 is "I1" starts a new sentence. Only that column is
# inspected: a token or label that happens to be the text "I1" must not
# split a sentence.

sentences = []

sent = []
for item in items:
    if item[1] == 'I1' and sent:
        sentences.append(sent)
        sent = []
    sent.append(item)

# Flush the sentence still being accumulated when the input ends; without
# this the last sentence of the file would be silently dropped.
if sent:
    sentences.append(sent)

sentences = sentences[:50000]

# Randomly shuffling the data is good practice before training.
# NOTE(review): the shuffle is unseeded, so the train/test split below differs
# between runs - consider seeding if reproducibility matters.
random.shuffle(sentences)

sentences[0]

[['Small', 'I1', 'L12', 'YesCAP', 'NoPAREN', 'B-COMMENT'],
 ['amount', 'I2', 'L12', 'NoCAP', 'NoPAREN', 'I-COMMENT'],
 ['of', 'I3', 'L12', 'NoCAP', 'NoPAREN', 'I-COMMENT'],
 ['butter', 'I4', 'L12', 'NoCAP', 'NoPAREN', 'B-NAME'],
 ['and', 'I5', 'L12', 'NoCAP', 'NoPAREN', 'I-NAME'],
 ['flour', 'I6', 'L12', 'NoCAP', 'NoPAREN', 'I-NAME'],
 ['for', 'I7', 'L12', 'NoCAP', 'NoPAREN', 'B-COMMENT'],
 ['preparing', 'I8', 'L12', 'NoCAP', 'NoPAREN', 'I-COMMENT'],
 ['baking', 'I9', 'L12', 'NoCAP', 'NoPAREN', 'I-COMMENT'],
 ['sheet', 'I10', 'L12', 'NoCAP', 'NoPAREN', 'I-COMMENT']]

In [102]:
# Hold out the first 10% of the (already shuffled) sentences as test data
# and keep the remaining 90% for training.
train_test_split_value = int(len(sentences) * 0.1)

test_data, train_data = (
    sentences[:train_test_split_value],
    sentences[train_test_split_value:],
)

# We will separate labels, features and tokens from the data.
def create_labels(sent):
    """Return the label (the last column) of every token row in ``sent``."""
    labels = []
    for row in sent:
        labels.append(row[-1])
    return labels

def create_features(sent):
    """Return every token row of ``sent`` with its last column (the label) removed."""
    feature_rows = []
    for row in sent:
        feature_rows.append(row[:-1])
    return feature_rows

def create_tokens(sent):
    """Return the token text (the first column) of every row in ``sent``."""
    tokens = []
    for row in sent:
        tokens.append(row[0])
    return tokens

# Split every training sentence into its label sequence (y) and its
# per-token feature rows (X).
y_train = list(map(create_labels, train_data))
X_train = list(map(create_features, train_data))

(X_train[0], y_train[0])

([['6', 'I1', 'L8', 'NoCAP', 'NoPAREN'],
  ['unpeeled', 'I2', 'L8', 'NoCAP', 'NoPAREN'],
  ['garlic', 'I3', 'L8', 'NoCAP', 'NoPAREN'],
  ['cloves', 'I4', 'L8', 'NoCAP', 'NoPAREN']],
 ['B-QTY', 'B-COMMENT', 'B-NAME', 'B-UNIT'])

In [103]:
# Now we create the trainer object.
# This class maintains a data set for training, and provides an interface to various training algorithms.
trainer = pycrfsuite.Trainer(verbose=False)

# Append each instance (feature sequence / label sequence pair) to the data set.
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

# These hyperparameters were taken from an online source rather than tuned
# here, because of lack of time.
trainer.set_params(
{
        'c1': 0.43,
        'c2': 0.012,
        'max_iterations': 100,
        'feature.possible_transitions': True,
        'feature.possible_states': True,
        'linesearch': 'StrongBacktracking'
    }
)

# Run the training algorithm. This call starts the training and saves the
# trained model to the file "trained_pycrfsuite", so that the model can be
# loaded later instead of being trained again; training is slow, depending
# on the data size.
trainer.train('trained_pycrfsuite')

In [104]:
# Load the trained model from the file written by the training step.
# pycrfsuite does this through a Tagger object, which is used further down
# to tag feature sequences.

tagger = pycrfsuite.Tagger()
tagger.open('trained_pycrfsuite')

<contextlib.closing at 0x7fea26384390>

In [110]:
# Sentence tokenizer used to split a block of ingredient text into phrases.
tokenizer = PunktSentenceTokenizer()

def get_sentence_features(sent):
    """Build the per-token feature rows for one ingredient phrase.

    ``sent`` is passed through ``utils.cleanUnicodeFractions`` and
    ``utils.tokenize``. For every resulting token, the row is the token
    itself followed by whatever ``utils.getFeatures(token, position, tokens)``
    yields, where ``position`` counts from 1.

    Returns a list with one feature list per token (empty if there are no
    tokens).
    """
    # Materialise the tokens exactly once. If utils.tokenize hands back a
    # one-shot iterator, converting it again inside the loop would produce an
    # empty list and every token would get features for a zero-length phrase.
    tokens = list(utils.tokenize(utils.cleanUnicodeFractions(sent)))

    sent_features = []
    for position, token in enumerate(tokens, start=1):
        # NOTE(review): the same list is shared across calls - assumes
        # utils.getFeatures does not mutate it; confirm against utils.
        token_features = [token]
        token_features.extend(utils.getFeatures(token, position, tokens))
        sent_features.append(token_features)
    return sent_features

def format_ingredient_output(tagger_output, display=False):
    """Format the tagger output into a more convenient dictionary.

    ``tagger_output`` is an iterable of ``(token, tag)`` pairs. A leading
    ``B-`` or ``I-`` is removed from each tag and the rest is lower-cased,
    so ``B-NAME`` becomes ``name``.

    ``display`` is accepted for interface compatibility; its value is not
    consulted.

    Returns an empty list when there are no pairs. Otherwise returns a list
    holding a single dict that maps every tag to its tokens joined with
    ``utils.smartJoin`` (tokens tagged ``unit`` go through
    ``utils.singularize`` first), plus an ``"input"`` key with the phrase
    rebuilt from the untouched tokens.
    """
    tokens_by_tag = {}   # tag -> every token carrying that tag, in order
    tag_runs = []        # consecutive same-tag runs as (tag, [tokens])
    previous_tag = None

    for token, raw_tag in tagger_output:
        tag = re.sub(r'^[BI]\-', "", raw_tag).lower()

        # Display side: extend the current run, or open a new one when the
        # tag changes, so the original phrase can be rebuilt later.
        if tag == previous_tag:
            tag_runs[-1][1].append(token)
        else:
            tag_runs.append((tag, [token]))
            previous_tag = tag

        # Data side. HACK: If this token is a unit, singularize it so Scoop
        # accepts it. The display run above keeps the token as written.
        if tag == "unit":
            token = utils.singularize(token)
        tokens_by_tag.setdefault(tag, []).append(token)

    if not tokens_by_tag:
        return []

    ingredient = {
        tag: utils.smartJoin(tokens) for tag, tokens in tokens_by_tag.items()
    }

    # Add the raw ingredient phrase.
    ingredient["input"] = utils.smartJoin(
        [" ".join(tokens) for _, tokens in tag_runs])

    return [ingredient]

def parse_ingredient(sent):
    """Tag one ingredient phrase and return its formatted parse.

    The phrase is converted to feature rows, tagged with the module-level
    ``tagger``, and the (token, tag) pairs are handed to
    ``format_ingredient_output``. If that yields anything, the first entry's
    ``name`` has leading/trailing periods stripped (and is set to ``''`` if
    it was absent).
    """
    features = get_sentence_features(sent)
    predicted_tags = tagger.tag(features)
    parsed = format_ingredient_output(zip(create_tokens(features), predicted_tags))

    if not parsed:
        return parsed

    first = parsed[0]
    first['name'] = first.get('name', '').strip('.')
    return parsed

def parse_recipe_ingredients(ingredient_list):
    """Wrapper around parse_ingredient so we can call it on an ingredient list.

    ``ingredient_list`` is a string containing the ingredient phrases. It is
    split into sentences with the module-level ``tokenizer``; each sentence
    has leading/trailing newlines removed and is passed to
    ``parse_ingredient``.

    Returns a flat list with the parsed entries of every sentence, in order.
    """
    # Tokenize the argument itself, never a notebook-level variable, so the
    # function works for whatever text the caller passes in.
    sentences = tokenizer.tokenize(ingredient_list)

    ingredients = []
    for sent in sentences:
        sent = sent.strip('\n')
        # Nothing to parse in a sentence that was only newlines.
        if not sent:
            continue
        ingredients.extend(parse_ingredient(sent))
    return ingredients

In [111]:
# Now we pass a sample ingredient list through the parser to check that the
# code above works end to end.
# The parsed result for each phrase is shown in the cell output.

q = '''
2 1/4 cups all-purpose flour.
1/2 teaspoon baking soda.
1 cup (2 sticks) unsalted butter, room temperature.
1/2 cup granulated sugar.
1 cup packed light-brown sugar.
1 teaspoon salt.
2 teaspoons pure vanilla extract.
2 large eggs.
2 cups (about 12 ounces) semisweet and/or milk chocolate chips.
'''

parse_recipe_ingredients(q)




# The output is a list with one dict per ingredient phrase, in the format:
# [{'comment': extra comments (only if available),
#   'input': the input phrase we passed to the extractor,
#   'name': name of the item ordered,
#   'qty': quantity required of the ordered item,
#   'unit': unit of the quantity
#  }
# ]

[{'input': '2$1/4 cups all-purpose flour.',
  'name': 'all-purpose flour',
  'qty': '2$1/4',
  'unit': 'cup'},
 {'input': '1/2 teaspoon baking soda.',
  'name': 'baking soda',
  'qty': '1/2',
  'unit': 'teaspoon'},
 {'comment': '(2 sticks) room temperature.',
  'input': '1 cup (2 sticks) unsalted butter, room temperature.',
  'name': 'unsalted butter',
  'other': ',',
  'qty': '1',
  'unit': 'cup'},
 {'comment': 'granulated',
  'input': '1/2 cup granulated sugar.',
  'name': 'sugar',
  'qty': '1/2',
  'unit': 'cup'},
 {'input': '1 cup packed light-brown sugar.',
  'name': 'packed light-brown sugar',
  'qty': '1',
  'unit': 'cup'},
 {'input': '1 teaspoon salt.', 'name': 'salt', 'qty': '1', 'unit': 'teaspoon'},
 {'comment': 'pure',
  'input': '2 teaspoons pure vanilla extract.',
  'name': 'vanilla extract',
  'qty': '2',
  'unit': 'teaspoon'},
 {'comment': 'large', 'input': '2 large eggs.', 'name': 'eggs', 'qty': '2'},
 {'comment': '(about) semisweet and/or',
  'input': '2 cups (about 12