# Tagging Recipes - Simply Recipes 

In [1]:
import argparse
import nltk
import pandas as pd
import pycrfsuite
import numpy as np
import re
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer

nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/markishab/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## Read in Simply Recipe and take a look at what we have

In [95]:
recipe_sr = pd.read_pickle('../../data/02_intermediate/recipes_sr_final.pickle')

In [96]:
recipe_sr.head()

Unnamed: 0,title,prep_time,cook_time,recipe_yield,tags,ingredients,byline,link_food
0,Grilled Cheese BLT,10 minutes,10 minutes,4 sandwiches,"['Dinner', 'Lunch', 'Sandwich', 'Favorite Summ...","[8 slices sourdough bread', '4 tablespoon unsa...",Aaron Hutcherson,https://www.simplyrecipes.com/recipes/grilled_...
1,Pulled Pork Sandwich,10 minutes,"2 hours, 45 minutes",Serves 6 to 8,"['Dinner', 'Sandwich', 'Budget', 'Comfort Food...",[],Elise Bauer,https://www.simplyrecipes.com/recipes/pulled_p...
2,How to Make Bacon in the Oven,5 minutes,20 minutes,12 strips,"['Tips', 'Breakfast and Brunch', 'Baking', 'Ho...","[12 strips bacon', '1/2 teaspoon ground black ...",Nick Evans,https://www.simplyrecipes.com/recipes/how_to_m...
3,Sausage Stuffed Zucchini,15 minutes,1 hour,Serves 4,"['Dinner', 'Favorite Summer', 'Make-ahead', 'I...","[2 tablespoons extra virgin olive oil', '1/2 p...",Elise Bauer,https://www.simplyrecipes.com/recipes/italian_...
4,The Best Dry Rub for Ribs,5 minutes,,,"['Favorite Fall', 'Favorite Summer', 'Game Day...",[3/4 cup packed dark brown sugar (or 1/2 cup i...,Irvin Lin,https://www.simplyrecipes.com/recipes/the_best...


Although our recipes are clean, we are not able to search through this list in any meaningful way in order to figure out what ingredients are in each recipe. We need to figure out how to turn each unstructured ingredient (i.e., '8 slices sourdough bread' to something more structured like quantity: 8, unit: slices, food: sourdough bread). For this task we are going to need natural language processing. 

## Tagging Ingredients through a CRF Model 

Through a great deal of research I have found that this issue is often solved through building and deploying a CRF model. The New York Times has some wonderful resources on how to deploy this model, and using parts of their model could be really helpful in figuring out how to tag my data. 

In [113]:
ingredients_lists = list(recipe_sr.ingredients)

In [114]:
ingredients_lists

[["8 slices sourdough bread', '4 tablespoon unsalted butter, at room temperature', '8 ounces (2 cups) shredded cheddar cheese', '2 slicing tomatoes (such as beefsteak, Brandywine, or Cherokee purple), sliced 1/4-inch thick', '8 to 12 slices ', 'cooked bacon', '12 leaves butterhead or other crispy lettuce"],
 [],
 ["12 strips bacon', '1/2 teaspoon ground black pepper (optional)', ' Special equipment:', 'Wire baking rack"],
 ["2 tablespoons extra virgin olive oil', '1/2 pound Italian sausage, removed from casing', '1 large zucchini, 12 to 14 inches long (about 1 1/4 to 2 pounds) or 3 to 4 medium zucchini', '1 cup chopped onion', '3 cloves garlic, finely chopped', '2 slices bread, pulsed in food processor or blender to make fresh breadcrumbs (about 1 cup)', '2 medium tomatoes, chopped', '1/4 cup chopped fresh basil', '1/2 teaspoon dried oregano, or 2 teaspoons chopped fresh oregano', '3/4 cup shredded Parmesan cheese', '1 large egg, lightly beaten', '3/4 teaspoon salt (less or more to tas

In [127]:
ingredient_list_new = []
for recipe in ingredients_lists:
    sub_list = []
    for ingredient in recipe:
        ingredient_list_new.append(ingredient.split(','))

In [128]:
ingredient_list_new

[["8 slices sourdough bread'",
  " '4 tablespoon unsalted butter",
  " at room temperature'",
  " '8 ounces (2 cups) shredded cheddar cheese'",
  " '2 slicing tomatoes (such as beefsteak",
  ' Brandywine',
  ' or Cherokee purple)',
  " sliced 1/4-inch thick'",
  " '8 to 12 slices '",
  " 'cooked bacon'",
  " '12 leaves butterhead or other crispy lettuce"],
 ["12 strips bacon'",
  " '1/2 teaspoon ground black pepper (optional)'",
  " ' Special equipment:'",
  " 'Wire baking rack"],
 ["2 tablespoons extra virgin olive oil'",
  " '1/2 pound Italian sausage",
  " removed from casing'",
  " '1 large zucchini",
  " 12 to 14 inches long (about 1 1/4 to 2 pounds) or 3 to 4 medium zucchini'",
  " '1 cup chopped onion'",
  " '3 cloves garlic",
  " finely chopped'",
  " '2 slices bread",
  " pulsed in food processor or blender to make fresh breadcrumbs (about 1 cup)'",
  " '2 medium tomatoes",
  " chopped'",
  " '1/4 cup chopped fresh basil'",
  " '1/2 teaspoon dried oregano",
  " or 2 teaspoons 

In [118]:
ingredients_lists[0]

"8 slices sourdough bread', '4 tablespoon unsalted butter, at room temperature', '8 ounces (2 cups) shredded cheddar cheese', '2 slicing tomatoes (such as beefsteak, Brandywine, or Cherokee purple), sliced 1/4-inch thick', '8 to 12 slices ', 'cooked bacon', '12 leaves butterhead or other crispy lettuce"

Let's tokenize each ingredient and make sure that the punctuation is removed

In [7]:
# let's tokenize all the words and get rid of punctuation
tokenizer = RegexpTokenizer(r'(\d\/\d |\w+)')
token_sr = []
for recipe in ingredients_lists:
    sub_list = []
    for ingredient in recipe: 
        sub_list.append(tokenizer.tokenize(ingredient))
    token_sr.append(sub_list)

In [8]:
len(token_sr)

1748

### Feature Creation 

Now that we have tokenized our ingredients we need to put this in a form that our CRF model can handle (this means grouping recipe ingredients into a list and grouping each ingredient sentence into it's own list with tuples). 

In [9]:
crf_data  = []
index = 0
for recipe in token_sr:
    sub_list = []
    for ingredient in recipe:
        pos = nltk.pos_tag(ingredient)
        sub_list.append((pos))
    crf_data.append(sub_list)
    index = index + 1

In [10]:
len(crf_data)

1748

In [11]:
# Source: NYT Github Page 
def singularize(word):
    """
    A poor replacement for the pattern.en singularize function, but ok for now.
    """

    units = {
        "cups": u"cup",
        "tablespoons": u"tablespoon",
        "teaspoons": u"teaspoon",
        "pounds": u"pound",
        "ounces": u"ounce",
        "cloves": u"clove",
        "sprigs": u"sprig",
        "pinches": u"pinch",
        "bunches": u"bunch",
        "slices": u"slice",
        "grams": u"gram",
        "heads": u"head",
        "quarts": u"quart",
        "stalks": u"stalk",
        "pints": u"pint",
        "pieces": u"piece",
        "sticks": u"stick",
        "dashes": u"dash",
        "fillets": u"fillet",
        "cans": u"can",
        "ears": u"ear",
        "packages": u"package",
        "strips": u"strip",
        "bulbs": u"bulb",
        "bottles": u"bottle"
    }

    if word in units.keys():
        return units[word]
    else:
        return word

In [12]:
def word2features(doc, i):
    word = singularize(doc[i][0])
    postag = doc[i][1]

    # Common features for all words
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'postag=' + postag,
    ]
    if i > 0:
        word1 = doc[i-1][0]
        postag1 = doc[i-1][1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:postag=' + postag1
        ])
    else:
        # Indicate that it is the 'beginning of a document'
        features.append('BOS')
    # Features for words that are not
    # at the end of a document
    if i < len(doc)-1:
        word1 = doc[i+1][0]
        postag1 = doc[i+1][1]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:postag=' + postag1
        ])
    else:
        # Indicate that it is the 'end of a document'
        features.append('EOS')

    return features

In [13]:
# A function for extracting features in documents
def extract_features(doc):
    return [word2features(doc, i) for i in range(len(doc))]

# A function fo generating the list of labels for each document
def get_labels(doc):
    return [label for (token, postag, label) in doc]

In [14]:
crf_data_final = []
for recipe in crf_data:
    X = [extract_features(doc) for doc in recipe]
    crf_data_final.append(X)

Import our tagger

In [15]:
tagger = pycrfsuite.Tagger()
tagger.open('../../data/04_models/crf_ing_final.model')

<contextlib.closing at 0x1a18e12e10>

Let's tag the simply recipes dataset

In [16]:
sr_labels = []
for recipe in crf_data_final:
    y_pred = [tagger.tag(xseq) for xseq in recipe]
    sr_labels.append(y_pred)

In [17]:
len(sr_labels)

1748

In [18]:
recipe_titles = list(recipe_sr.title)

In [19]:
len(recipe_titles)

1748

In [20]:
len(token_sr)

1748

In [21]:
len(sr_labels)

1748

In [29]:
links = list(recipe_sr.link_food)

In [31]:
len(links)

1748

### Let's match up the tokens with their tags

One idea is to create each recipe as a nested dictionary

In [83]:
def ingredient_tagger(ingredient_sentence, ingredient_label_sentence):
    qty = []
    unit = []
    name = []
    comment = []

    for word, label in zip(ingredient_sentence, ingredient_label_sentence):
        if label == 'qty':
            qty.append(word)
        if label == 'unit':
            unit.append(word)
        if label == 'name':
            name.append(word)
        if label == 'comment':
            comment.append(word)
    return {'qty': " ".join(qty), 'unit': " ".join(unit), 'name': " ".join(name), 'comment': " ".join(comment)}

In [84]:
def recipe_tagger(single_recipe, matching_recipe_labels):
    ret = []
    for ingredient, ingredient_label in zip(single_recipe, matching_recipe_labels):
        ret.append(ingredient_tagger(ingredient, ingredient_label))
    return ret

In [89]:
def token_labels_to_dict(tokens, labels, recipe_titles, links):
    recipe_dict = {}
    recipe_link_dict = {}
    for recipe, label, title, link in zip(tokens, labels, recipe_titles, links):
        ing = recipe_tagger(recipe, label)
        recipe_dict[str(title).lower()] = ing
        recipe_link_dict[str(title).lower()] = link
    return [recipe_dict, recipe_link_dict]

In [90]:
recipe_dict_sr, recipe_links_sr = token_labels_to_dict(token_sr, sr_labels, recipe_titles, links)

In [94]:
recipe_dict_sr['tuna and tomato pasta']

[{'qty': '4',
  'unit': 'tablespoons',
  'name': 'unsalted butter',
  'comment': '1 28 ounce can of tomatoes whole or crushed Salt 1 pound pasta shells 1/2  cup ricotta cheese 1 6 ounce can tuna packed in olive oil 1/4  cup fresh basil chopped or torn'}]

In [92]:
recipe_links_sr['grilled cheese blt']

'https://www.simplyrecipes.com/recipes/grilled_cheese_blt/'

In [None]:
outfile = open('../../data/03_processed/crf_ingred_dict','wb')
pickle.dump(sr_ingredients_dict, outfile)
outfile.close()