In [1]:
import argparse
import nltk
import pandas as pd
import pycrfsuite
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer

nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/markishab/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
recipe_sr = pd.read_pickle('../../data/02_intermediate/recipes_sr_final.pickle')

In [3]:
recipe_sr.columns

Index(['title', 'prep_time', 'cook_time', 'recipe_yield', 'tags',
       'ingredients', 'entire_card', 'recipe_links'],
      dtype='object')

In [4]:
recipe_sr.head()

Unnamed: 0,title,prep_time,cook_time,recipe_yield,tags,ingredients,entire_card,recipe_links
0,Grilled Cheese BLT,10 minutes,10 minutes,4 sandwiches,"'Dinner', 'Lunch', 'Sandwich', 'Favorite Summe...","[8 slices sourdough bread, 4 tablespoon unsalt...","['\n\n ', '\n ...",https://www.simplyrecipes.com/recipes/grilled_...
1,Pulled Pork Sandwich,10 minutes,"2 hours, 45 minutes",Serves 6 to 8,"'Dinner', 'Sandwich', 'Budget', 'Comfort Food'...","[1 large onion, chopped, 6 garlic cloves, peel...","['\n\n ', '\n ...",https://www.simplyrecipes.com/recipes/pulled_p...
2,How to Make Bacon in the Oven,5 minutes,20 minutes,12 strips,"'Tips', 'Breakfast and Brunch', 'Baking', 'How...","[12 strips bacon, 1/2 teaspoon ground black pe...","['\n\n ', '\n ...",https://www.simplyrecipes.com/recipes/how_to_m...
3,Sausage Stuffed Zucchini,15 minutes,1 hour,Serves 4,"'Dinner', 'Favorite Summer', 'Make-ahead', 'It...","[2 tablespoons extra virgin olive oil, 1/2 pou...","['\n\n ', '\n ...",https://www.simplyrecipes.com/recipes/italian_...
4,The Best Dry Rub for Ribs,5 minutes,,,"'Favorite Fall', 'Favorite Summer', 'Game Day'...",[3/4 cup packed dark brown sugar (or 1/2 cup i...,"['\n\n ', '\n ...",https://www.simplyrecipes.com/recipes/the_best...


In [5]:
ingredients_lists = list(recipe_sr.ingredients)

Let's tokenize each ingredient (making sure that the punctuation is gone)

In [6]:
ingredients_lists

[['8 slices sourdough bread',
  '4 tablespoon unsalted butter, at room temperature',
  '8 ounces (2 cups) shredded cheddar cheese',
  '2 slicing tomatoes (such as beefsteak, Brandywine, or Cherokee purple), sliced 1/4-inch thick',
  "8 to 12 slices ', 'cooked bacon",
  '12 leaves butterhead or other crispy lettuce',
  'Kosher salt and black pepper'],
 ['1 large onion, chopped',
  '6 garlic cloves, peeled',
  '1 pickled jalapeño pepper, seeded and chopped',
  '2 teaspoons Chipotle chile powder',
  '1 tablespoon tomato paste',
  '2 tablespoon Dijon mustard',
  '3/4 cup distilled white vinegar',
  '1 teaspoon paprika',
  '1/3 cup ketchup',
  '2 teaspoons Worcestershire sauce',
  '1/4 cup light brown sugar',
  'Salt',
  '1 bay leaf',
  '3 pounds pork butt shoulder roast',
  'Hamburger buns'],
 ['12 strips bacon', '1/2 teaspoon ground black pepper (optional)'],
 ['2 tablespoons extra virgin olive oil',
  '1/2 pound Italian sausage, removed from casing',
  '1 large zucchini, 12 to 14 inches 

In [7]:
# let's tokenize all the words and get rid of punctuation
tokenizer = RegexpTokenizer(r'(\d\/\d |\w+)')
token_sr = []
for recipe in ingredients_lists:
    sub_list = []
    for ingredient in recipe: 
        sub_list.append(tokenizer.tokenize(ingredient))
    token_sr.append(sub_list)

In [8]:
token_sr[5]

[['1', 'tablespoon', 'olive', 'oil'],
 ['1/2 ', 'cup', 'diced', 'white', 'onion'],
 ['1', 'pound', 'ground', 'pork'],
 ['3', 'cloves', 'garlic', 'minced'],
 ['1',
  'tablespoon',
  'grated',
  'fresh',
  'ginger',
  'about',
  '1',
  'inch',
  'peeled'],
 ['1',
  'tablespoon',
  'soy',
  'sauce',
  'use',
  'tamari',
  'to',
  'keep',
  'the',
  'recipe',
  'gluten',
  'free'],
 ['1', 'large', 'carrot', 'shredded'],
 ['1', 'medium', 'cucumber', 'sliced'],
 ['1/3 ', 'cup', 'fresh', 'parsley'],
 ['1/3 ', 'cup', 'fresh', 'basil'],
 ['1/3 ', 'cup', 'fresh', 'mint'],
 ['1', 'cup', 'dried', 'jasmine', 'rice', 'to', 'serve'],
 ['1', 'lime', 'sliced', 'to', 'serve'],
 ['Sweet', 'soy', 'sauce', 'to', 'serve'],
 ['Sesame', 'seeds', 'garnish'],
 ['1', 'pita', 'round'],
 ['2', 'tablespoons', 'Greek', 'yogurt'],
 ['Green', 'leaf', 'lettuce'],
 ['1/3 ', 'cup', 'leftover', 'ginger', 'pork'],
 ['Shredded', 'carrot'],
 ['Sliced', 'cucumber'],
 ['Sriracha', 'optional']]

Let's put this into a form that our crf model can handle

In [27]:
crf_data  = []
index = 0
for recipe in token_sr:
    sub_list = []
    for ingredient in recipe:
        pos = nltk.pos_tag(ingredient)
        sub_list.append((pos))
    crf_data.append(sub_list)
    index = index + 1

In [28]:
crf_data

[[[('8', 'CD'), ('slices', 'NNS'), ('sourdough', 'JJ'), ('bread', 'NN')],
  [('4', 'CD'),
   ('tablespoon', 'NN'),
   ('unsalted', 'JJ'),
   ('butter', 'NN'),
   ('at', 'IN'),
   ('room', 'NN'),
   ('temperature', 'NN')],
  [('8', 'CD'),
   ('ounces', 'NNS'),
   ('2', 'CD'),
   ('cups', 'NNS'),
   ('shredded', 'VBD'),
   ('cheddar', 'NN'),
   ('cheese', 'NN')],
  [('2', 'CD'),
   ('slicing', 'VBG'),
   ('tomatoes', 'NNS'),
   ('such', 'JJ'),
   ('as', 'IN'),
   ('beefsteak', 'JJ'),
   ('Brandywine', 'NNP'),
   ('or', 'CC'),
   ('Cherokee', 'NNP'),
   ('purple', 'NN'),
   ('sliced', 'VBD'),
   ('1', 'CD'),
   ('4', 'CD'),
   ('inch', 'NN'),
   ('thick', 'NN')],
  [('8', 'CD'),
   ('to', 'TO'),
   ('12', 'CD'),
   ('slices', 'NNS'),
   ('cooked', 'VBD'),
   ('bacon', 'NN')],
  [('12', 'CD'),
   ('leaves', 'NNS'),
   ('butterhead', 'VBP'),
   ('or', 'CC'),
   ('other', 'JJ'),
   ('crispy', 'NN'),
   ('lettuce', 'NN')],
  [('Kosher', 'NNP'),
   ('salt', 'NN'),
   ('and', 'CC'),
   ('black'

### Let's make some features

In [62]:
crf_data[0][0][0]

('8', 'CD')

In [71]:
i = 0
for doc in crf_data:
    print(doc[0][0][0])
    print('\n')
#     word = doc[i][0]
#     postag = doc[i][1]
#     i = i + 1

8


1


12


2


3/4 


1


1


1


1


1


2


2


12


12


1


2


15


2


1


1


8


4


2


5


4


4


1


3


1


1


Small


6


2




IndexError: list index out of range

In [72]:
def word2features(data, i):
    for doc in data: 
        word = doc[i][i][0]
        postag = doc[i][i][1]
        srindex = 0

        # Common features for all words
        features = [
            'bias',
            'word.lower=' + word.lower(),
            'word[-3:]=' + word[-3:],
            'word[-2:]=' + word[-2:],
            'word.istitle=%s' % word.istitle(),
            'postag=' + postag, 
            'srindex=' + str(srindex),
        ]

        # Features for words that are not
        # at the beginning of a document
        if i > 0:
            word1 = doc[i-1][0]
            postag1 = doc[i-1][1]
            features.extend([
                '-1:word.lower=' + word1.lower(),
                '-1:word.istitle=%s' % word1.istitle(),
                '-1:postag=' + postag1
            ])
        else:
            # Indicate that it is the 'beginning of a document'
            features.append('BOS')

        # Features for words that are not
        # at the end of a document
        if i < len(doc)-1:
            word1 = doc[i+1][0]
            postag1 = doc[i+1][1]
            features.extend([
                '+1:word.lower=' + word1.lower(),
                '+1:word.istitle=%s' % word1.istitle(),
                '+1:postag=' + postag1
            ])
        else:
            # Indicate that it is the 'end of a document'
            features.append('EOS')
            
        srindex = srindex + 1

        return features

In [73]:
# A function for extracting features in documents
def extract_features(doc):
    return [word2features(doc, i) for i in range(len(doc))]

In [74]:
X = [extract_features(doc) for doc in crf_data]

IndexError: string index out of range