# Feature Engineering - Simply Recipes Ingredients for CRF

In [1]:
import argparse
import nltk
import pandas as pd
import pycrfsuite
import numpy as np
import re
# Fetch the 'averaged_perceptron_tagger' resource (reported as already
# up-to-date when it is present locally).
# NOTE(review): presumably required by the nltk.pos_tag call further down - confirm.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/markishab/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## NYT CRF MODEL

In [2]:
# Load the NYT ingredient snapshot and discard its 'index' column.
nyt_ing = (
    pd.read_csv('../../data/01_raw/nyt-ingredients-snapshot-2015.csv')
    .drop(columns=['index'])
)

In [3]:
# Preview the first rows of the frame after the 'index' column was dropped.
nyt_ing.head()

Unnamed: 0,input,name,qty,range_end,unit,comment
0,1 1/4 cups cooked and pureed fresh butternut s...,butternut squash,1.25,0.0,cup,"cooked and pureed fresh, or 1 10-ounce package..."
1,1 cup peeled and cooked fresh chestnuts (about...,chestnuts,1.0,0.0,cup,"peeled and cooked fresh (about 20), or 1 cup c..."
2,"1 medium-size onion, peeled and chopped",onion,1.0,0.0,,"medium-size, peeled and chopped"
3,"2 stalks celery, chopped coarse",celery,2.0,0.0,stalk,chopped coarse
4,1 1/2 tablespoons vegetable oil,vegetable oil,1.5,0.0,tablespoon,


In [4]:
# Replace every NaN cell with the sentinel string "missing".
nyt_ing = nyt_ing.fillna("missing")

Let's split each ingredient string into a list of tokens.

In [5]:
# Tokenise every raw ingredient phrase.
# A mixed fraction such as "1 1/2" is first fused into the single token
# "1$1/2" so that the split on spaces below keeps it in one piece.
ingredients_list = list(nyt_ing.input)
ingredients_list_new = []
for ingredient in ingredients_list:
    if isinstance(ingredient, str):
        ing_update = re.sub(r'(\d+)\s+(\d)/(\d)', r'\1$\2/\3', ingredient)
        ingredients_list_new.append(ing_update.split(" "))
    else:
        # Non-text cells cannot be tokenised; store NaN so the result keeps
        # one entry per input. The explicit type check replaces a bare
        # `except:`, which would also have hidden unrelated errors
        # (including KeyboardInterrupt).
        ingredients_list_new.append(np.nan)

Let's build a dictionary of the nyt dataset

In [6]:
# input_list = list(nyt_ing.input)
# name_list = list(nyt_ing.name)
# qty_list = list(nyt_ing.qty)
# unit_list = list(nyt_ing.unit)
# comment_list = list(nyt_ing.comment)

In [7]:
# # Let's add the dollar sign to the input list
# input_list_new = []
# for i in ingredients_list_new:
#     try:
#         input_list_new.append(re.sub(r'(\d+)\s+(\d)/(\d)', r'\1$\2/\3', inp))
#     except:
#         input_list_new.append(np.nan)

In [8]:
def singularize(word):
    """
    Map a known plural unit name to its singular form.

    Only the units listed in the lookup table below are handled; any other
    word (including one that is already singular) is returned unchanged.
    A stand-in for the pattern.en singularize function.
    """
    plural_to_singular = {
        "cups": "cup",
        "tablespoons": "tablespoon",
        "teaspoons": "teaspoon",
        "pounds": "pound",
        "ounces": "ounce",
        "cloves": "clove",
        "sprigs": "sprig",
        "pinches": "pinch",
        "bunches": "bunch",
        "slices": "slice",
        "grams": "gram",
        "heads": "head",
        "quarts": "quart",
        "stalks": "stalk",
        "pints": "pint",
        "pieces": "piece",
        "sticks": "stick",
        "dashes": "dash",
        "fillets": "fillet",
        "cans": "can",
        "ears": "ear",
        "packages": "package",
        "strips": "strip",
        "bulbs": "bulb",
        "bottles": "bottle",
    }

    # Fall back to the word itself when it is not a known plural.
    return plural_to_singular.get(word, word)

I can't make the name, quantity and comment match up with the right word. Let's re-evaluate what I'm doing. Do I really need to do it this way?

### Let's try to make a tuple list from this dataframe instead

In [9]:
# Build one labelled sequence per row of nyt_ing:
#   [(qty, 'qty'), (unit, 'unit'), (name, 'name'), (comment, 'comment')]
# Columns are read by label instead of by position (row[1], row[2], ...):
# integer keys on a string-labelled row rely on a positional fallback that
# recent pandas versions deprecate, and they silently pick the wrong field if
# the column order ever changes. Zipping the columns also avoids building a
# Series object per row as iterrows() does.
nyt_ing_tuple = [
    [(qty, 'qty'), (unit, 'unit'), (name, 'name'), (comment, 'comment')]
    for qty, unit, name, comment in zip(
        nyt_ing['qty'], nyt_ing['unit'], nyt_ing['name'], nyt_ing['comment']
    )
]

Let's break up the tuples with multiple words. 

In [10]:
# Expand multi-word values so that every token carries its own label, e.g.
# ('butternut squash', 'name') -> ('butternut', 'name'), ('squash', 'name').
nyt_ing_tuple_new = []
for sub_list in nyt_ing_tuple:
    sub_ls = []
    for value, label in sub_list:
        if isinstance(value, str):
            # split() without a separator collapses runs of whitespace and
            # ignores leading/trailing whitespace, so no empty-string tokens
            # are produced (split(" ") yields '' for input like "a  b").
            sub_ls.extend((token, label) for token in value.split())
        else:
            # Non-string values (e.g. a numeric qty) stay a single token.
            sub_ls.append((value, label))
    nyt_ing_tuple_new.append(sub_ls)

Let's remove the tuples whose value is the placeholder `missing`.

In [20]:
# Drop every token whose value is the 'missing' placeholder.
# Each list is rebuilt (and written back in place through slice assignment)
# instead of calling .remove() while iterating over it: removing during
# iteration shifts the remaining items left, so the entry right after a
# removed one is skipped and back-to-back 'missing' tokens would survive.
for ingredient in nyt_ing_tuple_new:
    ingredient[:] = [word for word in ingredient if word[0] != 'missing']

Strip leading and trailing punctuation (the characters `) ! , . ? (`) from each token

In [72]:
# Cast every value to str and trim the characters ) ! , . ? ( from both of
# its ends; the label of each pair is carried over unchanged.
crf_data = [
    [(str(pair[0]).strip(')!,.?('), pair[1]) for pair in sub_list]
    for sub_list in nyt_ing_tuple_new
]

In [75]:
# Show only the first few labelled sequences; displaying the whole list
# floods the notebook output.
crf_data[:5]

[[('1.25', 'qty'),
  ('cup', 'unit'),
  ('butternut', 'name'),
  ('squash', 'name'),
  ('cooked', 'comment'),
  ('and', 'comment'),
  ('pureed', 'comment'),
  ('fresh', 'comment'),
  ('or', 'comment'),
  ('1', 'comment'),
  ('10-ounce', 'comment'),
  ('package', 'comment'),
  ('frozen', 'comment'),
  ('squash', 'comment'),
  ('defrosted', 'comment')],
 [('1.0', 'qty'),
  ('cup', 'unit'),
  ('chestnuts', 'name'),
  ('peeled', 'comment'),
  ('and', 'comment'),
  ('cooked', 'comment'),
  ('fresh', 'comment'),
  ('about', 'comment'),
  ('20', 'comment'),
  ('or', 'comment'),
  ('1', 'comment'),
  ('cup', 'comment'),
  ('canned', 'comment'),
  ('unsweetened', 'comment')],
 [('1.0', 'qty'),
  ('onion', 'name'),
  ('medium-size', 'comment'),
  ('peeled', 'comment'),
  ('and', 'comment'),
  ('chopped', 'comment')],
 [('2.0', 'qty'),
  ('stalk', 'unit'),
  ('celery', 'name'),
  ('chopped', 'comment'),
  ('coarse', 'comment')],
 [('1.5', 'qty'),
  ('tablespoon', 'unit'),
  ('vegetable', 'name'),

---

---

## CRF TUTORIAL: Performing Sequence Labelling using CRF in Python

In [None]:
from bs4 import BeautifulSoup as bs
from bs4.element import Tag
import codecs

# Read the data file and parse it with BeautifulSoup's "html5lib" parser.
# The builtin open() with an explicit encoding replaces the legacy
# codecs.open() helper.
with open("../../data/tutorials/reuters.xml", "r", encoding="utf-8") as infile:
    soup = bs(infile, "html5lib")

### Put the data into the form the CRF expects

In [None]:
# Turn every <document> element into a list of (word, label) pairs.
docs = []
for elem in soup.find_all("document"):
    texts = []

    # Walk the direct children of the "textwithnamedentities" element
    for c in elem.find("textwithnamedentities").children:
        # Children that are not plain Tag objects are ignored
        if type(c) is not Tag:
            continue
        # "N": part of a named entity, "I": irrelevant word
        label = "N" if c.name == "namedentityintext" else "I"
        # Split on single spaces and discard the empty pieces
        texts.extend((w, label) for w in c.text.split(" ") if w)
    docs.append(texts)

In [None]:
# Preview the first two documents only; displaying every document would
# flood the notebook output.
docs[:2]

In [None]:
# Add a part-of-speech tag to every token:
# (word, label) pairs become (word, pos, label) triples.
data = []
for doc in docs:
    # Words of this document, in order
    words = [token for token, _label in doc]

    # POS-tag the whole document in one call
    pos_tagged = nltk.pos_tag(words)

    # Merge the original pair with the tag found for the same position
    triples = [
        (w, pos, label)
        for (w, label), (_tagged_word, pos) in zip(doc, pos_tagged)
    ]
    data.append(triples)

### Construct Features

In [None]:
def word2features(doc, i):
    """
    Build the list of string features for the token at position ``i``.

    Every entry of ``doc`` is read as ``entry[0]`` (the word) and
    ``entry[1]`` (its POS tag). The result contains, in this order:
    the features of the token itself, the features of the previous token
    (or the marker 'BOS' when ``i`` is not greater than 0), and the features
    of the next token (or the marker 'EOS' when ``i`` is not below the last
    index).
    """

    def neighbour_features(offset, position):
        # Features of an adjacent token, each prefixed with its relative offset.
        token = doc[position][0]
        tag = doc[position][1]
        return [
            '{}:word.lower='.format(offset) + token.lower(),
            '{}:word.istitle={}'.format(offset, token.istitle()),
            '{}:word.isupper={}'.format(offset, token.isupper()),
            '{}:word.isdigit={}'.format(offset, token.isdigit()),
            '{}:postag='.format(offset) + tag,
        ]

    current_word = doc[i][0]
    current_tag = doc[i][1]

    # Features every token gets
    features = [
        'bias',
        'word.lower=' + current_word.lower(),
        'word[-3:]=' + current_word[-3:],
        'word[-2:]=' + current_word[-2:],
        'word.isupper={}'.format(current_word.isupper()),
        'word.istitle={}'.format(current_word.istitle()),
        'word.isdigit={}'.format(current_word.isdigit()),
        'postag=' + current_tag,
    ]

    # Previous token, or the beginning-of-document marker
    if i > 0:
        features.extend(neighbour_features('-1', i - 1))
    else:
        features.append('BOS')

    # Next token, or the end-of-document marker
    if i < len(doc) - 1:
        features.extend(neighbour_features('+1', i + 1))
    else:
        features.append('EOS')

    return features

In [None]:
from sklearn.model_selection import train_test_split

# A function for extracting features in documents
def extract_features(doc):
    """Return one word2features() result per position of ``doc``, in order."""
    per_token = []
    for position in range(len(doc)):
        per_token.append(word2features(doc, position))
    return per_token

# A function for generating the list of labels for each document
def get_labels(doc):
    """Return the label of every (token, postag, label) triple in ``doc``."""
    labels = []
    for _token, _postag, label in doc:
        labels.append(label)
    return labels

In [None]:
# One feature sequence (X) and one label sequence (y) per document.
X = [extract_features(doc) for doc in data]
y = [get_labels(doc) for doc in data]

# Hold out 20% of the documents for testing. random_state pins the shuffle so
# the split - and with it the trained model and the test sample inspected
# later - is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
trainer = pycrfsuite.Trainer(verbose=True)

# Hand every training sequence, together with its labels, to the trainer
for features_seq, labels_seq in zip(X_train, y_train):
    trainer.append(features_seq, labels_seq)

# Parameters of the model
crf_params = {
    'c1': 0.1,              # coefficient for L1 penalty
    'c2': 0.01,             # coefficient for L2 penalty
    'max_iterations': 200,  # maximum number of iterations
    # whether to include transitions that are possible, but not observed
    'feature.possible_transitions': True,
}
trainer.set_params(crf_params)

# The file name passed to train() is where the model is saved once
# training has finished
trainer.train('crf.model')

In [None]:
# Load the trained model and tag every sequence of the test set.
tagger = pycrfsuite.Tagger()
tagger.open('crf.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]

# Let's take a look at one sample in the testing set
i = 12

# The second feature of every token is 'word.lower=<token>' (see
# word2features), so the lower-cased token is recovered from it.
# maxsplit=1 keeps tokens intact when they themselves contain '='.
sample_tokens = [
    token_features[1].split("=", 1)[1] for token_features in X_test[i]
]

# Descriptive loop names are used on purpose: looping with `x` and `y` here
# would overwrite the global label list `y` built for train_test_split.
for predicted_label, token in zip(y_pred[i], sample_tokens):
    print("%s (%s)" % (token, predicted_label))

**SOURCE:**
* <font color='red'>Performing Sequence Labelling using CRF in Python</font>
* https://eli5.readthedocs.io/en/latest/tutorials/sklearn_crfsuite.html
* https://www.nltk.org/api/nltk.tag.html#module-nltk.tag.crf
* http://www.nltk.org/book/ch00.html
* https://python-crfsuite.readthedocs.io/en/latest/
* https://open.nytimes.com/
* CRF Suite Tutorial: http://www.chokkan.org/software/crfsuite/tutorial.html
* sklearn_crfsuite tutorial: https://eli5.readthedocs.io/en/latest/tutorials/sklearn_crfsuite.html
* NYT Ingredients Parser: https://github.com/nytimes/ingredient-phrase-tagger
* https://github.com/kulsoom-abdullah/kulsoom-abdullah.github.io/tree/master/AWS-lambda-implementation
* End to End Recipe Cuisine Classification: https://towardsdatascience.com/https-towardsdatascience-com-end-to-end-recipe-cuisine-classification-e97f4ac22104
* Performing Sequence Labelling using CRF in Python: http://www.albertauyeung.com/post/python-sequence-labelling-with-crf/