# Feature Engineering - Simply Recipes Ingredients for CRF

In [22]:
import argparse
import nltk
import pandas as pd
import pycrfsuite
import numpy as np
import re
# Fetch the NLTK 'averaged_perceptron_tagger' resource (no-op when already up to date).
# NOTE(review): presumably this is the model behind the nltk.pos_tag call used later — confirm.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/markishab/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## NYT CRF MODEL

In [2]:
# Load the NYT ingredient snapshot and discard its 'index' column in one expression
nyt_ing = pd.read_csv('../../data/01_raw/nyt-ingredients-snapshot-2015.csv').drop(columns=['index'])

In [3]:
# Preview the first five rows of the loaded frame
nyt_ing.head(5)

Unnamed: 0,input,name,qty,range_end,unit,comment
0,1 1/4 cups cooked and pureed fresh butternut s...,butternut squash,1.25,0.0,cup,"cooked and pureed fresh, or 1 10-ounce package..."
1,1 cup peeled and cooked fresh chestnuts (about...,chestnuts,1.0,0.0,cup,"peeled and cooked fresh (about 20), or 1 cup c..."
2,"1 medium-size onion, peeled and chopped",onion,1.0,0.0,,"medium-size, peeled and chopped"
3,"2 stalks celery, chopped coarse",celery,2.0,0.0,stalk,chopped coarse
4,1 1/2 tablespoons vegetable oil,vegetable oil,1.5,0.0,tablespoon,


In [4]:
# Tokenise every raw ingredient phrase.
# A whole number followed by a single-digit fraction is first joined with '$'
# (e.g. "1 1/4" -> "1$1/4") so that splitting on spaces keeps it as one token.
ingredients_list = list(nyt_ing.input)
ingredients_list_new = []
for ingredient in ingredients_list:
    if isinstance(ingredient, str):
        ing_update = re.sub(r'(\d+)\s+(\d)/(\d)', r'\1$\2/\3', ingredient)
        ingredients_list_new.append(ing_update.split(" "))
    else:
        # Non-string inputs (e.g. NaN for a missing phrase) cannot be tokenised;
        # keep a NaN placeholder so positions stay aligned with the source rows.
        ingredients_list_new.append(np.nan)

In [16]:
# Pull each column of interest out of the frame as a plain Python list
input_list, name_list, qty_list, unit_list, comment_list = (
    list(nyt_ing[column])
    for column in ('input', 'name', 'qty', 'unit', 'comment')
)

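Let's join whole numbers and their fractions with a dollar sign in the input list (e.g. `1 1/4` becomes `1$1/4`), so that a mixed number stays together as a single token.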

In [27]:
# Join "<whole> <n>/<d>" into "<whole>$<n>/<d>" in every input phrase.
input_list_new = []
for inp in input_list:
    if isinstance(inp, str):
        input_list_new.append(re.sub(r'(\d+)\s+(\d)/(\d)', r'\1$\2/\3', inp))
    else:
        # Non-string inputs (e.g. NaN for a missing phrase) are kept as NaN
        # so the list stays aligned with the other column lists.
        input_list_new.append(np.nan)

In [28]:
# Map every (fraction-clumped) input phrase to its labelled fields.
# A phrase that occurs more than once keeps the fields of its last occurrence.
ing_dict = {}
for inpu, name, qty, unt, comment in zip(input_list_new, name_list, qty_list, unit_list, comment_list):
    ing_dict[inpu] = dict(name=name, qty=qty, unit=unt, comment=comment)

In [29]:
# Preview only the first few entries; echoing the whole dictionary floods the notebook output.
dict(list(ing_dict.items())[:10])

{'1$1/4 cups cooked and pureed fresh butternut squash, or 1 10-ounce package frozen squash, defrosted': {'name': 'butternut squash',
  'qty': 1.25,
  'unit': 'cup',
  'comment': 'cooked and pureed fresh, or 1 10-ounce package frozen squash, defrosted'},
 '1 cup peeled and cooked fresh chestnuts (about 20), or 1 cup canned, unsweetened chestnuts': {'name': 'chestnuts',
  'qty': 1.0,
  'unit': 'cup',
  'comment': 'peeled and cooked fresh (about 20), or 1 cup canned, unsweetened'},
 '1 medium-size onion, peeled and chopped': {'name': 'onion',
  'qty': 1.0,
  'unit': nan,
  'comment': 'medium-size, peeled and chopped'},
 '2 stalks celery, chopped coarse': {'name': 'celery',
  'qty': 2.0,
  'unit': 'stalk',
  'comment': 'chopped coarse'},
 '1$1/2 tablespoons vegetable oil': {'name': 'vegetable oil',
  'qty': 1.5,
  'unit': 'tablespoon',
  'comment': nan},
 nan: {'name': nan, 'qty': 0.0, 'unit': nan, 'comment': nan},
 '2 tablespoons unflavored gelatin, dissolved in 1/2 cup water': {'name': '

---

## CRF TUTORIAL: Performing Sequence Labelling using CRF in Python

In [7]:
from bs4 import BeautifulSoup as bs
from bs4.element import Tag
import codecs

# Read the data file as UTF-8 text and parse it with the "html5lib" parser.
# The built-in open() is used instead of the legacy codecs.open().
with open("../../data/tutorials/reuters.xml", "r", encoding="utf-8") as infile:
    soup = bs(infile, "html5lib")

### Put the data into the form the CRF expects

In [8]:
# Build one list of (word, label) pairs per <document> element.
docs = []
for elem in soup.find_all("document"):
    texts = []

    # Loop through each child of the element under "textwithnamedentities"
    for c in elem.find("textwithnamedentities").children:
        # Only Tag children carry labelled text; anything else is skipped
        if not isinstance(c, Tag):
            continue
        # "N": part of a named entity, "I": irrelevant word
        label = "N" if c.name == "namedentityintext" else "I"
        # Split on single spaces and drop the empty strings left by repeated spaces
        texts.extend((w, label) for w in c.text.split(" ") if w)
    docs.append(texts)

In [9]:
# Show only the first two documents instead of echoing the whole corpus
docs[:2]

[[('Paxar', 'N'),
  ('Corp', 'N'),
  ('said', 'I'),
  ('it', 'I'),
  ('has', 'I'),
  ('acquired', 'I'),
  ('Thermo-Print', 'N'),
  ('GmbH', 'N'),
  ('of', 'I'),
  ('Lohn', 'N'),
  (',', 'I'),
  ('West', 'N'),
  ('Germany', 'N'),
  (',', 'I'),
  ('a', 'I'),
  ('distributor', 'I'),
  ('of', 'I'),
  ('Paxar', 'N'),
  ('products,', 'I'),
  ('for', 'I'),
  ('undisclosed', 'I'),
  ('terms.', 'I')],
 [('Key', 'N'),
  ('Tronic', 'N'),
  ('corp', 'N'),
  ('said', 'I'),
  ('it', 'I'),
  ('has', 'I'),
  ('received', 'I'),
  ('contracts', 'I'),
  ('to', 'I'),
  ('provide', 'I'),
  ('seven', 'I'),
  ('original', 'I'),
  ('equipment', 'I'),
  ('manufacturers', 'I'),
  ('with', 'I'),
  ('which', 'I'),
  ('it', 'I'),
  ('has', 'I'),
  ('not', 'I'),
  ('done', 'I'),
  ('business', 'I'),
  ('recently', 'I'),
  ('with', 'I'),
  ('over', 'I'),
  ('300,000', 'I'),
  ('computer', 'I'),
  ('keyboards', 'I'),
  ('for', 'I'),
  ('delivery', 'I'),
  ('within', 'I'),
  ('the', 'I'),
  ('next', 'I'),
  ('12', 'I'

In [10]:
# Attach a POS tag to every (word, label) pair, giving (word, pos, label) triples.
data = []
for doc in docs:
    # Obtain the list of tokens in the document
    tokens = [token for token, _ in doc]

    # Perform POS tagging; the result is consumed as (word, pos) pairs
    tagged = nltk.pos_tag(tokens)

    # Take the word and label from the document and the POS tag from the tagger output
    data.append([(token, pos, label) for (token, label), (_, pos) in zip(doc, tagged)])

### Construct Features

In [11]:
def word2features(doc, i):
    """Build the list of string features for the token at position ``i`` of ``doc``.

    Every entry of ``doc`` is read positionally: element 0 is the word and
    element 1 is its POS tag. The result contains, in this order:

    * the features of the token itself,
    * the features of the previous token, or ``'BOS'`` when ``i`` is not
      greater than 0 (beginning of the document),
    * the features of the next token, or ``'EOS'`` when ``i`` is not smaller
      than the last index (end of the document).
    """
    def neighbour_features(offset, entry):
        # Features of an adjacent token, prefixed with its relative position
        token, tag = entry[0], entry[1]
        described = ['%s:word.lower=%s' % (offset, token.lower())]
        for check in ('istitle', 'isupper', 'isdigit'):
            described.append('%s:word.%s=%s' % (offset, check, getattr(token, check)()))
        described.append('%s:postag=%s' % (offset, tag))
        return described

    current, current_tag = doc[i][0], doc[i][1]

    # Features shared by all words
    features = [
        'bias',
        'word.lower=' + current.lower(),
        'word[-3:]=' + current[-3:],
        'word[-2:]=' + current[-2:],
    ]
    for check in ('isupper', 'istitle', 'isdigit'):
        features.append('word.%s=%s' % (check, getattr(current, check)()))
    features.append('postag=' + current_tag)

    # Left context, or the beginning-of-document marker
    features += neighbour_features('-1', doc[i - 1]) if i > 0 else ['BOS']

    # Right context, or the end-of-document marker
    features += neighbour_features('+1', doc[i + 1]) if i < len(doc) - 1 else ['EOS']

    return features

In [12]:
from sklearn.model_selection import train_test_split

def extract_features(doc):
    """Return the feature list of every position in ``doc``, in document order."""
    per_token = []
    for position in range(len(doc)):
        per_token.append(word2features(doc, position))
    return per_token

def get_labels(doc):
    """Return the label of every ``(token, postag, label)`` triple in ``doc``, in order."""
    labels = []
    for _token, _postag, label in doc:
        labels.append(label)
    return labels

In [13]:
# One feature sequence and one label sequence per document
X = [extract_features(doc) for doc in data]
y = [get_labels(doc) for doc in data]

# Hold out 20% of the documents for testing. The seed is fixed so that the
# split, and therefore every result derived from it, is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Parameters of the model
crf_params = {
    'c1': 0.1,                             # coefficient for L1 penalty
    'c2': 0.01,                            # coefficient for L2 penalty
    'max_iterations': 200,                 # maximum number of iterations
    'feature.possible_transitions': True,  # include transitions that are possible, but not observed
}

trainer = pycrfsuite.Trainer(verbose=True)

# Hand every training sequence and its labels to the trainer
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

trainer.set_params(crf_params)

# The file name given to train() is where the model is saved
# once training has finished
trainer.train('crf.model')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 13282
Seconds required: 0.061

L-BFGS optimization
c1: 0.100000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 5627.471563
Feature norm: 1.000000
Error norm: 6199.165004
Active features: 12854
Line search trials: 1
Line search step: 0.000044
Seconds required for this iteration: 0.008

***** Iteration #2 *****
Loss: 4505.649002
Feature norm: 0.841717
Error norm: 5553.406911
Active features: 12970
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.005

***** Iteration #3 *****
Loss: 3979.210408
Feature norm: 0.819765
Error norm: 12229.037236
Active features: 8662
Line search trials: 2
Line search step: 0.500000
Seconds required for this 

In [15]:
# Load the trained model and tag every test sequence
tagger = pycrfsuite.Tagger()
tagger.open('crf.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]

# Let's take a look at one sample in the testing set (fixed index, not random)
i = 12
# Feature 1 of every token is 'word.lower=<word>'; split once so a word that
# itself contains '=' is not truncated.
sample_words = [feats[1].split("=", 1)[1] for feats in X_test[i]]
# Distinct loop names keep the global label list `y` from being overwritten
for word, predicted in zip(sample_words, y_pred[i]):
    print("%s (%s)" % (word, predicted))

donald (N)
trump (N)
and (N)
interstate (N)
properties (N)
said (I)
they (I)
were (I)
holding (I)
preliminary (I)
discussions (I)
regarding (I)
a (I)
possible (I)
joint (I)
acquisition (I)
of (I)
alexanders (N)
inc (N)
at (I)
47 (I)
dlrs (I)
per (I)
share. (I)
the (I)
possible (I)
acquisition (I)
is (I)
subject (I)
to (I)
any (I)
applicable (I)
real (I)
estate (I)
gains (I)
and (I)
transfer (I)
taxes, (I)
the (I)
joint (I)
statement (I)
said. (I)
trump (I)
and (I)
interstate (N)
, (I)
which (I)
presently (I)
own (I)
about (I)
40 (I)
pct (I)
of (I)
alexanders (N)
common (I)
stock, (I)
said (I)
they (I)
intend (I)
to (I)
keep (I)
the (I)
company (I)
as (I)
a (I)
retailer (I)
if (I)
they (I)
succed (I)
in (I)
their (I)
acquisition. (I)
there (I)
can (I)
be (I)
no (I)
assurances (I)
that (I)
the (I)
parties (I)
will (I)
reach (I)
any (I)
agreement (I)
regarding (I)
an (I)
acquisition (I)
or (I)
what (I)
price (I)
might (I)
be (I)
offered, (I)
the (I)
statement (I)
said. (I)


**SOURCE:**
* <font color='red'>Performing Sequence Labelling using CRF in Python</font>
* https://eli5.readthedocs.io/en/latest/tutorials/sklearn_crfsuite.html
* https://www.nltk.org/api/nltk.tag.html#module-nltk.tag.crf
* http://www.nltk.org/book/ch00.html
* https://python-crfsuite.readthedocs.io/en/latest/
* https://open.nytimes.com/
* CRF Suite Tutorial: http://www.chokkan.org/software/crfsuite/tutorial.html
* sklearn_crfsuite tutorial: https://eli5.readthedocs.io/en/latest/tutorials/sklearn_crfsuite.html
* NYT Ingredients Parser: https://github.com/nytimes/ingredient-phrase-tagger
* https://github.com/kulsoom-abdullah/kulsoom-abdullah.github.io/tree/master/AWS-lambda-implementation
* End to End Recipe Cuisine Classification: https://towardsdatascience.com/https-towardsdatascience-com-end-to-end-recipe-cuisine-classification-e97f4ac22104
* Performing Sequence Labelling using CRF in Python: http://www.albertauyeung.com/post/python-sequence-labelling-with-crf/