# Feature Engineering - Simply Recipes Ingredients for CRF

In [113]:
import argparse
import nltk
import pandas as pd
import pycrfsuite
# Fetch the perceptron POS-tagger data needed by the nltk.pos_tag call later
# in the notebook; the call only reports "up-to-date" when it is already present.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/markishab/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## NYT CRF MODEL

In [108]:
# Load the NYT ingredient snapshot and discard the 'index' column stored in the CSV
nyt_ing = (
    pd.read_csv('../../data/01_raw/nyt-ingredients-snapshot-2015.csv')
    .drop(columns=['index'])
)

In [109]:
# Preview the first rows to check the parsed columns
nyt_ing.head()

Unnamed: 0,input,name,qty,range_end,unit,comment
0,1 1/4 cups cooked and pureed fresh butternut s...,butternut squash,1.25,0.0,cup,"cooked and pureed fresh, or 1 10-ounce package..."
1,1 cup peeled and cooked fresh chestnuts (about...,chestnuts,1.0,0.0,cup,"peeled and cooked fresh (about 20), or 1 cup c..."
2,"1 medium-size onion, peeled and chopped",onion,1.0,0.0,,"medium-size, peeled and chopped"
3,"2 stalks celery, chopped coarse",celery,2.0,0.0,stalk,chopped coarse
4,1 1/2 tablespoons vegetable oil,vegetable oil,1.5,0.0,tablespoon,


In [126]:
# Raw ingredient phrases, one per row of the `input` column.
# Rows with a missing phrase are kept, so the list can contain NaN entries.
ingredients = nyt_ing['input'].tolist()

# Start from an empty document list
docs = []



In [127]:
# NOTE(review): this echoes the entire list; a slice such as ingredients[:10]
# would keep the cell output short.
ingredients

['1 1/4 cups cooked and pureed fresh butternut squash, or 1 10-ounce package frozen squash, defrosted',
 '1 cup peeled and cooked fresh chestnuts (about 20), or 1 cup canned, unsweetened chestnuts',
 '1 medium-size onion, peeled and chopped',
 '2 stalks celery, chopped coarse',
 '1 1/2 tablespoons vegetable oil',
 nan,
 '2 tablespoons unflavored gelatin, dissolved in 1/2 cup water',
 'Salt',
 '1 cup canned plum tomatoes with juice',
 '6 cups veal or beef stock',
 '1/3 cup Worcestershire sauce',
 '1 tablespoon Louisiana hot sauce',
 '1/2 teaspoon hot red pepper flakes',
 '4 bay leaves',
 '6 cloves garlic, crushed and chopped',
 '2 carrots, peeled and diced',
 '2 medium onions, diced',
 '6 tablespoons butter',
 '1 tablespoon Creole seasoning, or other seasonings of your choice',
 '3 pounds beef brisket',
 '1/2 cup fine dry bread crumbs',
 'Freshly ground black pepper',
 '12 oysters and their liquor',
 '3 tablespoons minced scallions',
 '1 tablespoon flour',
 '6 ounces button mushrooms, t

## CRF TUTORIAL: Performing Sequence Labelling using CRF in Python

In [None]:
from bs4 import BeautifulSoup as bs
from bs4.element import Tag
import codecs

# Read the Reuters sample file and parse its XML into a BeautifulSoup tree.
# The built-in open() with an explicit encoding is used instead of the legacy
# codecs.open(); both hand decoded UTF-8 text to the parser.
with open("../../data/tutorials/reuters.xml", "r", encoding="utf-8") as infile:
    soup = bs(infile, "html5lib")

### Put the data into the form the CRF expects

In [38]:
# Build one list of (word, label) pairs per <document> element.
docs = []
for elem in soup.find_all("document"):
    texts = []

    # Walk the direct children of the "textwithnamedentities" node
    for c in elem.find("textwithnamedentities").children:
        # Only element nodes carry tokens. isinstance() is the idiomatic type
        # check and, unlike an exact type comparison, also accepts Tag subclasses.
        if not isinstance(c, Tag):
            continue

        # "N" = part of a named entity, "I" = irrelevant word
        label = "N" if c.name == "namedentityintext" else "I"

        # Split on single spaces and drop the empty strings that
        # consecutive spaces leave behind
        texts.extend((w, label) for w in c.text.split(" ") if len(w) > 0)
    docs.append(texts)

In [110]:
# NOTE(review): this echoes every parsed document; docs[:2] would be enough
# to inspect the (word, label) structure.
docs

[[('Paxar', 'N'),
  ('Corp', 'N'),
  ('said', 'I'),
  ('it', 'I'),
  ('has', 'I'),
  ('acquired', 'I'),
  ('Thermo-Print', 'N'),
  ('GmbH', 'N'),
  ('of', 'I'),
  ('Lohn', 'N'),
  (',', 'I'),
  ('West', 'N'),
  ('Germany', 'N'),
  (',', 'I'),
  ('a', 'I'),
  ('distributor', 'I'),
  ('of', 'I'),
  ('Paxar', 'N'),
  ('products,', 'I'),
  ('for', 'I'),
  ('undisclosed', 'I'),
  ('terms.', 'I')],
 [('Key', 'N'),
  ('Tronic', 'N'),
  ('corp', 'N'),
  ('said', 'I'),
  ('it', 'I'),
  ('has', 'I'),
  ('received', 'I'),
  ('contracts', 'I'),
  ('to', 'I'),
  ('provide', 'I'),
  ('seven', 'I'),
  ('original', 'I'),
  ('equipment', 'I'),
  ('manufacturers', 'I'),
  ('with', 'I'),
  ('which', 'I'),
  ('it', 'I'),
  ('has', 'I'),
  ('not', 'I'),
  ('done', 'I'),
  ('business', 'I'),
  ('recently', 'I'),
  ('with', 'I'),
  ('over', 'I'),
  ('300,000', 'I'),
  ('computer', 'I'),
  ('keyboards', 'I'),
  ('for', 'I'),
  ('delivery', 'I'),
  ('within', 'I'),
  ('the', 'I'),
  ('next', 'I'),
  ('12', 'I'

In [58]:
# Attach a part-of-speech tag to every token, turning each document into a
# list of (word, pos, label) triples.
data = []
for doc in docs:

    # Obtain the list of tokens in the document
    tokens = [token for token, _label in doc]

    # Perform POS tagging on the whole token sequence at once
    tagged = nltk.pos_tag(tokens)

    # Pair each (word, label) with its tag; the word echoed back by the
    # tagger is not needed, so it is discarded.
    data.append([(w, pos, label) for (w, label), (_, pos) in zip(doc, tagged)])

### Construct Features

In [60]:
def _neighbour_features(prefix, token):
    """Describe an adjacent token; every feature name starts with `prefix`."""
    text, tag = token[0], token[1]
    return [
        prefix + 'word.lower=' + text.lower(),
        prefix + 'word.istitle=%s' % text.istitle(),
        prefix + 'word.isupper=%s' % text.isupper(),
        prefix + 'word.isdigit=%s' % text.isdigit(),
        prefix + 'postag=' + tag,
    ]


def word2features(doc, i):
    """Build the list of string features for the token at position `i`.

    `doc` is a sequence whose items hold the word at index 0 and its POS tag
    at index 1. The result always starts with the features of the token
    itself, followed by features of the previous token (or 'BOS' when there
    is none) and of the next token (or 'EOS' when there is none).
    """
    token, tag = doc[i][0], doc[i][1]

    # Features of the current token
    feats = [
        'bias',
        'word.lower=' + token.lower(),
        'word[-3:]=' + token[-3:],
        'word[-2:]=' + token[-2:],
        'word.isupper=%s' % token.isupper(),
        'word.istitle=%s' % token.istitle(),
        'word.isdigit=%s' % token.isdigit(),
        'postag=' + tag,
    ]

    # Left context, or the beginning-of-document marker
    if i > 0:
        feats.extend(_neighbour_features('-1:', doc[i - 1]))
    else:
        feats.append('BOS')

    # Right context, or the end-of-document marker
    if i < len(doc) - 1:
        feats.extend(_neighbour_features('+1:', doc[i + 1]))
    else:
        feats.append('EOS')

    return feats

In [64]:
from sklearn.model_selection import train_test_split

# Turn one document into its per-token feature lists
def extract_features(doc):
    """Return the word2features() result for every position in `doc`, in order."""
    features = []
    for position in range(len(doc)):
        features.append(word2features(doc, position))
    return features

# A function for generating the list of labels for each document
def get_labels(doc):
    """Return the label of every (token, postag, label) triple in `doc`, in order."""
    labels = []
    for _token, _postag, label in doc:
        labels.append(label)
    return labels

In [69]:
# Feature sequences and label sequences, one entry per document
X = [extract_features(doc) for doc in data]
y = [get_labels(doc) for doc in data]

# Hold out 20% of the documents for testing. The seed is fixed so the split,
# and every result computed from it, is the same after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [105]:
# Create a trainer that prints its progress while it runs
trainer = pycrfsuite.Trainer(verbose=True)

# Feed each training document's feature sequence together with its labels
for features_seq, labels_seq in zip(X_train, y_train):
    trainer.append(features_seq, labels_seq)

# Training hyper-parameters
trainer.set_params({
    'c1': 0.1,                             # coefficient for L1 penalty
    'c2': 0.01,                            # coefficient for L2 penalty
    'max_iterations': 200,                 # maximum number of iterations
    'feature.possible_transitions': True,  # include possible-but-unobserved transitions
})

# Passing a file name makes train() save the finished model to that file
trainer.train('crf.model')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 14294
Seconds required: 0.064

L-BFGS optimization
c1: 0.100000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 6021.786133
Feature norm: 1.000000
Error norm: 6615.960014
Active features: 13838
Line search trials: 1
Line search step: 0.000039
Seconds required for this iteration: 0.007

***** Iteration #2 *****
Loss: 4859.547071
Feature norm: 0.846662
Error norm: 5941.492967
Active features: 13928
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.004

***** Iteration #3 *****
Loss: 4317.979820
Feature norm: 0.820325
Error norm: 13350.767322
Active features: 9302
Line search trials: 2
Line search step: 0.500000
Seconds required for this 

***** Iteration #124 *****
Loss: 230.272554
Feature norm: 46.237504
Error norm: 4.598552
Active features: 1819
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.007

***** Iteration #125 *****
Loss: 230.260749
Feature norm: 46.249288
Error norm: 8.603117
Active features: 1818
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.004

***** Iteration #126 *****
Loss: 230.243063
Feature norm: 46.264058
Error norm: 11.605929
Active features: 1820
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.004

***** Iteration #127 *****
Loss: 230.210452
Feature norm: 46.274662
Error norm: 6.766061
Active features: 1819
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.004

***** Iteration #128 *****
Loss: 230.202156
Feature norm: 46.288793
Error norm: 11.676285
Active features: 1818
Line search trials: 1
Line search step: 1.000000
Seconds required for t

***** Iteration #200 *****
Loss: 229.186344
Feature norm: 46.355200
Error norm: 2.826599
Active features: 1762
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.005

L-BFGS terminated with the maximum number of iterations
Total seconds required for training: 0.850

Storing the model
Number of active features: 1762 (14294)
Number of active attributes: 1300 (13153)
Number of active labels: 2 (2)
Writing labels
Writing attributes
Writing feature references for transitions
Writing feature references for attributes
Seconds required: 0.006



In [107]:
# Load the trained model and predict a label sequence for every test document
tagger = pycrfsuite.Tagger()
tagger.open('crf.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]

# Look at one fixed document from the testing set
i = 12

# Feature 1 of every token is 'word.lower=<token>', so the text after the first
# '=' is the lower-cased token; maxsplit=1 keeps tokens that contain '=' intact.
sample_tokens = [feats[1].split("=", 1)[1] for feats in X_test[i]]

# The loop names deliberately avoid `x` and `y` so that the label list `y`
# defined earlier in the notebook is not overwritten.
for predicted_label, token in zip(y_pred[i], sample_tokens):
    print("%s (%s)" % (token, predicted_label))

autospa (I)
corp (I)
said (I)
it (I)
will (I)
redeem (I)
all (I)
its (I)
common (I)
stock (I)
purchase (I)
warrants (I)
on (I)
may (I)
five (I)
at (I)
7.5 (I)
cts (I)
each. (I)
through (I)
may (I)
four, (I)
each (I)
warrant (I)
may (I)
be (I)
exercised (I)
into (I)
one (I)
common (I)
share (I)
at (I)
1.75 (I)
dlrs. (I)


**SOURCES:**
* <font color='red'>Performing Sequence Labelling using CRF in Python</font>
* https://eli5.readthedocs.io/en/latest/tutorials/sklearn_crfsuite.html
* https://www.nltk.org/api/nltk.tag.html#module-nltk.tag.crf
* http://www.nltk.org/book/ch00.html
* https://python-crfsuite.readthedocs.io/en/latest/
* https://open.nytimes.com/
* CRF Suite Tutorial: http://www.chokkan.org/software/crfsuite/tutorial.html
* sklearn_crfsuite tutorial: https://eli5.readthedocs.io/en/latest/tutorials/sklearn_crfsuite.html
* NYT Ingredients Parser: https://github.com/nytimes/ingredient-phrase-tagger
* https://github.com/kulsoom-abdullah/kulsoom-abdullah.github.io/tree/master/AWS-lambda-implementation
* End to End Recipe Cuisine Classification: https://towardsdatascience.com/https-towardsdatascience-com-end-to-end-recipe-cuisine-classification-e97f4ac22104
* Performing Sequence Labelling using CRF in Python: http://www.albertauyeung.com/post/python-sequence-labelling-with-crf/