# Feature Engineering - Simply Recipes Ingredients for CRF

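* ELI5 tutorial on inspecting sklearn_crfsuite models: https://eli5.readthedocs.io/en/latest/tutorials/sklearn_crfsuite.html
* NLTK `nltk.tag.crf` module API: https://www.nltk.org/api/nltk.tag.html#module-nltk.tag.crf
* NLTK Book: http://www.nltk.org/book/ch00.html
* python-crfsuite documentation: https://python-crfsuite.readthedocs.io/en/latest/
* NYT Open (New York Times engineering blog): https://open.nytimes.com/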
* CRF Suite Tutorial: http://www.chokkan.org/software/crfsuite/tutorial.html
* sklearn_crfsuite tutorial: https://eli5.readthedocs.io/en/latest/tutorials/sklearn_crfsuite.html
* NYT Ingredients Parser: https://github.com/nytimes/ingredient-phrase-tagger
* https://github.com/kulsoom-abdullah/kulsoom-abdullah.github.io/tree/master/AWS-lambda-implementation
* End to End Recipe Cuisine Classification: https://towardsdatascience.com/https-towardsdatascience-com-end-to-end-recipe-cuisine-classification-e97f4ac22104
* Performing Sequence Labelling using CRF in Python: http://www.albertauyeung.com/post/python-sequence-labelling-with-crf/

In [101]:
import pandas as pd
import pickle
import nltk
import pycrfsuite
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/markishab/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [6]:
# Load the 2015 NYT ingredient snapshot CSV into a DataFrame
nyt_ingredient_trainingdata = pd.read_csv(
    filepath_or_buffer='../../data/01_raw/nyt-ingredients-snapshot-2015.csv',
)

In [8]:
# Remove the redundant 'index' column carried over from the CSV.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
nyt_ingredient_trainingdata = nyt_ingredient_trainingdata.drop(
    columns=['index'], errors='ignore'
)

In [15]:
# Preview only the first rows instead of rendering the whole frame
nyt_ingredient_trainingdata.head(10)

Unnamed: 0,input,name,qty,range_end,unit,comment
0,1 1/4 cups cooked and pureed fresh butternut s...,butternut squash,1.25,0.0,cup,"cooked and pureed fresh, or 1 10-ounce package..."
1,1 cup peeled and cooked fresh chestnuts (about...,chestnuts,1.00,0.0,cup,"peeled and cooked fresh (about 20), or 1 cup c..."
2,"1 medium-size onion, peeled and chopped",onion,1.00,0.0,,"medium-size, peeled and chopped"
3,"2 stalks celery, chopped coarse",celery,2.00,0.0,stalk,chopped coarse
4,1 1/2 tablespoons vegetable oil,vegetable oil,1.50,0.0,tablespoon,
5,,water,0.50,0.0,cup,
6,"2 tablespoons unflavored gelatin, dissolved in...",gelatin,2.00,0.0,tablespoon,"unflavored, dissolved in 1/2 cup water"
7,Salt,Salt,0.00,0.0,,
8,1 cup canned plum tomatoes with juice,plum tomatoes,1.00,0.0,cup,"canned, with juice"
9,6 cups veal or beef stock,stock,6.00,0.0,cup,veal or beef


## TUTORIAL: Performing Sequence Labelling using CRF in Python

In [None]:
from bs4 import BeautifulSoup as bs
from bs4.element import Tag
import codecs

# Read data file and parse the XML.
# The built-in open() with an explicit encoding is the documented replacement
# for codecs.open(); the file is still decoded as UTF-8 text.
with open("../../data/tutorials/reuters.xml", "r", encoding="utf-8") as infile:
    soup = bs(infile, "html5lib")

In [38]:
# Build one list of (word, label) pairs per <document> element in the soup.
docs = []
for elem in soup.find_all("document"):
    # Element whose children carry the text split into entity / non-entity spans
    container = elem.find("textwithnamedentities")
    if container is None:
        # find() returns None when the child is missing; skip this document
        # instead of failing on `.children`
        continue

    texts = []

    # Loop through each child of the element under "textwithnamedentities"
    for c in container.children:
        # Only Tag nodes are labelled; other node types are ignored
        if not isinstance(c, Tag):
            continue

        # "N": part of a named entity, "I": irrelevant word
        label = "N" if c.name == "namedentityintext" else "I"

        # Splitting on a single space can produce empty strings; drop them
        texts.extend((w, label) for w in c.text.split(" ") if len(w) > 0)
    docs.append(texts)

In [58]:
# Attach a POS tag to every (word, label) pair, giving (word, pos, label) triples.
data = []
for doc in docs:

    # Obtain the list of tokens in the document
    tokens = [token for token, _ in doc]

    # Perform POS tagging; only the tag of each (token, tag) pair is needed
    pos_tags = [pos for _, pos in nltk.pos_tag(tokens)]

    # Take the word, POS tag, and its label
    data.append([(token, pos, label) for (token, label), pos in zip(doc, pos_tags)])

In [60]:
def word2features(doc, i):
    """Build the list of feature strings for the token at position ``i``.

    Each entry of ``doc`` is read as ``entry[0]`` (the word) and ``entry[1]``
    (its POS tag); any further fields are not used here.

    The result starts with features of the token itself, followed by
    features of the previous token (or ``'BOS'`` when ``i`` is not greater
    than 0) and then features of the next token (or ``'EOS'`` when ``i`` is
    the last position).
    """
    def context_features(offset, entry):
        # Features of a neighbouring token; `offset` is the '-1' / '+1' prefix
        token, tag = entry[0], entry[1]
        return [
            offset + ':word.lower=' + token.lower(),
            offset + ':word.istitle=' + str(token.istitle()),
            offset + ':word.isupper=' + str(token.isupper()),
            offset + ':word.isdigit=' + str(token.isdigit()),
            offset + ':postag=' + tag,
        ]

    word, postag = doc[i][0], doc[i][1]

    # Features shared by every token regardless of position
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=' + str(word.isupper()),
        'word.istitle=' + str(word.istitle()),
        'word.isdigit=' + str(word.isdigit()),
        'postag=' + postag,
    ]

    # Left context, or the beginning-of-document marker
    features += context_features('-1', doc[i - 1]) if i > 0 else ['BOS']

    # Right context, or the end-of-document marker
    features += context_features('+1', doc[i + 1]) if i < len(doc) - 1 else ['EOS']

    return features

In [64]:
from sklearn.model_selection import train_test_split

# Turn one document into its per-token feature lists
def extract_features(doc):
    """Return ``word2features(doc, i)`` for every position ``i`` in ``doc``, in order."""
    return [word2features(doc, position) for position, _ in enumerate(doc)]

# A function for generating the list of labels for each document
def get_labels(doc):
    """Return the label (third field) of every (token, postag, label) entry in ``doc``."""
    labels = []
    for _token, _postag, label in doc:
        labels.append(label)
    return labels

In [69]:
# Feature sequences and label sequences, one per document
X = [extract_features(doc) for doc in data]
y = [get_labels(doc) for doc in data]

# Hold out 20% of the documents for testing. A fixed random_state makes the
# split identical on every run, so results can be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [100]:
trainer = pycrfsuite.Trainer(verbose=True)

# Submit training data to the trainer.
# The loop body had been commented out, which left the `for` statement
# without a body and made the whole cell a syntax error.
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

# Set the parameters of the model
trainer.set_params({
    # coefficient for L1 penalty
    'c1': 0.1,

    # coefficient for L2 penalty
    'c2': 0.01,

    # maximum number of iterations
    'max_iterations': 200,

    # whether to include transitions that
    # are possible, but not observed
    'feature.possible_transitions': True
})

# Provide a file name as a parameter to the train function, such that
# the model will be saved to the file when training is finished
trainer.train('crf.model')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 14294
Seconds required: 0.063

L-BFGS optimization
c1: 0.100000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 6021.786133
Feature norm: 1.000000
Error norm: 6615.960014
Active features: 13838
Line search trials: 1
Line search step: 0.000039
Seconds required for this iteration: 0.007

***** Iteration #2 *****
Loss: 4859.547071
Feature norm: 0.846662
Error norm: 5941.492967
Active features: 13928
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.006

***** Iteration #3 *****
Loss: 4317.979820
Feature norm: 0.820325
Error norm: 13350.767322
Active features: 9302
Line search trials: 2
Line search step: 0.500000
Seconds required for this 

***** Iteration #61 *****
Loss: 234.325394
Feature norm: 45.302880
Error norm: 7.256259
Active features: 2032
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.005

***** Iteration #62 *****
Loss: 234.103575
Feature norm: 45.335504
Error norm: 6.221479
Active features: 2028
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.006

***** Iteration #63 *****
Loss: 233.920586
Feature norm: 45.384918
Error norm: 14.159731
Active features: 2027
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.005

***** Iteration #64 *****
Loss: 233.729340
Feature norm: 45.443783
Error norm: 8.514893
Active features: 2005
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.005

***** Iteration #65 *****
Loss: 233.593689
Feature norm: 45.473202
Error norm: 8.356149
Active features: 2000
Line search trials: 1
Line search step: 1.000000
Seconds required for this it

***** Iteration #124 *****
Loss: 230.272554
Feature norm: 46.237504
Error norm: 4.598552
Active features: 1819
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.007

***** Iteration #125 *****
Loss: 230.260749
Feature norm: 46.249288
Error norm: 8.603117
Active features: 1818
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.004

***** Iteration #126 *****
Loss: 230.243063
Feature norm: 46.264058
Error norm: 11.605929
Active features: 1820
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.004

***** Iteration #127 *****
Loss: 230.210452
Feature norm: 46.274662
Error norm: 6.766061
Active features: 1819
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.004

***** Iteration #128 *****
Loss: 230.202156
Feature norm: 46.288793
Error norm: 11.676285
Active features: 1818
Line search trials: 1
Line search step: 1.000000
Seconds required for t

***** Iteration #175 *****
Loss: 229.449639
Feature norm: 46.396921
Error norm: 6.826676
Active features: 1781
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.006

***** Iteration #176 *****
Loss: 229.439286
Feature norm: 46.403702
Error norm: 8.184476
Active features: 1777
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.005

***** Iteration #177 *****
Loss: 229.412449
Feature norm: 46.400678
Error norm: 3.418292
Active features: 1773
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.005

***** Iteration #178 *****
Loss: 229.410361
Feature norm: 46.397086
Error norm: 7.729357
Active features: 1774
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.005

***** Iteration #179 *****
Loss: 229.389002
Feature norm: 46.392769
Error norm: 4.015936
Active features: 1772
Line search trials: 1
Line search step: 1.000000
Seconds required for thi