# Feature Engineering - Simply Recipes Ingredients for CRF

In [1]:
import argparse
import nltk
import pandas as pd
import pycrfsuite
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer

nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/markishab/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## NYT CRF MODEL

In [2]:
nyt_ing = pd.read_csv('../../data/01_raw/nyt-ingredients-snapshot-2015.csv')
nyt_ing.drop(columns=['index'], inplace=True)

In [3]:
nyt_ing.head()

Unnamed: 0,input,name,qty,range_end,unit,comment
0,1 1/4 cups cooked and pureed fresh butternut s...,butternut squash,1.25,0.0,cup,"cooked and pureed fresh, or 1 10-ounce package..."
1,1 cup peeled and cooked fresh chestnuts (about...,chestnuts,1.0,0.0,cup,"peeled and cooked fresh (about 20), or 1 cup c..."
2,"1 medium-size onion, peeled and chopped",onion,1.0,0.0,,"medium-size, peeled and chopped"
3,"2 stalks celery, chopped coarse",celery,2.0,0.0,stalk,chopped coarse
4,1 1/2 tablespoons vegetable oil,vegetable oil,1.5,0.0,tablespoon,


In [4]:
nyt_ing.fillna("missing", inplace=True)

Let's make ingredients list into tokens

Update fractions so that they are not seperated when the words are tokenized

In [5]:
ingredients_list = list(nyt_ing.input)
ingredients_list_new = []
for ingredient in ingredients_list:
    try: 
        ing_update = re.sub(r'(\d+)\s+(\d)/(\d)', r'\1$\2/\3', ingredient)
        ingredients_list_new.append(ing_update.split(" "))
    except:
        ingredients_list_new.append(np.nan)

Let's build a dictionary of the nyt dataset

In [6]:
# input_list = list(nyt_ing.input)
# name_list = list(nyt_ing.name)
# qty_list = list(nyt_ing.qty)
# unit_list = list(nyt_ing.unit)
# comment_list = list(nyt_ing.comment)

In [7]:
# Let's add the dollar sign to the input list
input_list_new = []
for i in ingredients_list:
    try:
        input_list_new.append(re.sub(r'(\d+)\s+(\d)/(\d)', r'\1$\2/\3', inp))
    except:
        input_list_new.append(np.nan)

In [8]:
def singularize(word):
    """
    A poor replacement for the pattern.en singularize function, but ok for now.
    """

    units = {
        "cups": u"cup",
        "tablespoons": u"tablespoon",
        "teaspoons": u"teaspoon",
        "pounds": u"pound",
        "ounces": u"ounce",
        "cloves": u"clove",
        "sprigs": u"sprig",
        "pinches": u"pinch",
        "bunches": u"bunch",
        "slices": u"slice",
        "grams": u"gram",
        "heads": u"head",
        "quarts": u"quart",
        "stalks": u"stalk",
        "pints": u"pint",
        "pieces": u"piece",
        "sticks": u"stick",
        "dashes": u"dash",
        "fillets": u"fillet",
        "cans": u"can",
        "ears": u"ear",
        "packages": u"package",
        "strips": u"strip",
        "bulbs": u"bulb",
        "bottles": u"bottle"
    }

    if word in units.keys():
        return units[word]
    else:
        return word

I can't make the name, quantity and comment match up with the right word. let's reevaluation what I'm doing. Do I really need to do this this way? 

### Put data in correct format

In [9]:
nyt_ing_tuple = []
for index, row in nyt_ing.iterrows():
    name = row[1]
    qty = row[2]
    unit = row[4]
    comment = row[5]
    nyt_ing_tuple.append([(qty, 'qty'), (unit, 'unit'), (name, 'name'), (comment, 'comment')])

Let's break up the tuples with multiple words. 

In [10]:
nyt_ing_tuple_new = []
for sub_list in nyt_ing_tuple: 
    sub_ls = []
    for elem in sub_list:
        if " " in str(elem[0]):
            elem2 = elem[0].split(" ")
            for el in elem2:
                sub_ls.append((el, elem[1]))
        else: 
            sub_ls.append((elem[0], elem[1]))
    nyt_ing_tuple_new.append(sub_ls)

Let's erase the tuples with missing as a value

In [11]:
for idx_big, ingredient in enumerate(nyt_ing_tuple_new):
    for idx, word in enumerate(ingredient):
        if word[0] == 'missing':
            nyt_ing_tuple_new[idx_big].remove(word)
        if word[0] == '':
            nyt_ing_tuple_new[idx_big].remove(word)

Strip all punctuation from the tuple

In [12]:
crf_data = []
for sub_list in nyt_ing_tuple_new:
    sublist = []
    for word in sub_list:
        word2 = str(word[0]).strip(')!,.?(')
        sublist.append((word2, word[1]))
    crf_data.append(sublist)

In [13]:
count = 1
while count <= 2:
    count = count + 1
    for idx_big, ingredient in enumerate(crf_data):
        for idx, word in enumerate(ingredient):
            if word[0] == 'missing':
                crf_data[idx_big].remove(word)
            if word[0] == '':
                crf_data[idx_big].remove(word)

GREAT, we've got the data in the way that we want. let's construct some features

### Construct Features

### Parts of speech tags

In [14]:
data_nyt = []
for i, doc in enumerate(crf_data):

    # Obtain the list of tokens in the document
    tokens = [t for t, label in doc]
    
    # Perform POS tagging
    try:
        tagged = nltk.pos_tag(tokens)
    except:
        tagged = 'missing'

    # Take the word, POS tag, and its label
    data_nyt.append([(w, pos, label) for (w, label), (word, pos) in zip(doc, tagged)])

In [15]:
def word2features(doc, i):
    word = doc[i][0]
    postag = doc[i][1]

    # Common features for all words
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'postag=' + postag
    ]

    # Features for words that are not
    # at the beginning of a document
    if i > 0:
        word1 = doc[i-1][0]
        postag1 = doc[i-1][1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:postag=' + postag1
        ])
    else:
        # Indicate that it is the 'beginning of a document'
        features.append('BOS')

    # Features for words that are not
    # at the end of a document
    if i < len(doc)-1:
        word1 = doc[i+1][0]
        postag1 = doc[i+1][1]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:postag=' + postag1
        ])
    else:
        # Indicate that it is the 'end of a document'
        features.append('EOS')

    return features

In [16]:
# A function for extracting features in documents
def extract_features(doc):
    return [word2features(doc, i) for i in range(len(doc))]

# A function fo generating the list of labels for each document
def get_labels(doc):
    return [label for (token, postag, label) in doc]

In [17]:
X = [extract_features(doc) for doc in data_nyt]
y = [get_labels(doc) for doc in data_nyt]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [37]:
X_test[0]

[['bias',
  'word.lower=8.0',
  'word[-3:]=8.0',
  'word[-2:]=.0',
  'postag=CD',
  'BOS',
  '+1:word.lower=cup',
  '+1:postag=JJ'],
 ['bias',
  'word.lower=cup',
  'word[-3:]=cup',
  'word[-2:]=up',
  'postag=JJ',
  '-1:word.lower=8.0',
  '-1:postag=CD',
  '+1:word.lower=escarole',
  '+1:postag=NN'],
 ['bias',
  'word.lower=escarole',
  'word[-3:]=ole',
  'word[-2:]=le',
  'postag=NN',
  '-1:word.lower=cup',
  '-1:postag=JJ',
  '+1:word.lower=chopped',
  '+1:postag=VBD'],
 ['bias',
  'word.lower=chopped',
  'word[-3:]=ped',
  'word[-2:]=ed',
  'postag=VBD',
  '-1:word.lower=escarole',
  '-1:postag=NN',
  'EOS']]

In [19]:
trainer = pycrfsuite.Trainer(verbose=True)

# Submit training data to the trainer
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

# Set the parameters of the model
trainer.set_params({
    # coefficient for L1 penalty
    'c1': 0.1,

    # coefficient for L2 penalty
    'c2': 0.01,  

    # maximum number of iterations
    'max_iterations': 200,

    # whether to include transitions that
    # are possible, but not observed
    'feature.possible_transitions': True
})

# Provide a file name as a parameter to the train function, such that
# the model will be saved to the file when training is finished
trainer.train('crf_ingredients.model')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 40498
Seconds required: 1.040

L-BFGS optimization
c1: 0.100000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 727732.520528
Feature norm: 1.000000
Error norm: 293664.661254
Active features: 39785
Line search trials: 1
Line search step: 0.000003
Seconds required for this iteration: 0.393

***** Iteration #2 *****
Loss: 246534.335772
Feature norm: 4.938939
Error norm: 150864.381943
Active features: 38434
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.195

***** Iteration #3 *****
Loss: 226339.012178
Feature norm: 5.428840
Error norm: 183778.578257
Active features: 39352
Line search trials: 2
Line search step: 0.500000
Seconds requir

***** Iteration #40 *****
Loss: 28322.032080
Feature norm: 72.532616
Error norm: 4835.322198
Active features: 25899
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.193

***** Iteration #41 *****
Loss: 27717.778364
Feature norm: 76.372170
Error norm: 3739.065933
Active features: 25745
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.197

***** Iteration #42 *****
Loss: 27123.852488
Feature norm: 81.030825
Error norm: 4826.074916
Active features: 25311
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.195

***** Iteration #43 *****
Loss: 26634.997621
Feature norm: 85.260314
Error norm: 3253.155188
Active features: 25435
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.196

***** Iteration #44 *****
Loss: 26128.436360
Feature norm: 90.319424
Error norm: 3667.959159
Active features: 25090
Line search trials: 1
Line search step: 1.000000

***** Iteration #81 *****
Loss: 21399.441928
Feature norm: 162.867326
Error norm: 1171.363282
Active features: 20511
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.204

***** Iteration #82 *****
Loss: 21376.605372
Feature norm: 163.145867
Error norm: 1118.669592
Active features: 20497
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.206

***** Iteration #83 *****
Loss: 21346.604534
Feature norm: 163.498964
Error norm: 941.640030
Active features: 20436
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.199

***** Iteration #84 *****
Loss: 21325.932884
Feature norm: 164.001230
Error norm: 2941.017465
Active features: 20356
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.202

***** Iteration #85 *****
Loss: 21298.797327
Feature norm: 164.326829
Error norm: 1295.546046
Active features: 20398
Line search trials: 1
Line search step: 1.00

***** Iteration #121 *****
Loss: 20889.147125
Feature norm: 171.979874
Error norm: 1265.695028
Active features: 19922
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.198

***** Iteration #122 *****
Loss: 20882.808510
Feature norm: 172.130069
Error norm: 1484.763196
Active features: 19916
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.197

***** Iteration #123 *****
Loss: 20876.312899
Feature norm: 172.275271
Error norm: 1233.565612
Active features: 19896
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.199

***** Iteration #124 *****
Loss: 20870.453093
Feature norm: 172.425457
Error norm: 1544.331721
Active features: 19898
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.201

***** Iteration #125 *****
Loss: 20863.586567
Feature norm: 172.575326
Error norm: 1273.079790
Active features: 19882
Line search trials: 1
Line search step

***** Iteration #161 *****
Loss: 20697.989337
Feature norm: 176.814846
Error norm: 907.620604
Active features: 19683
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.203

***** Iteration #162 *****
Loss: 20694.919054
Feature norm: 176.912033
Error norm: 1158.878689
Active features: 19679
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.204

***** Iteration #163 *****
Loss: 20690.676137
Feature norm: 176.994977
Error norm: 895.989227
Active features: 19678
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.200

***** Iteration #164 *****
Loss: 20688.000011
Feature norm: 177.094429
Error norm: 1214.775742
Active features: 19671
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.195

***** Iteration #165 *****
Loss: 20683.395601
Feature norm: 177.175391
Error norm: 877.238204
Active features: 19687
Line search trials: 1
Line search step: 1

Storing the model
Number of active features: 19548 (40498)
Number of active attributes: 11303 (26803)
Number of active labels: 4 (4)
Writing labels
Writing attributes
Writing feature references for transitions
Writing feature references for attributes
Seconds required: 0.047



In [20]:
tagger = pycrfsuite.Tagger()
tagger.open('../../data/04_models/crf_ingredients.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]

In [38]:
y_pred[0]

['qty', 'unit', 'name', 'comment']

In [22]:
mlb = MultiLabelBinarizer()

In [23]:
print(classification_report(y_pred=mlb.fit_transform(y_pred), y_true=mlb.fit_transform(y_test)))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97     19749
           1       1.00      1.00      1.00     35747
           2       1.00      1.00      1.00     35842
           3       1.00      1.00      1.00     24504

   micro avg       0.99      1.00      0.99    115842
   macro avg       0.99      1.00      0.99    115842
weighted avg       0.99      1.00      0.99    115842
 samples avg       0.99      1.00      0.99    115842



Our model is almost 100% accurate. That's good enough. Let's train all of our data on the model and move onto using it for our ingredients. 

In [24]:
trainer = pycrfsuite.Trainer(verbose=True)

# Submit training data to the trainer
for xseq, yseq in zip(X, y):
    trainer.append(xseq, yseq)

# Set the parameters of the model
trainer.set_params({
    # coefficient for L1 penalty
    'c1': 0.1,

    # coefficient for L2 penalty
    'c2': 0.01,  

    # maximum number of iterations
    'max_iterations': 200,

    # whether to include transitions that
    # are possible, but not observed
    'feature.possible_transitions': True
})

# Provide a file name as a parameter to the train function, such that
# the model will be saved to the file when training is finished
trainer.train('../../data/04_models/crf_ing_final.model')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 44319
Seconds required: 1.165

L-BFGS optimization
c1: 0.100000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 910350.519284
Feature norm: 1.000000
Error norm: 367077.603587
Active features: 43524
Line search trials: 1
Line search step: 0.000002
Seconds required for this iteration: 0.507

***** Iteration #2 *****
Loss: 309213.712708
Feature norm: 4.927604
Error norm: 190852.363998
Active features: 42108
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.254

***** Iteration #3 *****
Loss: 284765.219526
Feature norm: 5.423585
Error norm: 233315.887163
Active features: 43052
Line search trials: 2
Line search step: 0.500000
Seconds requir

***** Iteration #39 *****
Loss: 37786.514836
Feature norm: 65.202423
Error norm: 5453.792038
Active features: 28572
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.260

***** Iteration #40 *****
Loss: 36818.985175
Feature norm: 69.298507
Error norm: 8204.589851
Active features: 28062
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.259

***** Iteration #41 *****
Loss: 35978.552144
Feature norm: 72.612791
Error norm: 4076.994352
Active features: 28111
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.262

***** Iteration #42 *****
Loss: 35103.332195
Feature norm: 77.482222
Error norm: 6065.924873
Active features: 27836
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.273

***** Iteration #43 *****
Loss: 34392.475592
Feature norm: 81.658283
Error norm: 4946.686239
Active features: 27715
Line search trials: 1
Line search step: 1.000000

***** Iteration #80 *****
Loss: 27098.014412
Feature norm: 168.903415
Error norm: 2261.248834
Active features: 22858
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.255

***** Iteration #81 *****
Loss: 27064.967961
Feature norm: 169.309394
Error norm: 2240.513240
Active features: 22799
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.259

***** Iteration #82 *****
Loss: 27032.577399
Feature norm: 169.687251
Error norm: 1549.822734
Active features: 22760
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.264

***** Iteration #83 *****
Loss: 26999.550502
Feature norm: 170.081491
Error norm: 1795.400814
Active features: 22769
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.256

***** Iteration #84 *****
Loss: 26963.550269
Feature norm: 170.424038
Error norm: 1535.183615
Active features: 22655
Line search trials: 1
Line search step: 1.0

***** Iteration #120 *****
Loss: 26379.665711
Feature norm: 180.232982
Error norm: 2373.311105
Active features: 21789
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.254

***** Iteration #121 *****
Loss: 26369.496765
Feature norm: 180.420715
Error norm: 1430.201421
Active features: 21812
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.260

***** Iteration #122 *****
Loss: 26361.044306
Feature norm: 180.572721
Error norm: 1890.455939
Active features: 21815
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.254

***** Iteration #123 *****
Loss: 26352.618687
Feature norm: 180.734216
Error norm: 1095.335530
Active features: 21819
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.253

***** Iteration #124 *****
Loss: 26345.891858
Feature norm: 180.894527
Error norm: 2064.612072
Active features: 21801
Line search trials: 1
Line search step

***** Iteration #159 *****
Loss: 26129.413849
Feature norm: 185.990495
Error norm: 950.390088
Active features: 21706
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.271

***** Iteration #160 *****
Loss: 26127.061036
Feature norm: 186.092808
Error norm: 1712.519746
Active features: 21708
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.264

***** Iteration #161 *****
Loss: 26119.982227
Feature norm: 186.186700
Error norm: 1079.381026
Active features: 21705
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.261

***** Iteration #162 *****
Loss: 26118.677486
Feature norm: 186.305181
Error norm: 2028.301256
Active features: 21691
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.270

***** Iteration #163 *****
Loss: 26110.250661
Feature norm: 186.403554
Error norm: 1065.337233
Active features: 21685
Line search trials: 1
Line search step:

***** Iteration #199 *****
Loss: 25968.203126
Feature norm: 190.015535
Error norm: 919.736652
Active features: 21530
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.268

***** Iteration #200 *****
Loss: 25966.056387
Feature norm: 190.082711
Error norm: 1392.551111
Active features: 21522
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.277

L-BFGS terminated with the maximum number of iterations
Total seconds required for training: 52.705

Storing the model
Number of active features: 21522 (44319)
Number of active attributes: 12420 (29314)
Number of active labels: 4 (4)
Writing labels
Writing attributes
Writing feature references for transitions
Writing feature references for attributes
Seconds required: 0.053



---

---

## CRF TUTORIAL: Performing Sequence Labelling using CRF in Python

In [25]:
# from bs4 import BeautifulSoup as bs
# from bs4.element import Tag
# import codecs

# # Read data file and parse the XML
# with codecs.open("../../data/tutorials/reuters.xml", "r", "utf-8") as infile:
#     soup = bs(infile, "html5lib")

### Put Data in form that CRF likes it

In [26]:
# docs = []
# for elem in soup.find_all("document"):
#     texts = []

#     # Loop through each child of the element under "textwithnamedentities"
#     for c in elem.find("textwithnamedentities").children:
#         if type(c) == Tag:
#             if c.name == "namedentityintext":
#                 label = "N"  # part of a named entity
#             else:
#                 label = "I"  # irrelevant word
#             for w in c.text.split(" "):
#                 if len(w) > 0:
#                     texts.append((w, label))
#     docs.append(texts)

In [27]:
# data = []
# for i, doc in enumerate(docs):

#     # Obtain the list of tokens in the document
#     tokens = [t for t, label in doc]
    
#     # Perform POS tagging
#     tagged = nltk.pos_tag(tokens)

#     # Take the word, POS tag, and its label
#     data.append([(w, pos, label) for (w, label), (word, pos) in zip(doc, tagged)])

### Construct Features

In [28]:
# def word2features(doc, i):
#     word = doc[i][0]
#     postag = doc[i][1]

#     # Common features for all words
#     features = [
#         'bias',
#         'word.lower=' + word.lower(),
#         'word[-3:]=' + word[-3:],
#         'word[-2:]=' + word[-2:],
#         'word.isupper=%s' % word.isupper(),
#         'word.istitle=%s' % word.istitle(),
#         'word.isdigit=%s' % word.isdigit(),
#         'postag=' + postag
#     ]

#     # Features for words that are not
#     # at the beginning of a document
#     if i > 0:
#         word1 = doc[i-1][0]
#         postag1 = doc[i-1][1]
#         features.extend([
#             '-1:word.lower=' + word1.lower(),
#             '-1:word.istitle=%s' % word1.istitle(),
#             '-1:word.isupper=%s' % word1.isupper(),
#             '-1:word.isdigit=%s' % word1.isdigit(),
#             '-1:postag=' + postag1
#         ])
#     else:
#         # Indicate that it is the 'beginning of a document'
#         features.append('BOS')

#     # Features for words that are not
#     # at the end of a document
#     if i < len(doc)-1:
#         word1 = doc[i+1][0]
#         postag1 = doc[i+1][1]
#         features.extend([
#             '+1:word.lower=' + word1.lower(),
#             '+1:word.istitle=%s' % word1.istitle(),
#             '+1:word.isupper=%s' % word1.isupper(),
#             '+1:word.isdigit=%s' % word1.isdigit(),
#             '+1:postag=' + postag1
#         ])
#     else:
#         # Indicate that it is the 'end of a document'
#         features.append('EOS')

#     return features

In [29]:
# # A function for extracting features in documents
# def extract_features(doc):
#     return [word2features(doc, i) for i in range(len(doc))]

# # A function fo generating the list of labels for each document
# def get_labels(doc):
#     return [label for (token, postag, label) in doc]

In [30]:
# X = [extract_features(doc) for doc in data]
# y = [get_labels(doc) for doc in data]

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [31]:
# trainer = pycrfsuite.Trainer(verbose=True)

# # Submit training data to the trainer
# for xseq, yseq in zip(X_train, y_train):
#     trainer.append(xseq, yseq)

# # Set the parameters of the model
# trainer.set_params({
#     # coefficient for L1 penalty
#     'c1': 0.1,

#     # coefficient for L2 penalty
#     'c2': 0.01,  

#     # maximum number of iterations
#     'max_iterations': 200,

#     # whether to include transitions that
#     # are possible, but not observed
#     'feature.possible_transitions': True
# })

# # Provide a file name as a parameter to the train function, such that
# # the model will be saved to the file when training is finished
# trainer.train('crf.model')

In [32]:
# tagger = pycrfsuite.Tagger()
# tagger.open('crf.model')
# y_pred = [tagger.tag(xseq) for xseq in X_test]

# # Let's take a look at a random sample in the testing set
# i = 12
# for x, y in zip(y_pred[i], [x[1].split("=")[1] for x in X_test[i]]):
#     print("%s (%s)" % (y, x))

**SOURCE:**
* <font color='red'>Performing Sequence Labelling using CRF in Python</font>
* https://eli5.readthedocs.io/en/latest/tutorials/sklearn_crfsuite.html
* https://www.nltk.org/api/nltk.tag.html#module-nltk.tag.crf
* http://www.nltk.org/book/ch00.html
* https://python-crfsuite.readthedocs.io/en/latest/
* https://open.nytimes.com/
* CRF Suite Tutorial: http://www.chokkan.org/software/crfsuite/tutorial.html
* sklearn_crfsuite tutorial: https://eli5.readthedocs.io/en/latest/tutorials/sklearn_crfsuite.html
* NYT Ingredients Parser: https://github.com/nytimes/ingredient-phrase-tagger
* https://github.com/kulsoom-abdullah/kulsoom-abdullah.github.io/tree/master/AWS-lambda-implementation
* End to End Recipe Cuisine Classification: https://towardsdatascience.com/https-towardsdatascience-com-end-to-end-recipe-cuisine-classification-e97f4ac22104
* Performing Sequence Labelling using CRF in Python: http://www.albertauyeung.com/post/python-sequence-labelling-with-crf/

# LET'S MAKE A FUNCTION