# Feature Engineering - Instacart Products Ingredients for CRF

In [32]:
import argparse
import nltk
import pandas as pd
import pycrfsuite
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer

nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/markishab/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [33]:
instacart_prod_train = pd.read_csv('../../data/02_intermediate/instacart_product_train.csv')

In [34]:
instacart_prod_train.head()

Unnamed: 0,products,pre_description,food,post_description
0,Organic Egg Whites,Organic,Egg Whites,
1,Michigan Organic Kale,Michigan Organic,Kale,
2,Garlic Powder,,Garlic Powder,
3,Coconut Butter,,Coconut Butter,
4,Natural Sweetener,,natural sweetener,


In [35]:
instacart_prod_train.fillna("missing", inplace=True)

### Put data in correct format

In [36]:
insta_product_tuple = []
for index, row in instacart_prod_train.iterrows():
    pre_description = row[1].lower()
    food = row[2].lower()
    post_description = row[3].lower()
    insta_product_tuple.append([(pre_description, 'pre'), (food, 'food'), (post_description, 'post')])

In [37]:
insta_product_tuple

[[('organic', 'pre'), ('egg whites', 'food'), ('missing', 'post')],
 [('michigan organic', 'pre'), ('kale', 'food'), ('missing', 'post')],
 [('missing', 'pre'), ('garlic powder', 'food'), ('missing', 'post')],
 [('missing', 'pre'), ('coconut butter', 'food'), ('missing', 'post')],
 [('missing', 'pre'), ('natural sweetener', 'food'), ('missing', 'post')],
 [('missing', 'pre'), ('carrots', 'food'), ('missing', 'post')],
 [('original unflavored', 'pre'),
  ('gelatine mix', 'food'),
  ('missing', 'post')],
 [('all natural no stir creamy', 'pre'),
  ('almond butter', 'food'),
  ('missing', 'post')],
 [('classic blend', 'pre'), ('cole slaw', 'food'), ('missing', 'post')],
 [('total 2% with strawberry lowfat greek strained', 'pre'),
  ('yogurt', 'food'),
  ('missing', 'post')],
 [('unsweetened', 'pre'), ('almondmilk', 'food'), ('missing', 'post')],
 [('missing', 'pre'), ('lemons', 'food'), ('missing', 'post')],
 [('organic baby', 'pre'), ('spinach', 'food'), ('missing', 'post')],
 [('unsweete

Let's break up the tuples with multiple words. 

In [38]:
insta_product_tuple_new = []
for sub_list in insta_product_tuple: 
    sub_ls = []
    for elem in sub_list:
        if " " in str(elem[0]):
            elem2 = elem[0].split(" ")
            for el in elem2:
                sub_ls.append((el, elem[1]))
        else: 
            sub_ls.append((elem[0], elem[1]))
    insta_product_tuple_new.append(sub_ls)

In [39]:
insta_product_tuple_new

[[('organic', 'pre'),
  ('egg', 'food'),
  ('whites', 'food'),
  ('missing', 'post')],
 [('michigan', 'pre'),
  ('organic', 'pre'),
  ('kale', 'food'),
  ('missing', 'post')],
 [('missing', 'pre'),
  ('garlic', 'food'),
  ('powder', 'food'),
  ('missing', 'post')],
 [('missing', 'pre'),
  ('coconut', 'food'),
  ('butter', 'food'),
  ('missing', 'post')],
 [('missing', 'pre'),
  ('natural', 'food'),
  ('sweetener', 'food'),
  ('missing', 'post')],
 [('missing', 'pre'), ('carrots', 'food'), ('missing', 'post')],
 [('original', 'pre'),
  ('unflavored', 'pre'),
  ('gelatine', 'food'),
  ('mix', 'food'),
  ('missing', 'post')],
 [('all', 'pre'),
  ('natural', 'pre'),
  ('no', 'pre'),
  ('stir', 'pre'),
  ('creamy', 'pre'),
  ('almond', 'food'),
  ('butter', 'food'),
  ('missing', 'post')],
 [('classic', 'pre'),
  ('blend', 'pre'),
  ('cole', 'food'),
  ('slaw', 'food'),
  ('missing', 'post')],
 [('total', 'pre'),
  ('2%', 'pre'),
  ('with', 'pre'),
  ('strawberry', 'pre'),
  ('lowfat', 'pre

Erase the tuples with missing as a value

In [40]:
for idx_big, product in enumerate(insta_product_tuple_new):
    for idx, word in enumerate(product):
        if word[0] == 'missing':
            insta_product_tuple_new[idx_big].remove(word)
        if word[0] == '':
            insta_product_tuple_new[idx_big].remove(word)

In [41]:
insta_product_tuple_new

[[('organic', 'pre'), ('egg', 'food'), ('whites', 'food')],
 [('michigan', 'pre'), ('organic', 'pre'), ('kale', 'food')],
 [('garlic', 'food'), ('powder', 'food')],
 [('coconut', 'food'), ('butter', 'food')],
 [('natural', 'food'), ('sweetener', 'food')],
 [('carrots', 'food')],
 [('original', 'pre'),
  ('unflavored', 'pre'),
  ('gelatine', 'food'),
  ('mix', 'food')],
 [('all', 'pre'),
  ('natural', 'pre'),
  ('no', 'pre'),
  ('stir', 'pre'),
  ('creamy', 'pre'),
  ('almond', 'food'),
  ('butter', 'food')],
 [('classic', 'pre'), ('blend', 'pre'), ('cole', 'food'), ('slaw', 'food')],
 [('total', 'pre'),
  ('2%', 'pre'),
  ('with', 'pre'),
  ('strawberry', 'pre'),
  ('lowfat', 'pre'),
  ('greek', 'pre'),
  ('strained', 'pre'),
  ('yogurt', 'food')],
 [('unsweetened', 'pre'), ('almondmilk', 'food')],
 [('lemons', 'food')],
 [('organic', 'pre'), ('baby', 'pre'), ('spinach', 'food')],
 [('unsweetened', 'pre'),
  ('chocolate', 'pre'),
  ('almond', 'food'),
  ('milk', 'food')],
 [('organic',

In [42]:
crf_data = []
for sub_list in insta_product_tuple_new:
    sublist = []
    for word in sub_list:
        word2 = str(word[0]).strip(')!,.?(')
        sublist.append((word2, word[1]))
    crf_data.append(sublist)

In [43]:
crf_data

[[('organic', 'pre'), ('egg', 'food'), ('whites', 'food')],
 [('michigan', 'pre'), ('organic', 'pre'), ('kale', 'food')],
 [('garlic', 'food'), ('powder', 'food')],
 [('coconut', 'food'), ('butter', 'food')],
 [('natural', 'food'), ('sweetener', 'food')],
 [('carrots', 'food')],
 [('original', 'pre'),
  ('unflavored', 'pre'),
  ('gelatine', 'food'),
  ('mix', 'food')],
 [('all', 'pre'),
  ('natural', 'pre'),
  ('no', 'pre'),
  ('stir', 'pre'),
  ('creamy', 'pre'),
  ('almond', 'food'),
  ('butter', 'food')],
 [('classic', 'pre'), ('blend', 'pre'), ('cole', 'food'), ('slaw', 'food')],
 [('total', 'pre'),
  ('2%', 'pre'),
  ('with', 'pre'),
  ('strawberry', 'pre'),
  ('lowfat', 'pre'),
  ('greek', 'pre'),
  ('strained', 'pre'),
  ('yogurt', 'food')],
 [('unsweetened', 'pre'), ('almondmilk', 'food')],
 [('lemons', 'food')],
 [('organic', 'pre'), ('baby', 'pre'), ('spinach', 'food')],
 [('unsweetened', 'pre'),
  ('chocolate', 'pre'),
  ('almond', 'food'),
  ('milk', 'food')],
 [('organic',

In [44]:
count = 1
while count <= 2:
    count = count + 1
    for idx_big, ingredient in enumerate(crf_data):
        for idx, word in enumerate(ingredient):
            if word[0] == 'missing':
                crf_data[idx_big].remove(word)
            if word[0] == '':
                crf_data[idx_big].remove(word)

### Construct Features

### Parts of speech tags

In [45]:
data_instacart = []
for i, doc in enumerate(crf_data):

    # Obtain the list of tokens in the document
    tokens = [t for t, label in doc]
    
    # Perform POS tagging
    try:
        tagged = nltk.pos_tag(tokens)
    except:
        tagged = 'missing'

    # Take the word, POS tag, and its label
    data_instacart.append([(w, pos, label) for (w, label), (word, pos) in zip(doc, tagged)])

In [46]:
data_instacart

[[('organic', 'JJ', 'pre'), ('egg', 'NN', 'food'), ('whites', 'NNS', 'food')],
 [('michigan', 'JJ', 'pre'), ('organic', 'JJ', 'pre'), ('kale', 'NN', 'food')],
 [('garlic', 'JJ', 'food'), ('powder', 'NN', 'food')],
 [('coconut', 'NN', 'food'), ('butter', 'NN', 'food')],
 [('natural', 'JJ', 'food'), ('sweetener', 'NN', 'food')],
 [('carrots', 'NNS', 'food')],
 [('original', 'JJ', 'pre'),
  ('unflavored', 'JJ', 'pre'),
  ('gelatine', 'NN', 'food'),
  ('mix', 'NN', 'food')],
 [('all', 'DT', 'pre'),
  ('natural', 'JJ', 'pre'),
  ('no', 'DT', 'pre'),
  ('stir', 'NN', 'pre'),
  ('creamy', 'VBZ', 'pre'),
  ('almond', 'NN', 'food'),
  ('butter', 'NN', 'food')],
 [('classic', 'JJ', 'pre'),
  ('blend', 'NN', 'pre'),
  ('cole', 'NN', 'food'),
  ('slaw', 'NN', 'food')],
 [('total', 'JJ', 'pre'),
  ('2%', 'CD', 'pre'),
  ('with', 'IN', 'pre'),
  ('strawberry', 'JJ', 'pre'),
  ('lowfat', 'JJ', 'pre'),
  ('greek', 'NN', 'pre'),
  ('strained', 'VBD', 'pre'),
  ('yogurt', 'NN', 'food')],
 [('unsweetened

In [47]:
def word2features(doc, i):
    word = doc[i][0]
    postag = doc[i][1]

    # Common features for all words
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'postag=' + postag
    ]

    # Features for words that are not
    # at the beginning of a document
    if i > 0:
        word1 = doc[i-1][0]
        postag1 = doc[i-1][1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:postag=' + postag1
        ])
    else:
        # Indicate that it is the 'beginning of a document'
        features.append('BOS')

    # Features for words that are not
    # at the end of a document
    if i < len(doc)-1:
        word1 = doc[i+1][0]
        postag1 = doc[i+1][1]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:postag=' + postag1
        ])
    else:
        # Indicate that it is the 'end of a document'
        features.append('EOS')

    return features

In [48]:
# A function for extracting features in documents
def extract_features(doc):
    return [word2features(doc, i) for i in range(len(doc))]

# A function fo generating the list of labels for each document
def get_labels(doc):
    return [label for (token, postag, label) in doc]

In [49]:
X = [extract_features(doc) for doc in data_instacart]
y = [get_labels(doc) for doc in data_instacart]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [51]:
X_test[0]

[['bias',
  'word.lower=super',
  'word[-3:]=per',
  'word[-2:]=er',
  'postag=JJ',
  'BOS',
  '+1:word.lower=spinach',
  '+1:postag=NN'],
 ['bias',
  'word.lower=spinach',
  'word[-3:]=ach',
  'word[-2:]=ch',
  'postag=NN',
  '-1:word.lower=super',
  '-1:postag=JJ',
  '+1:word.lower=baby',
  '+1:postag=NN'],
 ['bias',
  'word.lower=baby',
  'word[-3:]=aby',
  'word[-2:]=by',
  'postag=NN',
  '-1:word.lower=spinach',
  '-1:postag=NN',
  '+1:word.lower=spinach',
  '+1:postag=NN'],
 ['bias',
  'word.lower=spinach',
  'word[-3:]=ach',
  'word[-2:]=ch',
  'postag=NN',
  '-1:word.lower=baby',
  '-1:postag=NN',
  '+1:word.lower=baby',
  '+1:postag=NN'],
 ['bias',
  'word.lower=baby',
  'word[-3:]=aby',
  'word[-2:]=by',
  'postag=NN',
  '-1:word.lower=spinach',
  '-1:postag=NN',
  '+1:word.lower=bok',
  '+1:postag=VBD'],
 ['bias',
  'word.lower=bok',
  'word[-3:]=bok',
  'word[-2:]=ok',
  'postag=VBD',
  '-1:word.lower=baby',
  '-1:postag=NN',
  '+1:word.lower=choy',
  '+1:postag=JJ'],
 ['bi

In [55]:
trainer = pycrfsuite.Trainer(verbose=True)

# Submit training data to the trainer
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

# Set the parameters of the model
trainer.set_params({
    # coefficient for L1 penalty
    'c1': 0.1,

    # coefficient for L2 penalty
    'c2': 0.01,  

    # maximum number of iterations
    'max_iterations': 200,

    # whether to include transitions that
    # are possible, but not observed
    'feature.possible_transitions': True
})

# Provide a file name as a parameter to the train function, such that
# the model will be saved to the file when training is finished
trainer.train('../../data/04_models/crf_ingredients_initial.model')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 4137
Seconds required: 0.016

L-BFGS optimization
c1: 0.100000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 2163.631199
Feature norm: 1.000000
Error norm: 867.734984
Active features: 3945
Line search trials: 1
Line search step: 0.000523
Seconds required for this iteration: 0.002

***** Iteration #2 *****
Loss: 1849.200839
Feature norm: 1.401711
Error norm: 493.891692
Active features: 3888
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.001

***** Iteration #3 *****
Loss: 1612.854220
Feature norm: 2.090225
Error norm: 389.153273
Active features: 3948
Line search trials: 1
Line search step: 1.000000
Seconds required for this iterati

In [56]:
tagger = pycrfsuite.Tagger()
tagger.open('../../data/04_models/crf_ingredients_initial.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]

In [58]:
mlb = MultiLabelBinarizer()
print(classification_report(y_pred=mlb.fit_transform(y_pred), y_true=mlb.fit_transform(y_test)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       194
           1       0.86      0.79      0.83        39
           2       0.94      0.99      0.97       168

   micro avg       0.96      0.98      0.97       401
   macro avg       0.93      0.93      0.93       401
weighted avg       0.96      0.98      0.97       401
 samples avg       0.96      0.97      0.96       401



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Our model is almost 100% accurate. That's good enough. Let's train all of our data on the model and move onto using it for our ingredients. 

In [59]:
trainer = pycrfsuite.Trainer(verbose=True)

# Submit training data to the trainer
for xseq, yseq in zip(X, y):
    trainer.append(xseq, yseq)

# Set the parameters of the model
trainer.set_params({
    # coefficient for L1 penalty
    'c1': 0.1,

    # coefficient for L2 penalty
    'c2': 0.01,  

    # maximum number of iterations
    'max_iterations': 200,

    # whether to include transitions that
    # are possible, but not observed
    'feature.possible_transitions': True
})

# Provide a file name as a parameter to the train function, such that
# the model will be saved to the file when training is finished
trainer.train('../../data/04_models/crf_instacart_products_final.model')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 4719
Seconds required: 0.020

L-BFGS optimization
c1: 0.100000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 2746.490798
Feature norm: 1.000000
Error norm: 1104.380669
Active features: 4506
Line search trials: 1
Line search step: 0.000414
Seconds required for this iteration: 0.002

***** Iteration #2 *****
Loss: 2369.340357
Feature norm: 1.378173
Error norm: 646.726482
Active features: 4439
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.001

***** Iteration #3 *****
Loss: 2054.669438
Feature norm: 2.107407
Error norm: 504.188817
Active features: 4489
Line search trials: 1
Line search step: 1.000000
Seconds required for this iterat