# CRF Feature Engineering - Marianos Groceries

In [4]:
import argparse
import nltk
import pandas as pd
import pycrfsuite
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer

nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/markishab/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [16]:
grocery_prices_marianos = pd.read_csv('../../data/02_intermediate/marianos_groceries_only.csv')
crf_train_marianos = pd.read_csv('../../data/01_raw/marianos_product_train_complete.csv')

In [17]:
crf_train_marianos.head()

Unnamed: 0,item,pre,food,post
0,Green Onions (Scallions),Green Onions,Scallions,
1,Cilantro,,Cilantro,
2,Italian Parsley,Italian,Parsley,
3,Roundy's Organic Fresh Rosemary,Roundy's Organic Fresh,Rosemary,
4,Roundy's Organic Fresh Thyme,Roundy's Organic Fresh,Thyme,


In [21]:
crf_train_marianos.fillna('missing', inplace=True)

In [22]:
marianos_product_tuple = []
for index, row in  crf_train_marianos.iterrows():
    pre_description = row[1].lower()
    food = row[2].lower()
    post_description = row[3].lower()
    marianos_product_tuple.append([(pre_description, 'pre'), (food, 'food'), (post_description, 'post')])

In [24]:
marianos_product_tuple_new = []
for sub_list in marianos_product_tuple: 
    sub_ls = []
    for elem in sub_list:
        if " " in str(elem[0]):
            elem2 = elem[0].split(" ")
            for el in elem2:
                sub_ls.append((el, elem[1]))
        else: 
            sub_ls.append((elem[0], elem[1]))
    marianos_product_tuple_new.append(sub_ls)

Erase the tuples with missing as a value

In [26]:
for idx_big, product in enumerate(marianos_product_tuple_new):
    for idx, word in enumerate(product):
        if word[0] == 'missing':
            marianos_product_tuple_new[idx_big].remove(word)
        if word[0] == '':
            marianos_product_tuple_new[idx_big].remove(word)

In [27]:
marianos_product_tuple_new

[[('green', 'pre'), ('onions', 'pre'), ('scallions', 'food')],
 [('cilantro', 'food')],
 [('italian', 'pre'), ('parsley', 'food')],
 [("roundy's", 'pre'),
  ('organic', 'pre'),
  ('fresh', 'pre'),
  ('rosemary', 'food')],
 [("roundy's", 'pre'),
  ('organic', 'pre'),
  ('fresh', 'pre'),
  ('thyme', 'food')],
 [("roundy's", 'pre'), ('mint', 'food')],
 [('basil', 'food')],
 [('dill', 'food')],
 [("roundy's", 'pre'),
  ('organics', 'pre'),
  ('fresh', 'pre'),
  ('dill', 'food')],
 [('gourmet', 'pre'),
  ('garden™', 'pre'),
  ('ginger', 'food'),
  ('stir-in', 'food'),
  ('paste', 'food')],
 [('organic', 'pre'), ('chives', 'food')],
 [('simple', 'pre'),
  ('truth', 'pre'),
  ('organic', 'pre'),
  ('ithyme', 'food'),
  ('leaves', 'food')],
 [('organic', 'pre'), ('curly', 'pre'), ('parsley', 'food')],
 [("roundy's", 'pre'), ('organics', 'pre'), ('sage', 'food')],
 [("roundy's", 'pre'), ('bay', 'food'), ('leaf', 'food')],
 [('bellino', 'pre'),
  ('peeled', 'pre'),
  ('garlic', 'food'),
  ('clov

In [30]:
crf_data = []
for sub_list in marianos_product_tuple_new:
    sublist = []
    for word in sub_list:
        word2 = str(word[0]).strip(")!,.'?(")
        word2.
        sublist.append((word2, word[1]))
    crf_data.append(sublist)

In [36]:
data_marianos = []
for i, doc in enumerate(crf_data):

    # Obtain the list of tokens in the document
    tokens = [t for t, label in doc]
    
    # Perform POS tagging
    try:
        tagged = nltk.pos_tag(tokens)
    except:
        tagged = 'missing'

    # Take the word, POS tag, and its label
    data_marianos.append([(w, pos, label) for (w, label), (word, pos) in zip(doc, tagged)])

In [37]:
data_marianos

[[('green', 'JJ', 'pre'),
  ('onions', 'NNS', 'pre'),
  ('scallions', 'NNS', 'food')],
 [('cilantro', 'NN', 'food')],
 [('italian', 'JJ', 'pre'), ('parsley', 'NN', 'food')],
 [("roundy's", 'NN', 'pre'),
  ('organic', 'JJ', 'pre'),
  ('fresh', 'JJ', 'pre'),
  ('rosemary', 'NN', 'food')],
 [("roundy's", 'NN', 'pre'),
  ('organic', 'JJ', 'pre'),
  ('fresh', 'JJ', 'pre'),
  ('thyme', 'NN', 'food')],
 [("roundy's", 'NN', 'pre'), ('mint', 'NN', 'food')],
 [('basil', 'NN', 'food')],
 [('dill', 'NN', 'food')],
 [("roundy's", 'NN', 'pre'),
  ('organics', 'NNS', 'pre'),
  ('fresh', 'JJ', 'pre'),
  ('dill', 'NN', 'food')],
 [('gourmet', 'NN', 'pre'),
  ('garden™', 'NN', 'pre'),
  ('ginger', 'NN', 'food'),
  ('stir-in', 'JJ', 'food'),
  ('paste', 'NN', 'food')],
 [('organic', 'JJ', 'pre'), ('chives', 'NNS', 'food')],
 [('simple', 'JJ', 'pre'),
  ('truth', 'NN', 'pre'),
  ('organic', 'JJ', 'pre'),
  ('ithyme', 'NN', 'food'),
  ('leaves', 'NNS', 'food')],
 [('organic', 'JJ', 'pre'), ('curly', 'RB', 

In [32]:
def word2features(doc, i):
    word = doc[i][0]
    postag = doc[i][1]

    # Common features for all words
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'postag=' + postag
    ]

    # Features for words that are not
    # at the beginning of a document
    if i > 0:
        word1 = doc[i-1][0]
        postag1 = doc[i-1][1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:postag=' + postag1
        ])
    else:
        # Indicate that it is the 'beginning of a document'
        features.append('BOS')

    # Features for words that are not
    # at the end of a document
    if i < len(doc)-1:
        word1 = doc[i+1][0]
        postag1 = doc[i+1][1]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:postag=' + postag1
        ])
    else:
        # Indicate that it is the 'end of a document'
        features.append('EOS')

    return features

In [33]:
# A function for extracting features in documents
def extract_features(doc):
    return [word2features(doc, i) for i in range(len(doc))]

# A function fo generating the list of labels for each document
def get_labels(doc):
    return [label for (token, postag, label) in doc]

In [38]:
X = [extract_features(doc) for doc in data_marianos]
y = [get_labels(doc) for doc in data_marianos]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [39]:
X_test[0]

[['bias',
  'word.lower=aunt',
  'word[-3:]=unt',
  'word[-2:]=nt',
  'postag=NN',
  'BOS',
  '+1:word.lower=jemima',
  '+1:postag=NN'],
 ['bias',
  'word.lower=jemima',
  'word[-3:]=ima',
  'word[-2:]=ma',
  'postag=NN',
  '-1:word.lower=aunt',
  '-1:postag=NN',
  '+1:word.lower=original',
  '+1:postag=JJ'],
 ['bias',
  'word.lower=original',
  'word[-3:]=nal',
  'word[-2:]=al',
  'postag=JJ',
  '-1:word.lower=jemima',
  '-1:postag=NN',
  '+1:word.lower=lite',
  '+1:postag=NN'],
 ['bias',
  'word.lower=lite',
  'word[-3:]=ite',
  'word[-2:]=te',
  'postag=NN',
  '-1:word.lower=original',
  '-1:postag=JJ',
  '+1:word.lower=syrup',
  '+1:postag=NN'],
 ['bias',
  'word.lower=syrup',
  'word[-3:]=rup',
  'word[-2:]=up',
  'postag=NN',
  '-1:word.lower=lite',
  '-1:postag=NN',
  'EOS']]

In [40]:
trainer = pycrfsuite.Trainer(verbose=True)

# Submit training data to the trainer
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

# Set the parameters of the model
trainer.set_params({
    # coefficient for L1 penalty
    'c1': 0.1,

    # coefficient for L2 penalty
    'c2': 0.01,  

    # maximum number of iterations
    'max_iterations': 200,

    # whether to include transitions that
    # are possible, but not observed
    'feature.possible_transitions': True
})

# Provide a file name as a parameter to the train function, such that
# the model will be saved to the file when training is finished
trainer.train('../../data/04_models/crf_marianos_initial.model')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 6345
Seconds required: 0.028

L-BFGS optimization
c1: 0.100000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 4030.745503
Feature norm: 1.000000
Error norm: 1679.617297
Active features: 6063
Line search trials: 1
Line search step: 0.000336
Seconds required for this iteration: 0.003

***** Iteration #2 *****
Loss: 3593.071453
Feature norm: 1.127907
Error norm: 1013.006264
Active features: 5968
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.004

***** Iteration #3 *****
Loss: 3055.428271
Feature norm: 2.011533
Error norm: 1122.140328
Active features: 6030
Line search trials: 1
Line search step: 1.000000
Seconds required for this iter

***** Iteration #195 *****
Loss: 374.811122
Feature norm: 63.759939
Error norm: 1.189016
Active features: 1990
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.005

***** Iteration #196 *****
Loss: 374.811114
Feature norm: 63.757390
Error norm: 3.877805
Active features: 1988
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.002

***** Iteration #197 *****
Loss: 374.797730
Feature norm: 63.760766
Error norm: 2.398193
Active features: 1989
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.002

***** Iteration #198 *****
Loss: 374.795467
Feature norm: 63.757153
Error norm: 3.410581
Active features: 1986
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.002

***** Iteration #199 *****
Loss: 374.783747
Feature norm: 63.760073
Error norm: 1.988655
Active features: 1986
Line search trials: 1
Line search step: 1.000000
Seconds required for thi

In [41]:
tagger = pycrfsuite.Tagger()
tagger.open('../../data/04_models/crf_marianos_initial.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]

In [42]:
mlb = MultiLabelBinarizer()
print(classification_report(y_pred=mlb.fit_transform(y_pred), y_true=mlb.fit_transform(y_test)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       247
           1       0.85      0.68      0.76        50
           2       0.96      1.00      0.98       222

   micro avg       0.97      0.97      0.97       519
   macro avg       0.94      0.89      0.91       519
weighted avg       0.97      0.97      0.97       519
 samples avg       0.97      0.98      0.97       519



In [43]:
trainer = pycrfsuite.Trainer(verbose=True)

# Submit training data to the trainer
for xseq, yseq in zip(X, y):
    trainer.append(xseq, yseq)

# Set the parameters of the model
trainer.set_params({
    # coefficient for L1 penalty
    'c1': 0.1,

    # coefficient for L2 penalty
    'c2': 0.01,  

    # maximum number of iterations
    'max_iterations': 200,

    # whether to include transitions that
    # are possible, but not observed
    'feature.possible_transitions': True
})

# Provide a file name as a parameter to the train function, such that
# the model will be saved to the file when training is finished
trainer.train('../../data/04_models/crf_marianos_final.model')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 6926
Seconds required: 0.030

L-BFGS optimization
c1: 0.100000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 5011.595375
Feature norm: 1.000000
Error norm: 2082.053696
Active features: 6661
Line search trials: 1
Line search step: 0.000268
Seconds required for this iteration: 0.004

***** Iteration #2 *****
Loss: 4445.485064
Feature norm: 1.140086
Error norm: 1155.717395
Active features: 6501
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.003

***** Iteration #3 *****
Loss: 3937.341623
Feature norm: 1.759156
Error norm: 1237.148709
Active features: 6658
Line search trials: 1
Line search step: 1.000000
Seconds required for this iter

Storing the model
Number of active features: 2267 (6926)
Number of active attributes: 1723 (4921)
Number of active labels: 3 (3)
Writing labels
Writing attributes
Writing feature references for transitions
Writing feature references for attributes
Seconds required: 0.007

