# Tagging Products - Marianos Product Dataset

In [1]:
import argparse
import nltk
import pandas as pd
import pycrfsuite
import numpy as np
import re
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer

nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/markishab/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
# Load the intermediate Marianos groceries CSV into a DataFrame.
grocery_prices_marianos = pd.read_csv('../../data/02_intermediate/marianos_groceries_only.csv')

In [3]:
grocery_prices_marianos

Unnamed: 0,product,main_price,prod_aile,price_per_lb,measure_words_main_price,item_weight_count_vol,date_collected,store,location
0,Green Onions (Scallions),$0.99,Fresh Herbs,,each,,2019-08-28,Marianos,60615
1,Cilantro,$0.79,Fresh Herbs,,each,,2019-08-28,Marianos,60615
2,Italian Parsley,$0.99,Fresh Herbs,,each,,2019-08-28,Marianos,60615
3,Roundy's Organic Fresh Rosemary,$2.50,Fresh Herbs,,each,0.75 ounce,2019-08-28,Marianos,60615
4,Roundy's Organic Fresh Thyme,$2.50,Fresh Herbs,,each,0.75 ounce,2019-08-28,Marianos,60615
5,Roundy's Mint,$2.50,Fresh Herbs,,each,0.75 ounce,2019-08-28,Marianos,60615
6,Basil,$1.79,Fresh Herbs,,each,1 each,2019-08-28,Marianos,60615
7,Dill,$1.99,Fresh Herbs,,each,,2019-08-28,Marianos,60615
8,Roundy's Organics Fresh Dill,$2.50,Fresh Herbs,,each,1 bunch,2019-08-28,Marianos,60615
9,Gourmet Garden™ Ginger Stir-in Paste,$4.49,Fresh Herbs,,each,4 ounce,2019-08-28,Marianos,60615


In [8]:
grocery_prices_marianos['product'].nunique()

5851

In [9]:
# Raw product names as a plain list, in row order (repeated names are kept).
products_list = list(grocery_prices_marianos['product'])

In [19]:
# Strip apostrophes and periods from every product name so that, for
# example, "Roundy's" becomes "Roundys" before tokenization.
products_list_new = [
    name.replace("'", "").replace(".", "")
    for name in products_list
]

In [20]:
# Tokenize each cleaned product name. The pattern keeps either a
# digit/digit fraction followed by a space, or a run of word characters;
# everything else (remaining punctuation, symbols) is discarded.
tokenizer = RegexpTokenizer(r'(\d\/\d |\w+)')
token_sr = [tokenizer.tokenize(name) for name in products_list_new]

In [21]:
token_sr

[['Green', 'Onions', 'Scallions'],
 ['Cilantro'],
 ['Italian', 'Parsley'],
 ['Roundys', 'Organic', 'Fresh', 'Rosemary'],
 ['Roundys', 'Organic', 'Fresh', 'Thyme'],
 ['Roundys', 'Mint'],
 ['Basil'],
 ['Dill'],
 ['Roundys', 'Organics', 'Fresh', 'Dill'],
 ['Gourmet', 'Garden', 'Ginger', 'Stir', 'in', 'Paste'],
 ['Organic', 'Chives'],
 ['Simple', 'Truth', 'Organic', 'Ithyme', 'Leaves'],
 ['Organic', 'Curly', 'Parsley'],
 ['Roundys', 'Organics', 'Sage'],
 ['Roundys', 'Bay', 'Leaf'],
 ['Bellino', 'Peeled', 'Garlic', 'Cloves'],
 ['Gourmet', 'Garden', 'Lemongrass', 'Stir', 'in', 'Paste'],
 ['Organic', 'Fennel', 'Bulb'],
 ['Gourmet', 'Garden', 'Lightly', 'Dried', 'Parsley'],
 ['Organic', 'Curly', 'Parsley', 'Bunch'],
 ['McCormick', 'Thyme', 'Leaves'],
 ['Roundys', 'Organics', 'Fresh', 'Parsley'],
 ['Simple', 'Truth', 'Organic', 'Organic', 'Oregano', 'Leaves'],
 ['Roundys', 'Ground', 'Ginger'],
 ['Gourmet', 'Garden', 'Cilantro', 'Stir', 'in', 'Paste'],
 ['Spice', 'Islands', 'Thyme'],
 ['Ntwy', '

In [22]:
# Part-of-speech tag every tokenized product name. Each entry of crf_data
# is the list of (token, POS tag) pairs for one product, in the same order
# as token_sr.
# pos_tag_sents tags the whole batch with a single tagger lookup, instead
# of repeating that lookup for every product as per-item nltk.pos_tag
# calls do; the returned structure is the same.
crf_data = nltk.pos_tag_sents(token_sr)

In [24]:
len(products_list_new)

6242

In [25]:
len(crf_data)

6242

In [26]:
def word2features(doc, i):
    """Return the list of CRF feature strings for the token at position ``i``.

    Parameters
    ----------
    doc : sequence
        One tagged document; each element is indexable with the word at
        index 0 and its POS tag at index 1.
    i : int
        Position of the token to describe.

    Returns
    -------
    list of str
        Features of the token itself (bias, lower-cased word, last three
        and last two characters, POS tag), followed by the lower-cased word
        and POS tag of the previous token (or ``'BOS'`` when there is none),
        followed by the same for the next token (or ``'EOS'``).
    """
    token = doc[i][0]
    tag = doc[i][1]

    # Features every token gets, regardless of position.
    feats = [
        'bias',
        'word.lower=' + token.lower(),
        'word[-3:]=' + token[-3:],
        'word[-2:]=' + token[-2:],
        'postag=' + tag,
    ]

    # Left context: mark the start of the document, or describe the
    # preceding token.
    if i <= 0:
        feats.append('BOS')
    else:
        prev_token = doc[i - 1][0]
        prev_tag = doc[i - 1][1]
        feats += [
            '-1:word.lower=' + prev_token.lower(),
            '-1:postag=' + prev_tag,
        ]

    # Right context: mark the end of the document, or describe the
    # following token.
    if i >= len(doc) - 1:
        feats.append('EOS')
    else:
        next_token = doc[i + 1][0]
        next_tag = doc[i + 1][1]
        feats += [
            '+1:word.lower=' + next_token.lower(),
            '+1:postag=' + next_tag,
        ]

    return feats

In [27]:
def extract_features(doc):
    """Return one feature list per token of ``doc``, in token order.

    Each feature list is produced by ``word2features(doc, position)``.
    """
    rows = []
    for position in range(len(doc)):
        rows.append(word2features(doc, position))
    return rows

In [28]:
# Feature sequences for every product: X[k] holds one feature list per
# token of crf_data[k].
X = [extract_features(doc) for doc in crf_data]

In [29]:
X

[[['bias',
   'word.lower=green',
   'word[-3:]=een',
   'word[-2:]=en',
   'postag=JJ',
   'BOS',
   '+1:word.lower=onions',
   '+1:postag=NNP'],
  ['bias',
   'word.lower=onions',
   'word[-3:]=ons',
   'word[-2:]=ns',
   'postag=NNP',
   '-1:word.lower=green',
   '-1:postag=JJ',
   '+1:word.lower=scallions',
   '+1:postag=NNS'],
  ['bias',
   'word.lower=scallions',
   'word[-3:]=ons',
   'word[-2:]=ns',
   'postag=NNS',
   '-1:word.lower=onions',
   '-1:postag=NNP',
   'EOS']],
 [['bias',
   'word.lower=cilantro',
   'word[-3:]=tro',
   'word[-2:]=ro',
   'postag=NN',
   'BOS',
   'EOS']],
 [['bias',
   'word.lower=italian',
   'word[-3:]=ian',
   'word[-2:]=an',
   'postag=JJ',
   'BOS',
   '+1:word.lower=parsley',
   '+1:postag=NNP'],
  ['bias',
   'word.lower=parsley',
   'word[-3:]=ley',
   'word[-2:]=ey',
   'postag=NNP',
   '-1:word.lower=italian',
   '-1:postag=JJ',
   'EOS']],
 [['bias',
   'word.lower=roundys',
   'word[-3:]=dys',
   'word[-2:]=ys',
   'postag=NNP',
   'BO

In [30]:
# Create a CRF tagger and open the saved model file; the model must
# already exist at this path.
tagger = pycrfsuite.Tagger()
tagger.open('../../data/04_models/crf_marianos_final.model')

<contextlib.closing at 0x1a2a1d09b0>

In [31]:
# Tag every product's feature sequence; y_pred[k] is the label sequence
# for X[k] (the cells below look for the labels 'pre', 'food' and 'post').
y_pred = [tagger.tag(xseq) for xseq in X]

In [32]:
len(token_sr)

6242

In [33]:
len(y_pred)

6242

In [34]:
def product_tagger(product_sentence_tokens, product_label_sentence):
    """Group a product's tokens by their predicted label.

    Parameters
    ----------
    product_sentence_tokens : iterable of str
        Tokens of one product name.
    product_label_sentence : iterable of str
        Label for each token, paired with the tokens by position.

    Returns
    -------
    dict
        Keys ``'pre'``, ``'food'`` and ``'post'``; each value is the
        lower-cased tokens carrying that label joined by single spaces
        (an empty string when no token has the label). Tokens with any
        other label are ignored.
    """
    buckets = {'pre': [], 'food': [], 'post': []}

    for token, tag in zip(product_sentence_tokens, product_label_sentence):
        if tag in buckets:
            buckets[tag].append(token.lower())

    return {name: " ".join(words) for name, words in buckets.items()}

In [35]:
def token_labels_to_dict(tokens, labels, titles):
    """Map each lower-cased product title to its grouped tokens.

    ``tokens``, ``labels`` and ``titles`` are walked in parallel; for each
    position the token/label pair is passed to ``product_tagger`` and the
    result is stored under ``str(title).lower()``. If the same key occurs
    more than once, the later entry replaces the earlier one.
    """
    return {
        str(title).lower(): product_tagger(token_seq, label_seq)
        for token_seq, label_seq, title in zip(tokens, labels, titles)
    }

In [36]:
# Keyed by the original (un-stripped) product names from products_list,
# lower-cased; values are the pre/food/post groupings of the tokens.
final_dict = token_labels_to_dict(token_sr, y_pred, products_list)

In [37]:
final_dict

{'green onions (scallions)': {'pre': 'green onions',
  'food': 'scallions',
  'post': ''},
 'cilantro': {'pre': '', 'food': 'cilantro', 'post': ''},
 'italian parsley': {'pre': 'italian', 'food': 'parsley', 'post': ''},
 "roundy's organic fresh rosemary": {'pre': '',
  'food': 'roundys',
  'post': 'organic fresh rosemary'},
 "roundy's organic fresh thyme": {'pre': 'roundys organic fresh',
  'food': 'thyme',
  'post': ''},
 "roundy's mint": {'pre': 'roundys', 'food': 'mint', 'post': ''},
 'basil': {'pre': '', 'food': 'basil', 'post': ''},
 'dill': {'pre': '', 'food': 'dill', 'post': ''},
 "roundy's organics fresh dill": {'pre': 'roundys organics fresh',
  'food': 'dill',
  'post': ''},
 'gourmet garden™ ginger stir-in paste': {'pre': 'gourmet garden ginger stir in',
  'food': 'paste',
  'post': ''},
 'organic chives': {'pre': 'organic', 'food': 'chives', 'post': ''},
 'simple truth organic ithyme leaves': {'pre': 'simple truth organic',
  'food': 'ithyme',
  'post': 'leaves'},
 'organic

In [38]:
def food_label_finder(token_list, label_list):
    """Return the tokens labelled ``'food'`` as one lower-cased string.

    Parameters
    ----------
    token_list : iterable of str
        Tokens of one product name.
    label_list : iterable of str
        Label for each token, paired with the tokens by position.

    Returns
    -------
    str
        The ``'food'`` tokens, lower-cased and joined by single spaces, in
        their original order; an empty string when no token has that label.
    """
    # Only the 'food' tokens contribute to the result, so tokens with any
    # other label are skipped rather than collected and thrown away.
    return " ".join(
        word.lower()
        for word, label in zip(token_list, label_list)
        if label == 'food'
    )

In [39]:
# One extracted food name per product, in the same order as token_sr.
prod_list_new = [
    food_label_finder(token_seq, label_seq)
    for token_seq, label_seq in zip(token_sr, y_pred)
]

In [42]:
print(len(prod_list_new))
print(len(grocery_prices_marianos))

6242
6242


In [40]:
prod_list_new

['scallions',
 'cilantro',
 'parsley',
 'roundys',
 'thyme',
 'mint',
 'basil',
 'dill',
 'dill',
 'paste',
 'chives',
 'ithyme',
 'parsley',
 'sage',
 'leaf',
 'garlic',
 'paste',
 'fennel',
 'dried parsley',
 'parsley',
 'thyme',
 'parsley',
 'oregano',
 'ground ginger',
 'paste',
 'thyme',
 'dong',
 'lemon basil',
 'cocoa',
 'baking soda',
 'chocolate',
 'pumpkin',
 'cocoa',
 'pure vanilla extract',
 'baking chips',
 'stevia',
 'splenda',
 'evaporated milk',
 'flour bread',
 'dark chocolate',
 'flour',
 'light brown sugar',
 'chocolate',
 'sugar',
 'sugar',
 'vanilla extract',
 'granulated no calorie sweetener',
 'flour',
 'vanilla extract',
 'sliced almonds',
 'activedry yeast',
 'sugar',
 'milk chocolate',
 'pumpkin',
 'baking cocoa',
 'stevia',
 'walnut',
 'truvia',
 'pie crust',
 'sugar',
 'chocolate milk',
 'chocolate',
 'condensed milk',
 'powdered sugar',
 'sugar',
 'cake mix',
 'pudding',
 'pine nuts',
 'walnut',
 'all',
 'baking powder',
 'equal',
 'certified',
 'kroger swe

In [43]:
# Attach the extracted food names as a new column. The list is matched to
# rows by position, so it must be in the same order as the DataFrame rows.
grocery_prices_marianos['product_new'] = prod_list_new

In [44]:
grocery_prices_marianos

Unnamed: 0,product,main_price,prod_aile,price_per_lb,measure_words_main_price,item_weight_count_vol,date_collected,store,location,product_new
0,Green Onions (Scallions),$0.99,Fresh Herbs,,each,,2019-08-28,Marianos,60615,scallions
1,Cilantro,$0.79,Fresh Herbs,,each,,2019-08-28,Marianos,60615,cilantro
2,Italian Parsley,$0.99,Fresh Herbs,,each,,2019-08-28,Marianos,60615,parsley
3,Roundy's Organic Fresh Rosemary,$2.50,Fresh Herbs,,each,0.75 ounce,2019-08-28,Marianos,60615,roundys
4,Roundy's Organic Fresh Thyme,$2.50,Fresh Herbs,,each,0.75 ounce,2019-08-28,Marianos,60615,thyme
5,Roundy's Mint,$2.50,Fresh Herbs,,each,0.75 ounce,2019-08-28,Marianos,60615,mint
6,Basil,$1.79,Fresh Herbs,,each,1 each,2019-08-28,Marianos,60615,basil
7,Dill,$1.99,Fresh Herbs,,each,,2019-08-28,Marianos,60615,dill
8,Roundy's Organics Fresh Dill,$2.50,Fresh Herbs,,each,1 bunch,2019-08-28,Marianos,60615,dill
9,Gourmet Garden™ Ginger Stir-in Paste,$4.49,Fresh Herbs,,each,4 ounce,2019-08-28,Marianos,60615,paste


In [45]:
# Write the tagged table to the processed-data folder, without the
# DataFrame index.
grocery_prices_marianos.to_csv('../../data/03_processed/grocery_prices_marianos_final.csv', index=False)