# Tagging Products - Instacart Kaggle Dataset

In [1]:
import argparse
import nltk
import pandas as pd
import pycrfsuite
import numpy as np
import re
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer

nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/markishab/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## Read in Dataset and filter for only food

In [2]:
baskets = pd.read_csv('../../data/02_intermediate/baskets_spark.csv')
baskets.drop(columns=['Unnamed: 0'], inplace=True)

In [3]:
baskets.product_name.nunique()

24495

In [4]:
baskets.head()

Unnamed: 0,order_id,product_name,user_id,all_ones
0,2,Organic Egg Whites,202279,1
1,2,Michigan Organic Kale,202279,1
2,2,Garlic Powder,202279,1
3,2,Coconut Butter,202279,1
4,2,Natural Sweetener,202279,1


## Tagging Products through a CRF Model 

In [5]:
products_list = list(baskets.product_name)

In [6]:
# let's tokenize all the words and get rid of punctuation
tokenizer = RegexpTokenizer(r'(\d\/\d |\w+)')
token_sr = []
for product in products_list:
    token_sr.append(tokenizer.tokenize(product))

In [7]:
token_sr

[['Organic', 'Egg', 'Whites'],
 ['Michigan', 'Organic', 'Kale'],
 ['Garlic', 'Powder'],
 ['Coconut', 'Butter'],
 ['Natural', 'Sweetener'],
 ['Carrots'],
 ['Original', 'Unflavored', 'Gelatine', 'Mix'],
 ['All', 'Natural', 'No', 'Stir', 'Creamy', 'Almond', 'Butter'],
 ['Classic', 'Blend', 'Cole', 'Slaw'],
 ['Total', '2', 'with', 'Strawberry', 'Lowfat', 'Greek', 'Strained', 'Yogurt'],
 ['Unsweetened', 'Almondmilk'],
 ['Lemons'],
 ['Organic', 'Baby', 'Spinach'],
 ['Unsweetened', 'Chocolate', 'Almond', 'Breeze', 'Almond', 'Milk'],
 ['Organic', 'Ginger', 'Root'],
 ['Air', 'Chilled', 'Organic', 'Boneless', 'Skinless', 'Chicken', 'Breasts'],
 ['Organic', 'Ezekiel', '49', 'Bread', 'Cinnamon', 'Raisin'],
 ['Plain', 'Pre', 'Sliced', 'Bagels'],
 ['Oats', 'Chocolate', 'Chewy', 'Bars'],
 ['Kellogg', 's', 'Nutri', 'Grain', 'Apple', 'Cinnamon', 'Cereal'],
 ['Nutri',
  'Grain',
  'Soft',
  'Baked',
  'Strawberry',
  'Cereal',
  'Breakfast',
  'Bars'],
 ['Kellogg', 's', 'Nutri', 'Grain', 'Blueberry', 'C

### Feature Creation 

In [15]:
crf_data  = []
for product in token_sr:
    pos = nltk.pos_tag(product)
    crf_data.append(pos)

KeyboardInterrupt: 

In [15]:
len(products_list)

24890363

In [16]:
len(crf_data)

24890363

In [20]:
 def word2features(doc, i):
    word = doc[i][0]
    postag = doc[i][1]

    # Common features for all words
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'postag=' + postag
    ]

    # Features for words that are not
    # at the beginning of a document
    if i > 0:
        word1 = doc[i-1][0]
        postag1 = doc[i-1][1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:postag=' + postag1
        ])
    else:
        # Indicate that it is the 'beginning of a document'
        features.append('BOS')

    # Features for words that are not
    # at the end of a document
    if i < len(doc)-1:
        word1 = doc[i+1][0]
        postag1 = doc[i+1][1]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:postag=' + postag1
        ])
    else:
        # Indicate that it is the 'end of a document'
        features.append('EOS')

    return features

In [21]:
# A function for extracting features in documents
def extract_features(doc):
    return [word2features(doc, i) for i in range(len(doc))]

In [22]:
X = [extract_features(doc) for doc in crf_data]

In [23]:
X

[[['bias',
   'word.lower=organic',
   'word[-3:]=nic',
   'word[-2:]=ic',
   'postag=NNP',
   'BOS',
   '+1:word.lower=egg',
   '+1:postag=NNP'],
  ['bias',
   'word.lower=egg',
   'word[-3:]=Egg',
   'word[-2:]=gg',
   'postag=NNP',
   '-1:word.lower=organic',
   '-1:postag=NNP',
   '+1:word.lower=whites',
   '+1:postag=NNP'],
  ['bias',
   'word.lower=whites',
   'word[-3:]=tes',
   'word[-2:]=es',
   'postag=NNP',
   '-1:word.lower=egg',
   '-1:postag=NNP',
   'EOS']],
 [['bias',
   'word.lower=michigan',
   'word[-3:]=gan',
   'word[-2:]=an',
   'postag=NNP',
   'BOS',
   '+1:word.lower=organic',
   '+1:postag=NNP'],
  ['bias',
   'word.lower=organic',
   'word[-3:]=nic',
   'word[-2:]=ic',
   'postag=NNP',
   '-1:word.lower=michigan',
   '-1:postag=NNP',
   '+1:word.lower=kale',
   '+1:postag=NNP'],
  ['bias',
   'word.lower=kale',
   'word[-3:]=ale',
   'word[-2:]=le',
   'postag=NNP',
   '-1:word.lower=organic',
   '-1:postag=NNP',
   'EOS']],
 [['bias',
   'word.lower=garlic',

In [24]:
tagger = pycrfsuite.Tagger()
tagger.open('../../data/04_models/crf_instacart_products_final.model')

<contextlib.closing at 0x1dd6fd3d68>

In [25]:
y_pred = [tagger.tag(xseq) for xseq in X]

In [30]:
len(token_sr)

24890363

In [29]:
len(y_pred)

24890363

### Let's match up the tokens with their tags

Let's match up products with tags in a dictionary first. 

In [58]:
def product_tagger(product_sentence_tokens, product_label_sentence):
    pre = []
    food = []
    post = []

    for word, label in zip(product_sentence_tokens, product_label_sentence):
        if label == 'pre':
            pre.append(word.lower())
        if label == 'food':
            food.append(word.lower())
        if label == 'post':
            post.append(word.lower())
    return {'pre': " ".join(pre), 'food': " ".join(food), 'post': " ".join(post)}

In [59]:
def token_labels_to_dict(tokens, labels, titles):
    final_dict = {}
    for token, label, title in zip(tokens, labels, titles):
        prod = product_tagger(token, label)
        final_dict[str(title).lower()] = prod
    return final_dict

In [60]:
final_dict = token_labels_to_dict(token_sr, y_pred, products_list)

In [61]:
final_dict

{'organic egg whites': {'pre': 'organic', 'food': 'egg whites', 'post': ''},
 'michigan organic kale': {'pre': 'michigan organic',
  'food': 'kale',
  'post': ''},
 'garlic powder': {'pre': '', 'food': 'garlic powder', 'post': ''},
 'coconut butter': {'pre': '', 'food': 'coconut butter', 'post': ''},
 'natural sweetener': {'pre': '', 'food': 'natural sweetener', 'post': ''},
 'carrots': {'pre': '', 'food': 'carrots', 'post': ''},
 'original unflavored gelatine mix': {'pre': 'original unflavored',
  'food': 'gelatine mix',
  'post': ''},
 'all natural no stir creamy almond butter': {'pre': 'all',
  'food': 'natural',
  'post': 'no stir creamy almond butter'},
 'classic blend cole slaw': {'pre': 'classic blend',
  'food': 'cole slaw',
  'post': ''},
 'total 2% with strawberry lowfat greek strained yogurt': {'pre': 'total 2 with strawberry lowfat greek strained',
  'food': 'yogurt',
  'post': ''},
 'unsweetened almondmilk': {'pre': 'unsweetened',
  'food': 'almondmilk',
  'post': ''},
 'l

Let's make a new list of foods that we can replace the old list with. 

In [68]:
def food_label_finder(token_list, label_list):
    pre = []
    food = []
    post = []

    for word, label in zip(token_list, label_list):
        if label == 'pre':
            pre.append(word.lower())
        if label == 'food':
            food.append(word.lower())
        if label == 'post':
            post.append(word.lower())
    return " ".join(food)

In [69]:
prod_list_new = []
for token, label in zip(token_sr, y_pred):
    prod_list_new.append(food_label_finder(token, label))

In [70]:
prod_list_new

['egg whites',
 'kale',
 'garlic powder',
 'coconut butter',
 'natural sweetener',
 'carrots',
 'gelatine mix',
 'natural',
 'cole slaw',
 'yogurt',
 'almondmilk',
 'lemons',
 'spinach',
 'chocolate',
 'ginger',
 'chicken',
 'bread',
 'bagels',
 'oats',
 'kellogg',
 'breakfast bars',
 'cereal',
 'bananas',
 'just',
 'fruit salad',
 'raspberries',
 'milk',
 'tea',
 'chicken',
 'cheese',
 'macaroni and cheese',
 'clementines',
 'artichokes',
 'apricot preserves',
 'cheese',
 'avocado',
 'pasta',
 'lemon',
 'lemons',
 'pineapple',
 'sweet',
 'radish',
 'mushrooms',
 'spinach',
 'kefir',
 'applesauce',
 'green beans',
 'olive oil',
 'french baguettes',
 'bread',
 'almond',
 'cheese',
 'granola',
 'banana',
 'mushrooms',
 'cilantro',
 'avocado',
 'onions',
 'parsley',
 'strawberries',
 'beans',
 'green beans',
 'half half',
 'steak',
 'lettuce',
 'sunchoke',
 'chicken',
 'salsa',
 'beans',
 'olive oil',
 'coffee creamer',
 'milk',
 'heavenly',
 'whipped topping',
 'soup',
 'chicken',
 'pizz

In [71]:
baskets['new_prod_list'] = prod_list_new

In [74]:
len(baskets)

24890363

In [76]:
baskets.to_csv('../../data/05_model_output/baskets_newprodlist_2.csv', index=False)