# Using an NER model to detect the ingredient list section of a post

This notebook applies an NER model to each post and then uses heuristics over the predicted entities to identify the ingredient list section of the post.

In [4]:
import pandas as pd
import numpy as np
import spacy

In [5]:
# Load the preprocessed post descriptions; later cells read the
# 'description_preprocessed' column of this frame.
posts = pd.read_csv('preprocessed_descriptions.csv')

In [66]:
# Load the spaCy pipeline stored in the local "./model-best" directory.
nlp = spacy.load("./model-best")

# Merging entities into single tokens, so that each entity can be addressed
# by one token index in the cells below.
nlp.add_pipe("merge_entities")

<function spacy.pipeline.functions.merge_entities(doc: spacy.tokens.doc.Doc)>

In [70]:
# Run the pipeline on one example post (index 3) so its entities can be inspected.
doc = nlp(posts['description_preprocessed'][3])
# Highlight colour for each entity label shown in the rendering.
colors = {
    "MEASUREMENT": "#bfeeb7",
    "INGREDIENT": "#feca74",
    "QUANTITY": "#e4e7d2"
}
# Only the labels that have a colour above are rendered.
options = {"ents": list(colors), "colors": colors}
spacy.displacy.render(doc, style="ent", options=options, jupyter=True)

In [71]:
# Entity label of every token in the example doc, in token order
# ('' for tokens that are not part of an entity).
token_types = [token.ent_type_ for token in doc]
token_types

['',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'INGREDIENT',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'INGREDIENT',
 'INGREDIENT',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'QUANTITY',
 'MEASUREMENT',
 '',
 '',
 'INGREDIENT',
 '',
 '',
 '',
 '',
 'MEASUREMENT',
 '',
 '',
 'INGREDIENT',
 '',
 'QUANTITY',
 'MEASUREMENT',
 'INGREDIENT',
 '',
 'QUANTITY',
 'MEASUREMENT',
 'INGREDIENT',
 '',
 'QUANTITY',
 'MEASUREMENT',
 'INGREDIENT',
 '',
 'QUANTITY',
 'MEASUREMENT',
 '',
 'QUANTITY',
 'MEASUREMENT',
 '',
 'INGREDIENT',
 '',
 '',
 '',
 '',
 '',
 '',
 'QUANTITY',
 'MEASUREMENT',
 '',
 'INGREDIENT',
 '',
 '',
 '',
 'QUANTITY',
 '',
 'INGREDIENT',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'INGREDIENT',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'INGREDIENT',
 '',
 'INGREDIENT'

In [72]:
# Positions at which three consecutive tokens are labelled
# QUANTITY, MEASUREMENT, INGREDIENT (in exactly that order).
ing_start_index = [
    start
    for start in range(len(token_types) - 2)
    if token_types[start:start + 3] == ['QUANTITY', 'MEASUREMENT', 'INGREDIENT']
]
ing_start_index

[78, 82, 86]

In [73]:
# Show the three-token span that begins at each matched position.
for start in ing_start_index:
    triplet_span = doc[start:start + 3]
    print(triplet_span)

20 gram sugar
6 gram baking powder
1/2 teaspoon salt


In [114]:
def get_ingredient_triplets(text, max_lookahead=5):
    '''
    Identify (quantity, measurement, ingredient) entity triplets in a post.

    A triplet starts at a QUANTITY token that is immediately followed by a
    MEASUREMENT token. The INGREDIENT is the first INGREDIENT token found
    after the measurement, no further than `max_lookahead` tokens away from
    the quantity. Pairs with no ingredient inside that window are dropped.

    Parameters
    ----------
    text : str
        Text of the post; it is processed with the module-level `nlp` pipeline.
        A missing value (None or NaN) yields an empty list.
    max_lookahead : int, default 5
        Largest allowed distance, in tokens, between the QUANTITY token and
        the INGREDIENT token.

    Returns
    -------
    list of dict
        One dict per triplet, in order of appearance, with the keys
        'ingredient', 'quantity' and 'measurement'. The values are the tokens
        taken from the processed doc (not plain strings).
    '''

    # Missing descriptions (None, or float NaN as produced by pandas) contain
    # nothing to extract and must not be handed to the pipeline.
    if text is None or (isinstance(text, float) and text != text):
        return []

    doc = nlp(text)
    n_tokens = len(doc)
    ingredient_list = []

    # Every token except the last can start a QUANTITY/MEASUREMENT pair, so
    # pairs close to the end of the post are considered as well.
    for i in range(n_tokens - 1):

        if doc[i].ent_type_ != 'QUANTITY' or doc[i + 1].ent_type_ != 'MEASUREMENT':
            continue

        # Token i+1 is the measurement, so the search starts at i+2; the
        # window is clipped so it never runs past the end of the doc.
        window_end = min(i + max_lookahead + 1, n_tokens)

        for j in range(i + 2, window_end):

            if doc[j].ent_type_ == 'INGREDIENT':
                ingredient_list.append({
                    'ingredient': doc[j],
                    'quantity': doc[i],
                    'measurement': doc[i + 1],
                })
                # Only the nearest ingredient belongs to this quantity.
                break

    return ingredient_list

In [115]:
# The function takes the post's text and runs the pipeline itself, so pass the
# example's text instead of the already-processed (entity-merged) Doc.
get_ingredient_triplets(doc.text)

[{'ingredient': mushrooms, 'quantity': 200, 'measurement': gram},
 {'ingredient': sugar, 'quantity': 20, 'measurement': gram},
 {'ingredient': baking powder, 'quantity': 6, 'measurement': gram},
 {'ingredient': salt, 'quantity': 1/2, 'measurement': teaspoon},
 {'ingredient': butter, 'quantity': 55, 'measurement': gram},
 {'ingredient': egg, 'quantity': 25, 'measurement': gram}]

In [116]:
# Extract the ingredient triplets of every post and store them in a new
# 'ingredient_list' column (one list of dicts per post).
posts['ingredient_list'] = posts['description_preprocessed'].apply(get_ingredient_triplets)

In [122]:
# Preview the extracted ingredient lists.
posts['ingredient_list']

0      [{'ingredient': oats, 'quantity': 1/2, 'measur...
1      [{'ingredient': pesto, 'quantity': 2, 'measure...
2      [{'ingredient': ginger, 'quantity': 1, 'measur...
3      [{'ingredient': mushrooms, 'quantity': 200, 'm...
4      [{'ingredient': flour, 'quantity': 1/2, 'measu...
                             ...                        
328    [{'ingredient': prawns, 'quantity': 90, 'measu...
329    [{'ingredient': rice, 'quantity': 1, 'measurem...
330    [{'ingredient': pasta, 'quantity': 200, 'measu...
331    [{'ingredient': cheddar, 'quantity': 1/4, 'mea...
332    [{'ingredient': cheddar, 'quantity': 1/4, 'mea...
Name: ingredient_list, Length: 333, dtype: object