# Using model to detect ingredient list section from post

Applying NER model to posts and then using heuristics to identify the ingredients list section from the post.

In [1]:
import pandas as pd
import numpy as np
import spacy

import sys
sys.path.append("/Users/maxkirwan/Desktop/Uni/Data Science MSc/Data Science Project/nutrition-insta")
import functions

In [2]:
posts1 = pd.read_csv("/Users/maxkirwan/Desktop/Uni/Data Science MSc/Data Science Project/nutrition-insta/Instagram Data Scraping/Phantom Buster/recipe_posts.csv")
posts2 = pd.read_csv("/Users/maxkirwan/Desktop/Uni/Data Science MSc/Data Science Project/nutrition-insta/Instagram Data Scraping/Phantom Buster/recipe_posts_2.csv")
posts3 = pd.read_csv("/Users/maxkirwan/Desktop/Uni/Data Science MSc/Data Science Project/nutrition-insta/Instagram Data Scraping/Phantom Buster/recipe_posts_3.csv")

posts = pd.concat([posts1,posts2,posts3])

In [3]:
# Getting english posts
posts = functions.get_english_posts(posts)

Detecting language of each post...
Language detection complete.
Time taken: 0:07:40.709580


In [6]:
# Preprocessing text descriptions
posts['description_preprocessed'] = posts['description'].apply(functions.preprocess_text)

In [8]:
def includes_ingredient_list(text):
    
    text_str = str(text)
    if ("ingredients:" in text or "ingredients :" in text) and text_str.count('\n') > 8:
        return True
        
    else:
        return False
    
# Adding includes_ingredient_list binary column
posts['includes_ingredient_list'] = posts['description_preprocessed'].apply(includes_ingredient_list)
# Keeping only posts with ingredient_list
posts = posts[posts['includes_ingredient_list']]

In [None]:
posts = pd.read_csv('preprocessed_descriptions.csv')

In [4]:
nlp = spacy.load("./model-best")

# Merging entities into single tokens
nlp.add_pipe("merge_entities")

<function spacy.pipeline.functions.merge_entities(doc: spacy.tokens.doc.Doc)>

In [10]:
doc = nlp(posts['description_preprocessed'][3])
colors = {
    "MEASUREMENT": "#bfeeb7",
    "INGREDIENT": "#feca74",
    "QUANTITY": "#e4e7d2"
}
options = {"ents": list(colors), "colors": colors}
spacy.displacy.render(doc, style="ent", options=options, jupyter=True)

In [30]:
def get_ingredient_triplets(text):
    
    '''
    Function to identify entity triplets which relate to unique ingredients.
    These triplets come in the form consecutive QUANTITY, MEASUREMENT, INGREDIENT entities.
    '''
    
    doc = nlp(text)
    ingredient_triplets = []
    
    for i in range(len(doc)-10):
        
        if doc[i].ent_type_ == 'QUANTITY' and doc[i+1].ent_type_ == 'MEASUREMENT':
            
            quantity_index = i
            measurement_index = i+1
                
            for j in range(i+1,i+8):

                if doc[j].ent_type_ == 'INGREDIENT':

                    ingredient_index = j
                    ingredient_triplets.append((quantity_index,measurement_index,ingredient_index))
                    break
                    
                    
    def ingredient_triplets_to_list(triplets):

        dict_list = []

        for i, j, k in triplets:

            ing_dict = {}

            ing_dict['ingredient'] = doc[k]
            ing_dict['quantity'] = doc[i]
            ing_dict['measurement'] = doc[j]

            dict_list.append(ing_dict)

        return dict_list
        
    return ingredient_triplets_to_list(ingredient_triplets)

In [31]:
get_ingredient_triplets(doc)

[{'ingredient': oil, 'quantity': 1/4, 'measurement': cup},
 {'ingredient': milk, 'quantity': 1/4, 'measurement': cup},
 {'ingredient': sugar, 'quantity': 1/2, 'measurement': cup},
 {'ingredient': vanilla, 'quantity': 2, 'measurement': teaspoon},
 {'ingredient': flour, 'quantity': 2, 'measurement': cup},
 {'ingredient': cinnamon, 'quantity': 1, 'measurement': teaspoon},
 {'ingredient': baking soda, 'quantity': 1, 'measurement': teaspoon},
 {'ingredient': salt, 'quantity': 1/2, 'measurement': teaspoon}]

In [32]:
# Getting ingredient triplets for all posts
posts['ingredient_list'] = posts['description_preprocessed'].apply(get_ingredient_triplets)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  posts['ingredient_list'] = posts['description_preprocessed'].apply(get_ingredient_triplets)


In [33]:
for i in posts['ingredient_list'][0:5]:
    print(i,"\n\n")

[{'ingredient': oats, 'quantity': 1/2, 'measurement': cup}, {'ingredient': milk, 'quantity': 1/2, 'measurement': cup}, {'ingredient': water, 'quantity': 1/2, 'measurement': cup}, {'ingredient': sugar, 'quantity': 1, 'measurement': teaspoon}, {'ingredient': peanut butter, 'quantity': 1, 'measurement': tablespoon}, {'ingredient': banana, 'quantity': 1, 'measurement': tablespoon}] 


[{'ingredient': pesto, 'quantity': 2, 'measurement': teaspoon}] 


[{'ingredient': ginger, 'quantity': 1, 'measurement': tablespoon}, {'ingredient': tamari, 'quantity': 6, 'measurement': tablespoon}, {'ingredient': hemp seeds, 'quantity': 1, 'measurement': tablespoon}, {'ingredient': water, 'quantity': 2, 'measurement': tablespoon}, {'ingredient': oil, 'quantity': 1, 'measurement': tablespoon}] 


[{'ingredient': mushrooms, 'quantity': 200, 'measurement': gram}, {'ingredient': sugar, 'quantity': 20, 'measurement': gram}, {'ingredient': baking powder, 'quantity': 6, 'measurement': gram}, {'ingredient': salt, '

In [34]:
# Average length of ingredient list
np.mean([len(ing_list) for ing_list in list(posts['ingredient_list'])])

4.017964071856287

In [35]:
# Number of empty ingredient lists
[len(ing_list) for ing_list in list(posts['ingredient_list'])].count(0)

59

### Getting list of ingredients

In [41]:
def get_ingredients(ing_list):
    return [ing['ingredient'] for ing in ing_list]

In [43]:
# Get list of ingredients for all posts
posts['ingredients'] = posts['ingredient_list'].apply(get_ingredients)

In [44]:
posts

Unnamed: 0,postUrl,profileUrl,username,fullName,commentCount,likeCount,pubDate,description,location,imgUrl,...,isSidecar,sidecarMedias,videoUrl,viewCount,language,score,description_preprocessed,includes_ingredient_list,ingredient_list,ingredients
81,https://www.instagram.com/p/Cgd_iR_vT2d/,https://www.instagram.com/all.about.oats,all.about.oats,Anushka Lodhi,0,2,2022-07-26T08:23:14.000Z,Chocolate fudge protein oatmeal🤎 💪\nIngredient...,"Ghaziabad, India",https://scontent-lhr8-2.cdninstagram.com/v/t51...,...,True,3.0,,,en,0.999996,chocolate fudge protein oatmeal \ningredients:...,True,"[{'ingredient': oats, 'quantity': 1/2, 'measur...","[oats, milk, water, sugar, peanut butter, banana]"
104,https://www.instagram.com/p/Cgd8bLXDaCq/,https://www.instagram.com/a_m_eats,a_m_eats,Alice & Meg 🍴,3,12,2022-07-26T07:56:03.000Z,⁣Caprese Chicken with Pesto 🌿🍅⠀\n⠀\nThis dish ...,"Glasgow, United Kingdom",https://scontent-lhr8-1.cdninstagram.com/v/t51...,...,True,3.0,,,en,0.999997,⁣caprese chicken with pesto ⠀\n⠀\nthis dish is...,True,"[{'ingredient': pesto, 'quantity': 2, 'measure...",[pesto]
169,https://www.instagram.com/p/CgdylqtOy9s/,https://www.instagram.com/hescottwellness,hescottwellness,"Natasha Hescott, RDN, CDN",0,1,2022-07-26T06:30:06.000Z,Looking for a fast recipe to make for lunch? T...,,https://scontent-lhr8-1.cdninstagram.com/v/t51...,...,False,,,,en,0.999996,looking for a fast recipe to make for lunch? t...,True,"[{'ingredient': ginger, 'quantity': 1, 'measur...","[ginger, tamari, hemp seeds, water, oil]"
172,https://www.instagram.com/p/CgdyENyLgmv/,https://www.instagram.com/rainbowpiatto,rainbowpiatto,Rainbow Piatto,1,14,2022-07-26T06:25:32.000Z,Sweet & Savoury Mushroom Scones~🍄 This easy an...,Singapore / Singapura / 新加坡 / சிங்கப்பூர்,https://scontent-lhr8-1.cdninstagram.com/v/t51...,...,False,,,,en,0.999998,sweet & savoury mushroom scones~ this easy and...,True,"[{'ingredient': mushrooms, 'quantity': 200, 'm...","[mushrooms, sugar, baking powder, salt, butter..."
226,https://www.instagram.com/p/CgdpvTNvrWt/,https://www.instagram.com/foodiesfood_court,foodiesfood_court,Foodies Food Court,43,1819,2022-07-26T05:12:47.000Z,Paneer Tikka Recipe😍\nSave it to try later♥️\n...,Bihar,https://scontent-lhr8-2.cdninstagram.com/v/t51...,...,True,2.0,,,en,0.999996,paneer tikka recipe\nsave it to try later\n\ni...,True,"[{'ingredient': flour, 'quantity': 1/2, 'measu...","[flour, carom seeds, ginger, red mirch, cumin ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3208,https://www.instagram.com/p/CgCX2p_DSsu/,https://www.instagram.com/alwayshungryinlondon,alwayshungryinlondon,𝐇𝐚𝐧𝐧𝐚𝐡 𝐃𝐉,111,6861,2022-07-15T14:57:00.000Z,Prawn Taco Bowl\n—————————————————————-\nPerfe...,"London, United Kingdom",https://scontent-cdg2-1.cdninstagram.com/v/t51...,...,False,,,,en,0.999997,prawn taco bowl\n—————————————————————-\nperfe...,True,"[{'ingredient': prawns, 'quantity': 90, 'measu...","[prawns, garlic powder, chilli, coriander, ric..."
3389,https://www.instagram.com/p/ChFyI8OLgWy/,https://www.instagram.com/everyday_homecooking,everyday_homecooking,JESS WHARTON | FOODIE,2,23,2022-08-10T19:15:48.000Z,"SALMON, BUTTERNUT SQUASH, SWEET POTATO AND KAL...",,https://scontent-cdt1-1.cdninstagram.com/v/t51...,...,False,,,,en,0.999995,"salmon, butternut squash, sweet potato and kal...",True,"[{'ingredient': rice, 'quantity': 1, 'measurem...","[rice, paprika, paprika, garlic powder, turmer..."
3463,https://www.instagram.com/p/ChFdeY_P89I/,https://www.instagram.com/masterclassuk,masterclassuk,MasterClass,1,11,2022-08-10T16:15:14.000Z,Are you feeling lazy? We have just the recipe ...,,https://scontent-cdt1-1.cdninstagram.com/v/t51...,...,False,,,,en,0.999995,are you feeling lazy? we have just the recipe ...,True,"[{'ingredient': pasta, 'quantity': 200, 'measu...","[pasta, salt, bread, olive, butter, hazelnuts,..."
3556,https://www.instagram.com/p/ChFKUXyoR-t/,https://www.instagram.com/ketoguide24,ketoguide24,keto diet | meal plan,2,21,2022-08-10T13:27:50.000Z,🍽 Servings: 1\n⠀ ⠀\nIngredients:\n⠀\n1/4 cup s...,USA,https://scontent-cdt1-1.cdninstagram.com/v/t51...,...,False,,,,en,0.999998,servings: 1 \n⠀ ⠀\ningredients:\n⠀\n1/4 cup sh...,True,"[{'ingredient': cheddar, 'quantity': 1/4, 'mea...","[cheddar, flour]"


## Using USDA API

In [47]:
import requests

headers = {
    # Already added when you pass json= but not when you pass data=
    # 'Content-Type': 'application/json',
}

params = {
    'api_key': 'XBZBkznaG3lFCfED92u489nCZFlm9zdBNdH6lCEZ',
}

json_data = {
    'query': 'Cheddar cheese',
    'dataType': [
        'Branded',
    ],
    'sortBy': 'fdcId',
    'sortOrder': 'desc',
}

response = requests.post('https://api.nal.usda.gov/fdc/v1/foods/list', params=params, headers=headers, json=json_data)

In [80]:
json_data = {
    'query': 'oats',
    'dataType': [
        'Foundation',
        'SR Legacy'
    ],
    'pageSize' : 1,
    'pageNumber' : 1,
    'sortBy': 'fdcId',
    'sortOrder': 'desc',
}

response = requests.post('https://api.nal.usda.gov/fdc/v1/foods/list', params=params, headers=headers, json=json_data)

In [81]:
print(len([nutrient['name'] for nutrient in response.json()[0]['foodNutrients']]))
[nutrient['name'] for nutrient in response.json()[0]['foodNutrients']]

58


['Thiamin',
 'Riboflavin',
 'Niacin',
 'Vitamin B-6',
 'Biotin',
 'Folate, total',
 'Vitamin B-12',
 'Water',
 'Galactose',
 'Fiber, total dietary',
 'Tryptophan',
 'Threonine',
 'Isoleucine',
 'Leucine',
 'Lysine',
 'Methionine',
 'Calcium, Ca',
 'Iron, Fe',
 'Phenylalanine',
 'Tyrosine',
 'Magnesium, Mg',
 'Valine',
 'Phosphorus, P',
 'Arginine',
 'Potassium, K',
 'Histidine',
 'Sodium, Na',
 'Alanine',
 'Zinc, Zn',
 'Aspartic acid',
 'Glutamic acid',
 'Glycine',
 'Copper, Cu',
 'Proline',
 'Serine',
 'Hydroxyproline',
 'Manganese, Mn',
 'Molybdenum, Mo',
 'Selenium, Se',
 'Cysteine',
 'Retinol',
 'Vitamin D2 (ergocalciferol)',
 'Vitamin D3 (cholecalciferol)',
 'Nitrogen',
 'Total lipid (fat)',
 'Ash',
 'Sucrose',
 'Glucose',
 'Fructose',
 'Lactose',
 'Maltose',
 'Protein',
 'Carbohydrate, by difference',
 'Vitamin A',
 'Vitamin D (D2 + D3)',
 'Energy (Atwater General Factors)',
 'Vitamin D (D2 + D3), International Units',
 'Sugars, Total NLEA']

In [82]:
def get_usda_info_json(ingredient):
    
    headers = {}
    params = {'api_key': 'XBZBkznaG3lFCfED92u489nCZFlm9zdBNdH6lCEZ'}
    
    json_data = {
        'query': ingredient,
        'dataType': [
            'Foundation',
            'SR Legacy'
        ],
        'pageSize' : 1,
        'pageNumber' : 1,
        'sortBy': 'fdcId',
        'sortOrder': 'desc',
    }

    response = requests.post('https://api.nal.usda.gov/fdc/v1/foods/list', params=params, headers=headers, json=json_data)
    
    return response.json()[0]

In [85]:
get_usda_info_json('peanut butter')

{'fdcId': 2262074,
 'description': 'Almond butter, creamy',
 'dataType': 'Foundation',
 'publicationDate': '2022-04-28',
 'ndbNumber': '12195',
 'foodNutrients': [{'number': '717',
   'name': 'Daidzin',
   'amount': 0.0281,
   'unitName': 'MG',
   'derivationCode': 'A',
   'derivationDescription': 'Analytical'},
  {'number': '718',
   'name': 'Genistin',
   'amount': 0.0,
   'unitName': 'MG',
   'derivationCode': 'A',
   'derivationDescription': 'Analytical'},
  {'number': '719',
   'name': 'Glycitin',
   'amount': 0.172,
   'unitName': 'MG',
   'derivationCode': 'A',
   'derivationDescription': 'Analytical'},
  {'number': '645',
   'name': 'Fatty acids, total monounsaturated',
   'amount': 34.7,
   'unitName': 'G',
   'derivationCode': 'A',
   'derivationDescription': 'Analytical'},
  {'number': '646',
   'name': 'Fatty acids, total polyunsaturated',
   'amount': 12.6,
   'unitName': 'G',
   'derivationCode': 'A',
   'derivationDescription': 'Analytical'},
  {'number': '404',
   'name