# NLP and Nutition Information
This notebook is used to get nutrition information such as Fat, Calories, Sugar, Carbohydrates, and Protein.

In [1]:
# import dependencies
import pandas as pd
import numpy as np
import re
from re import search
from fractions import Fraction

# import dependencies
import requests
import json

# import API key
from config import API_KEY

In [48]:
# import the raw csv
recipes_df_raw = pd.read_csv('../static/data/technical_list.csv', encoding="utf-8")
recipes_df = recipes_df_raw.copy()
recipes_df_raw.head

<bound method NDFrame.head of                                                  title           judge  \
0                   Rav Gill’s Macaron Snack-Cessories    Ravneet Gill   
1    Liam Charles’s Biscuit Card Tower & Deck of Cards    Liam Charles   
2                 Rav Gill’s Cherry & Almond Deco Roll    Ravneet Gill   
3               Liam Charles’s Yorkshire Pudding Wraps    Liam Charles   
4          Rav Gill’s Churros with Banana Butterscotch    Ravneet Gill   
..                                                 ...             ...   
124                    Mary Berry’s Hazelnut Dacquoise  Paul Hollywood   
125                           Mary Berry’s Religieuses  Paul Hollywood   
126                  Paul Hollywood’s Apricot Couronne  Paul Hollywood   
127          Mary Berry’s Tuiles with Chocolate Mousse  Paul Hollywood   
128                      Mary Berry’s Charlotte Royale  Paul Hollywood   

            makes              difficulty hands_on_time baking_time  \
0         

In [3]:
# Formats for ingredients
# 1g unsalted butter
# 1 tsp vanilla extract
# slivered pistachios
# 2 large eggs
# 2 dill pickles, quartered lengthways
# juice of 1/2 lemon

def split_ingredient(ingredient):
    '''
    The split_ingredients funcion takes an ingredient in a string format i.e. 10g sugar. The function then splits the ingredient into quantity, measurement, and any comments like temperature, or preparation.
    '''
    # create variable defaults
    comment = 'na'
    quantity = '0'
    measurement = 'na'
    item = 'na'

    # if the ingredient contains a comment
    if ',' in ingredient:
        comment = ingredient.split(',')[1]
        ingredient = ingredient.split(',')[0]

    # if measurement is weight or volume (works for g or ml)
    elif re.match('^\d+[a-zA-Z]',ingredient) is not None:
        quantity = re.findall('\d+', ingredient)[0]
        measurement = re.findall('^\d+([a-zA-Z]+) ', ingredient)[0]
        item = re.findall('\d+[a-zA-Z]+ (.*)', ingredient)[0]

    # if ingredient is eggs
    elif 'egg' in ingredient:
        quantity = re.findall('\d+', ingredient)[0]
        measurement = re.findall('\d+(.*) ', ingredient)[0]
        item = 'egg'

    # Juice of
    elif 'juice of' in ingredient:
        item = re.findall('juice of [^a-z]+([a-z ]+)', ingredient, re.IGNORECASE)[0]
        if 'zest' in ingredient:
            comment = 'juice and zest'
        else:
            comment = 'juice'
        if re.findall('[0-9]+', ingredient) is not None:
            quantity = re.findall('[0-9]+', ingredient)[0]
        else:
            quantity = '1'


    # if ingredient uses tsp or tbsp
    elif 'tbsp' in ingredient: 
        try:
            quantity = re.findall('^([0-9]+/*[0-9]*)', ingredient)[0]
        except IndexError:
            quantity = "0.5"
        measurement = 'tbsp'
        item = re.findall('tbsp (.*)', ingredient)[0]

    elif 'tsp' in ingredient:
        try:
            quantity = re.findall('^([0-9]+/*[0-9]*)', ingredient)[0]
        except IndexError:
            quantity = "0.5"
        measurement = 'tsp'
        item = re.findall('tsp (.*)', ingredient)[0]
    
    # if it just a number and ingredient (2 pickles)
    elif re.match('^[0-9]+/*[0-9]* ', ingredient) is not None:
        quantity = re.findall('^([0-9]+/*[0-9]*)', ingredient)[0]
        measurement = "whole"
        item = re.findall('^[0-9]+/*[0-9]* (.*)', ingredient)[0]
    
    # a pinch of something
    elif search('pinch', ingredient) is not None:
        measurement = re.findall('(.*pinch)', ingredient)[0]
        item = re.findall('.*pinch of (.*)', ingredient)[0]
        if re.match('^[0-9]', ingredient) is not None:
            quantity = re.findall('^([0-9])', ingredient)[0]

    # Other
    else:
        comment = "Could not parse ingredient"
        # format into dict
        ingredient_list_split = {'quantity':quantity, 'measurement':measurement, 'ingredient':item,'comment':comment}
        return ingredient_list_split

    # make sure the quantity is the right data type
    if '/' in quantity:
        quantity = Fraction(quantity)
    
    # format into dict
    ingredient_list_split = {'quantity':float(quantity), 'measurement':measurement, 'ingredient':item, 'comment':comment}
    
    return ingredient_list_split

In [4]:
def seperate_ingredients(ingredient_string):
    '''
    The seperate_ingredients funcion takes an ingredient list with each ingredient inside a <p></p> tag and returns a list containing all the ingredients in a string format.
    '''
    ingredient_list = re.findall('<p>(.*?)</p>', ingredient_string)
    
    return ingredient_list

In [5]:
# test of the split ingredient function
test_ingredient = '1/2 lime'
test_ingredient_split = (split_ingredient(test_ingredient))
test_item = test_ingredient_split['ingredient']
print(test_ingredient_split)

{'quantity': 0.5, 'measurement': 'whole', 'ingredient': 'lime', 'comment': 'na'}


# Nutrtion API
The API used is the Food Data Central Nutrition API from the FDA. 

In [6]:
def get_fdcid (food_item, api_key = "DEMO_KEY"):
    '''
    Pass in a string of the ingredient requested (i.e. "whole milk") and the API key (defaults to the API demo key). The function then returns the fdcid of the ingredient by using the foods/search endpoint. The function uses the Foundation data type and select the first item found. 
    '''

    # create base URL
    search_request_url = 'https://api.nal.usda.gov/fdc/v1/search?api_key='

    # make call
    response = requests.get(f'{search_request_url}{api_key}&query={food_item}&datatype=Foundation')

    # parse the returned JSON
    parsed = response.json()

    # Select first fdcId
    fdcId = parsed['foods'][0]['fdcId']

    return(fdcId)


In [7]:
def get_macros(item_fdcid, api_key = "DEMO_KEY"):
    '''
    This function takes in an fdcid and an API key (defaults to the API demo key) and returns the key macro nutrients for the item. These macros are fat, protein, energy, cholesterol, and sugars.
    '''

    # create the url base
    requested_nutr_url = f'https://api.nal.usda.gov/fdc/v1/food/'

    # make call
    response = requests.get(f'{requested_nutr_url}{item_fdcid}?api_key={api_key}')

    # parse the returned json
    parsed_nutr = response.json()

    # select the nutrients you'd like
    nutrients = ['Total lipid (fat)', 'Protein', 'Energy' , 'Cholesterol', 'Sugars, total including NLEA']
    nutr_return = {'Total lipid (fat)': 0.0, 'Protein': 0.0, 'Energy': 0.0, 'Cholesterol': 0.0, 'Sugars, total including NLEA': 0.0}

    # get the serving size
    #nutr_return['Serving Size'] = parsed_nutr['servingSize']
    #nutr_return['Serving Size Unit'] = parsed_nutr['servingSizeUnit']

    # parse the JSON for the nutrients and append them to the dict
    for nutrient in nutrients:
        for nutr in parsed_nutr['foodNutrients']:
            if nutr['nutrient']['name'] == nutrient:
                nutr_return[nutrient] = nutr['amount']
        

    return (nutr_return)

In [58]:
def get_macro_total(ing_list, api_key = "DEMO_KEY"):

    # seperate the ingredients on the p tag
    ingredient_list = seperate_ingredients(ing_list)

    # for each ingredient split it using nlp
    ingredients = []
    for ingredient in ingredient_list:
            ingredients.append(split_ingredient(ingredient))

    #for each ingredient, get the nutrition information and add to total
    nutrition_totals = {'Total lipid (fat)': 0.0, 'Protein': 0.0, 'Calories': 0.0, 'Cholesterol': 0.0, 'Sugars, total including NLEA': 0.0}

    ingredients_not_included = []

    for ingredient_to_exam in ingredients:
        # set default values
        fat = 0
        protein = 0
        calories = 0
        cholesterol = 0
        sugar = 0
        macros = {}

        try:
            macros = get_macros(get_fdcid(ingredient_to_exam['ingredient'], api_key), api_key)
        except:
            ingredients_not_included.append(ingredient_to_exam)
        
        # use the macros that have a measurement in ml or g
        if any(macros):
            if (ingredient_to_exam['measurement'] == 'g' or ingredient_to_exam['measurement'] == 'ml'):
                quantity = float(ingredient_to_exam['quantity'])  
                fat = (macros['Total lipid (fat)'] * quantity)/100
                protein = (macros['Protein'] * quantity)/100
                calories = (macros['Energy'] * quantity)/100
                cholesterol = (macros['Cholesterol'] * quantity)/100
                sugar = (macros['Sugars, total including NLEA'] * quantity)/100
        
        nutrition_totals['Total lipid (fat)'] += round(fat,2)
        nutrition_totals['Protein'] += round(protein,2)
        nutrition_totals['Calories'] += round(calories,2)
        nutrition_totals['Cholesterol'] += round(cholesterol,2)
        nutrition_totals['Sugars, total including NLEA'] += round(sugar,2)
        
    # nutrition_totals['Unused Ingredients'] = str(ingredients_not_included)

    return(nutrition_totals)

In [59]:
# add new columns for the nutrients
df_headers = ['title', 'judge', 'makes', 'difficulty', 'hands_on_time', 'baking_time', 'ingredients', 'recipe_image', 'judge_image', 'calories', 'cholesterol', 'fat', 'protein', 'sugar']

recipes_df = recipes_df.reindex(columns = df_headers) 

In [67]:
# test for one recipe
nutr_test_dict = get_macro_total(recipes_df['ingredients'][68], API_KEY)

recipes_df.at[68,'calories'] = nutr_test_dict['Calories']
recipes_df.at[68,'cholesterol'] = nutr_test_dict['Cholesterol']
recipes_df.at[68,'fat'] = nutr_test_dict['Total lipid (fat)']
recipes_df.at[68,'protein'] = nutr_test_dict['Protein']
recipes_df.at[68,'sugar'] = nutr_test_dict['Sugars, total including NLEA']

In [55]:
recipes_df.at[55,'calories'] = nutr_test_dict['Calories']
recipes_df.at[55,'cholesterol'] = nutr_test_dict['Cholesterol']
recipes_df.at[55,'fat'] = nutr_test_dict['Total lipid (fat)']
recipes_df.at[55,'protein'] = nutr_test_dict['Protein']
recipes_df.at[55,'sugar'] = nutr_test_dict['Sugars, total including NLEA']
#recipes_df.at[55,'unused_ingredients'] = str(nutr_test_dict['Unused Ingredients'])

In [92]:
# iterate through the df and add values for the totals
for i in range(123,len(recipes_df.index)):
    nutr_dict = get_macro_total(recipes_df['ingredients'][i], API_KEY)
    recipes_df.at[i,'calories'] = nutr_dict['Calories']
    recipes_df.at[i,'cholesterol'] = nutr_dict['Cholesterol']
    recipes_df.at[i,'fat'] = nutr_dict['Total lipid (fat)']
    recipes_df.at[i,'protein'] = nutr_dict['Protein']
    recipes_df.at[i,'sugar'] = nutr_dict['Sugars, total including NLEA']
    

In [93]:
recipes_df.to_csv('nutr_test.csv')

In [71]:
# create base URL
search_request_url = 'https://api.nal.usda.gov/fdc/v1/search?api_key='

# make call
response = requests.get(f'{search_request_url}{API_KEY}&query=milk&datatype=Foundation')

# parse the returned JSON
parsed = response.json()

par

In [87]:
split_ing_list = seperate_ingredients(recipes_df['ingredients'][73])
for j in split_ing_list:
    split_test = split_ingredient(j)
    print(split_test)

{'quantity': 200.0, 'measurement': 'g', 'ingredient': 'plain flour', 'comment': 'na'}
{'quantity': 0.0, 'measurement': 'pinch', 'ingredient': 'salt', 'comment': 'na'}
{'quantity': 0.0, 'measurement': 'na', 'ingredient': 'na', 'comment': ' diced'}
{'quantity': 75.0, 'measurement': 'g', 'ingredient': 'caster sugar', 'comment': 'na'}
{'quantity': 1.0, 'measurement': 'tbsp', 'ingredient': 'vanilla paste', 'comment': 'na'}
{'quantity': 1.0, 'measurement': ' egg', 'ingredient': 'egg', 'comment': 'na'}
{'quantity': 0.0, 'measurement': 'na', 'ingredient': 'na', 'comment': ' melted'}


In [90]:
recipes_df['ingredients'][123]

'[<h5>For the dough:</h5>, <p>500g strong white bread flour</p>, <p>10g salt</p>, <p>7g sachet fast-action dried yeast</p>, <p>40g unsalted butter, softened</p>, <p>1 tablespoon malt extract</p>, <p>280ml milk, at room temperature</p>, <p>finely grated zest of 2 oranges</p>, <p>50g poppy seeds</p>]'