# NLP and Nutition Information
This notebook is used to get nutrition information such as Fat, Calories, Sugar, Carbohydrates, and Protein.

In [21]:
# import dependencies
import pandas as pd
import numpy as np
import re
from re import search
from fractions import Fraction

# import dependencies
import requests
import json


In [22]:
# import the raw csv
recipes_df_raw = pd.read_csv('../static/data/technical_list.csv', encoding="utf-8")
recipes_df = recipes_df_raw.copy()
recipes_df_raw.head

<bound method NDFrame.head of                                                  title           judge  \
0                   Rav Gill’s Macaron Snack-Cessories    Ravneet Gill   
1    Liam Charles’s Biscuit Card Tower & Deck of Cards    Liam Charles   
2                 Rav Gill’s Cherry & Almond Deco Roll    Ravneet Gill   
3               Liam Charles’s Yorkshire Pudding Wraps    Liam Charles   
4          Rav Gill’s Churros with Banana Butterscotch    Ravneet Gill   
..                                                 ...             ...   
124                    Mary Berry’s Hazelnut Dacquoise  Paul Hollywood   
125                           Mary Berry’s Religieuses  Paul Hollywood   
126                  Paul Hollywood’s Apricot Couronne  Paul Hollywood   
127          Mary Berry’s Tuiles with Chocolate Mousse  Paul Hollywood   
128                      Mary Berry’s Charlotte Royale  Paul Hollywood   

            makes              difficulty hands_on_time baking_time  \
0         

In [27]:
# Formats for ingredients
# 1g unsalted butter
# 1 tsp vanilla extract
# slivered pistachios
# 2 large eggs
# 2 dill pickles, quartered lengthways
# juice of 1/2 lemon

def split_ingredient(ingredient):
    '''
    The split_ingredients funcion takes an ingredient in a string format i.e. 10g sugar. The function then splits the ingredient into quantity, measurement, and any comments like temperature, or preparation.
    '''
    # create variable defaults
    comment = 'na'
    quantity = '0'
    measurement = 'na'
    item = 'na'

    # if the ingredient contains a comment
    if ',' in ingredient:
        comment = ingredient.split(',')[1]
        ingredient = ingredient.split(',')[0]

    # if measurement is weight or volume (works for g or ml)
    if re.match('^\d+[a-zA-Z]',ingredient) is not None:
        quantity = re.findall('\d+', ingredient)[0]
        measurement = re.findall('^\d+([a-zA-Z]+) ', ingredient)[0]
        item = re.findall('\d+[a-zA-Z]+ (.*)', ingredient)[0]

    # if ingredient is eggs
    elif 'egg' in ingredient:
        quantity = re.findall('\d+', ingredient)[0]
        measurement = re.findall('\d+(.*) ', ingredient)[0]
        item = 'egg'

    # Juice of
    elif 'juice of' in ingredient:
        item = re.findall('juice of [^a-z]+([a-z ]+)', ingredient, re.IGNORECASE)[0]
        if 'zest' in ingredient:
            comment = 'juice and zest'
        else:
            comment = 'juice'
        if re.findall('[0-9]+', ingredient) is not None:
            quantity = re.findall('[0-9]+', ingredient)[0]
        else:
            quantity = '1'


    # if ingredient uses tsp or tbsp
    elif 'tbsp' in ingredient: 
        try:
            quantity = re.findall('^([0-9]+/*[0-9]*)', ingredient)[0]
        except IndexError:
            quantity = "0.5"
        measurement = 'tbsp'
        item = re.findall('tbsp (.*)', ingredient)[0]

    elif 'tsp' in ingredient:
        try:
            quantity = re.findall('^([0-9]+/*[0-9]*)', ingredient)[0]
        except IndexError:
            quantity = "0.5"
        measurement = 'tsp'
        item = re.findall('tsp (.*)', ingredient)[0]
    
    # if it just a number and ingredient (2 pickles)
    elif re.match('^[0-9]+/*[0-9]* ', ingredient) is not None:
        quantity = re.findall('^([0-9]+/*[0-9]*)', ingredient)[0]
        measurement = "whole"
        item = re.findall('^[0-9]+/*[0-9]* (.*)', ingredient)[0]
    
    # a pinch of something
    elif search('pinch', ingredient) is not None:
        measurement = re.findall('(.*pinch)', ingredient)[0]
        item = re.findall('.*pinch of (.*)', ingredient)[0]
        if re.match('^[0-9]', ingredient) is not None:
            quantity = re.findall('^([0-9])', ingredient)[0]

    # Other
    else:
        comment = "Could not parse ingredient"
        # format into dict
        ingredient_list_split = {'quantity':quantity, 'measurement':measurement, 'ingredient':ingredient,'comment':item}
        return ingredient_list_split

    # make sure the quantity is the right data type
    if '/' in quantity:
        quantity = Fraction(quantity)
    
    # format into dict
    ingredient_list_split = {'quantity':float(quantity), 'measurement':measurement, 'ingredient':item, 'comment':comment}
    
    return ingredient_list_split

In [24]:
def seperate_ingredients(ingredient_string):
    '''
    The seperate_ingredients funcion takes an ingredient list with each ingredient inside a <p></p> tag and returns a list containing all the ingredients in a string format.
    '''
    ingredient_list = re.findall('<p>(.*?)</p>', ingredient_string)
    
    return ingredient_list

In [25]:
# test of the split ingredient function
test_ingredient = '1/2 lime'
test_ingredient_split = (split_ingredient(test_ingredient))
test_item = test_ingredient_split['ingredient']
print(test_ingredient_split)

{'quantity': 0.5, 'measurement': 'whole', 'ingredient': '1/2 lime', 'comment': 'na'}


In [60]:
# dict to hold totals
total_ingredients = {}

# include only the judges from the show
judge_list = ['Mary Berry', 'Paul Hollywood', 'Prue Leith']
tech_rec_df = recipes_df[recipes_df['judge'].isin(judge_list)]
tech_rec_df.reset_index(inplace=True)

# iterate through df
for rec_num in range(0,len(tech_rec_df.index)):


    # list to hold the output of the split
    split_ing_list = []

    # split the ingredient list
    split_ing_list = seperate_ingredients(tech_rec_df['ingredients'][rec_num])

    nlp_ingredients = []
    # for each ingredient split it using nlp
    for ingredient in split_ing_list:
        try:
            nlp_ingredients.append(split_ingredient(ingredient))
        except:
            print(f'Couldn\'t parse {ingredient}')
    
    # check if it has been added to ing_list
    for ing in nlp_ingredients:

        # if it has, add the quantity to quantity and increment the count
        if ing['ingredient'] in total_ingredients:
            try:
                quantity = ing['quantity']
                total_ingredients[ing['ingredient']]['quantity'] += quantity
                total_ingredients[ing['ingredient']]['count'] += 1
            except:
                print(f'Could not parse {ing}')
        # if it hasn't, create a new key in the dict
        else: 
            new_ing_dict = {'quantity': ing['quantity'], 'measurement':ing['measurement'], 'count':1,}
            total_ingredients[ing['ingredient']] = new_ing_dict

print(total_ingredients)


t': 3}, 'currants': {'quantity': 250.0, 'measurement': 'g', 'count': 2}, 'dried cranberries ': {'quantity': 100.0, 'measurement': 'g', 'count': 1}, 'mixed peel ': {'quantity': 50.0, 'measurement': 'g', 'count': 1}, 'Juice and finely grated zest of 1 large unwaxed orange': {'quantity': '0', 'measurement': 'na', 'count': 1}, 'brandy': {'quantity': 100.0, 'measurement': 'ml', 'count': 1}, 'molasses': {'quantity': 77.0, 'measurement': 'g', 'count': 2}, 'bicarbonate of soda': {'quantity': 3.0, 'measurement': 'tsp', 'count': 4}, 'walnuts': {'quantity': 160.0, 'measurement': 'g', 'count': 2}, 'unsalted butter ': {'quantity': 55.0, 'measurement': 'g', 'count': 1}, 'icing sugar ': {'quantity': 40.0, 'measurement': 'g', 'count': 1}, 'egg': {'quantity': 169.0, 'measurement': ' egg yolk', 'count': 68}, 'ground almonds': {'quantity': 254.0, 'measurement': 'g', 'count': 2}, 'icing sugar': {'quantity': 1124.0, 'measurement': 'g', 'count': 7}, 'almond extract ': {'quantity': 1.5, 'measurement': 'tsp',

In [65]:
#create data frame for the answer
ing_totals_df = pd.DataFrame()

#loop through the dict and add the values to a dict which is then appended to the df
for key in total_ingredients:
    item = key
    quantity = total_ingredients[key]['quantity']
    count = total_ingredients[key]['count']
    measurement = total_ingredients[key]['measurement']
    item_dict = {'item': item, 'quantity':quantity, 'count':count, 'measurement':measurement}
    ing_totals_df = ing_totals_df.append(item_dict, ignore_index=True)

In [66]:
# set the dtypes
ing_totals_df = ing_totals_df.astype({'quantity': 'int32', 'count':'int32', 'item':'string'})
# sort the values in decending order by total quantity
ing_totals_df.sort_values(by=['quantity'], ascending=False)


Unnamed: 0,count,item,measurement,quantity
0,39,plain flour,g,6345
32,40,caster sugar,g,4535
28,14,strong white bread flour,g,4030
29,45,unsalted butter,g,3645
3,11,whole milk,tbsp,2246
...,...,...,...,...
166,1,instant espresso powder,tsp,0
158,1,small bunch of coriander,na,0
154,1,crystallised rose petals,na,0
148,1,8 gelatine leaves,na,0


In [67]:
# sort the values in decending order by count
ing_totals_df.sort_values(by=['count'], ascending=False)

Unnamed: 0,count,item,measurement,quantity
19,68,egg,egg yolk,169
29,45,unsalted butter,g,3645
32,40,caster sugar,g,4535
0,39,plain flour,g,6345
36,33,salt,tsp,57
...,...,...,...,...
120,1,mixed spice,tsp,1
121,1,cider vinegar,ml,175
127,1,wholemeal flour,g,125
129,1,cold water,ml,160


In [73]:
# multiply by 6 to include the number of participants in the total
ing_totals_df['quantity * 6'] = ing_totals_df['quantity']*6
ing_totals_df['quantity * 6(lbs)'] = ing_totals_df['quantity * 6']*2.20462
# sort the values in decending order by count
ing_totals_df.sort_values(by=['count'], ascending=False)

Unnamed: 0,count,item,measurement,quantity,quantity * 6,quantity * 6(lbs)
19,68,egg,egg yolk,169,1014,2235.48468
29,45,unsalted butter,g,3645,21870,48215.03940
32,40,caster sugar,g,4535,27210,59987.71020
0,39,plain flour,g,6345,38070,83929.88340
36,33,salt,tsp,57,342,753.98004
...,...,...,...,...,...,...
120,1,mixed spice,tsp,1,6,13.22772
121,1,cider vinegar,ml,175,1050,2314.85100
127,1,wholemeal flour,g,125,750,1653.46500
129,1,cold water,ml,160,960,2116.43520


In [71]:
total_ingredients['salted butter']

{'quantity': 265.0, 'measurement': 'g', 'count': 2}