# Using AI and NLP to Improve Your Cooking

Michael Humkey, Conor Marsten

## Introduction

### Why?
Surely all cooking can be improved by a little constructive criticism. But how do we determine what is useful?

### How?
Given a recipe and list of reviews for the recipe, we will look for criticisms by performing POS  tagging and pulling out nouns to provide suggestions for improving the recipe

## Building the Dataset

First we are using the food2fork api to search for a recipe. For simplicity’s sake we are just using the first element returned in the list of query results. This recipe consists of a source URL, a list of ingredients, a title, and a publisher name among a handful of other fields. Then we are using BeautifulSoup to load in the source URL and scrape each review from the web page. 


In [100]:
import urllib
import urllib.request

In [101]:
url = "http://food2fork.com/api/search?"
args = {'key' : 'd8b2df92a9cb994f2009b2be8410c1a3', 'q' : 'duck', 'sort' : 'r'}
data = urllib.parse.urlencode(args)
req = urllib.request.Request(url+data, headers={'User-Agent': 'Mozilla/5.0'})
response = urllib.request.urlopen(req).read()

In [102]:
import json

In [103]:
response = json.loads(response)
recipes = response['recipes']

In [104]:
from bs4 import BeautifulSoup
import requests

In [105]:
def getReviews(recipes, i=0):

    recipe = recipes[i]
    url = "http://food2fork.com/api/get?"
    data = urllib.parse.urlencode({'key' : 'd8b2df92a9cb994f2009b2be8410c1a3', 'rId' : recipe['recipe_id']})
    req = urllib.request.Request(url+data, headers={'User-Agent': 'Mozilla/5.0'})
    recipe = json.loads(urllib.request.urlopen(req).read())['recipe']
    bs =  BeautifulSoup(requests.get(recipe['source_url']).content, 'html.parser')
    reviews = []

    if (recipe['publisher'] == '101 Cookbooks'): #1
        reviews = bs.find_all(attrs={'class': 'card-body'})

    elif (recipe['publisher'] == 'BBC Good Food'):
        reviews = bs.find_all(attrs={'class': 'field-item even'})

    elif (recipe['publisher'] == 'Closet Cooking' 
           or recipe['publisher'] == 'Eats Well With Others' 
           or recipe['publisher'] == "Lisa's Kitchen"
           or recipe['publisher'] == 'A Spicy Perspective'
           or recipe['publisher'] == 'Naturally Ella'
           or recipe['publisher'] == 'Pastry Affair'):
        reviews = bs.find_all(attrs={'class': 'comment-body'})

    elif (recipe['publisher'] == 'Food Republic'):
        reviews = bs.find_all(attrs={'class': '_5mdd'}) #uses facebook comments plugin

    elif (recipe['publisher'] == 'PBS Food'):
        reviews = bs.find_all(attrs={'class': 'post-message'})

    elif (recipe['publisher'] == 'Simply Recipes'
          or recipe['publisher'] == 'Homesick Texan'
          or recipe['publisher'] == 'Tasty Kitchen'):
        reviews = bs.find_all(attrs={'class': 'comment-text'})

    elif (recipe['publisher'] == 'Two Peas and Their Pod'
          or recipe['publisher'] == 'Cookie and Kate'
          or recipe['publisher'] == "Elana's Pantry"
          or recipe['publisher'] == 'My Baking Addiction'
          or recipe['publisher'] == 'Smitten Kitchen'
          or recipe['publisher'] == 'Vintage Mixer'
          or recipe['publisher'] == 'Cookin Canuck'
          or recipe['publisher'] == 'Healthy Delicious'
          or recipe['publisher'] == 'Steamy Kitchen'
          or recipe['publisher'] == "What's Gaby Cooking"
          or recipe['publisher'] == 'Bunky Cooks'
          or recipe['publisher'] == 'Serious Eats'):
        reviews = bs.find_all(attrs={'class': 'comment-content'})

    elif (recipe['publisher'] == 'All Recipes'):
        reviews = bs.find_all(attrs={'class' : 'review-detail__link'})
        revList = []
        for review in reviews:
            revList.append(str(review).split(' ')[2].replace('href=\"', '').replace('\"', ''))
        reviews = []
        for url in revList:
            bs =  BeautifulSoup(requests.get(url).content, 'html.parser')
            reviews.append(bs.find(itemprop="reviewBody"))

    elif (recipe['publisher'] == 'Big Girls Small Kitchen'
          or recipe['publisher'] =='Jamie Oliver'
          or recipe['publisher'] == 'The Pioneer Woman'):
        reviews = bs.find_all(attrs={'data-role': 'message'})

    elif (recipe['publisher'] == 'Framed Cooks'
          or recipe['publisher'] == 'Picky Palate'):
        reviews = bs.find_all(attrs={'class': 'comment even thread-even depth-1'})

    elif (recipe['publisher'] == 'Bon Appetit'):
        reviews = bs.find_all(attrs={'class': 'review-body'})

    elif (recipe['publisher'] == 'Epicurious'):
        reviews = bs.find_all(attrs={'class': 'review-text'})

    elif (recipe['publisher'] == 'Cookstr'):
        reviews = bs.find_all(attrs={'class': 'commentText'})

    elif (recipe['publisher'] == 'Panini Happy'):
        #careful, sloppy html
        reviews = bs.find_all(attrs={'class': 'format_text'})

    elif (recipe['publisher'] == 'Real Simple'):
        reviews = bs.find_all(attrs={'class': 'comment_txt'})
        
    elif (recipe['publisher'] == 'Chow'):
        reviews = bs.find_all(itemprop="comment")

    elif (recipe['publisher'] == 'Delishhh'):
        reviews = bs.find_all(attrs={'class': 'commentmeta'})

    elif (recipe['publisher'] == 'Food Network'):
        reviews = bs.find_all(attrs={'class': 'gig-comment-body'})    
    else:
        print(recipe['publisher'])
        return getReviews(recipes, i+1)
        
    return reviews, recipe

To keep bad data from being processed, we do a little bit of cleaning

In [106]:
reviews, recipe = getReviews(recipes)    
# if type(reviews[0]) is not str:
revs = [rv.get_text().replace('\n', ' ').replace('\r', '') for rv in reviews]
# else:
#     revs = [rv.replace('\n', ' ').replace('\r', '') for rv in reviews]

In [107]:
print(reviews[0])
print(revs[0])

IndexError: list index out of range

## Checking for foods

In [None]:
from nltk.corpus import wordnet as wn
food = wn.synset('food.n.02')

In [None]:
recipe['publisher']

In [None]:
recipe

In [None]:
import nltk
from nltk import word_tokenize, pos_tag
text = word_tokenize(reviews[0].get_text())
tagged_text = nltk.pos_tag(text)
print(tagged_text)

In [None]:
import sqlite3
conn = sqlite3.connect('usda.sql3')
excluded_words = ['recipe', 'powder', 'ground', 'sea']
out =[]

for rv in revs:
    text = word_tokenize(reviews[0].get_text())
    tagged_text = nltk.pos_tag(text)
    for tt in tagged_text:
        if tt[1] == 'NN' and tt[0] not in excluded_words:
            search = '%' + tt[0] + '%'
            c = conn.cursor()
            c.execute('SELECT id, short_desc FROM food WHERE short_desc LIKE ?', (search,))
            values = c.fetchall()
            if len(values) != 0:
                for sentence in rv.split('.'):
                    if tt[0] in sentence and sentence not in out:
                        out.append(sentence)
#             for row in values:
#                 print(tt, row[0], row[1])
for x in out:
    print(x)

## Performing sentiment analysis

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer as sia

In [None]:
analyzer = sia()
scored = []
for sent in out:
    res = analyzer.polarity_scores(sent)
    tup = (sent, res)
    scored.append(tup)

In [None]:
def getKey(tup):
    return tup[1]['compound']

scored = list(reversed(sorted(scored, key=getKey)))

In [None]:
print(scored)
recipe

In [None]:
from IPython.display import Image
from IPython.core.display import HTML 
out = "<div><h1 style=\"text-align:center\">{title}</h1><br/><img style=\"margin:auto\" src=\"{img_url}\"><ul><h3>Ingredients</h3>{ingredients}</ul><ul><h3>Users enjoyed this recipe more with the following modifications:</h3>{suggestions}</ul><a href=\"{src}\">View full recipe here</a></div>"
ingredients = "".join("<li>"+item+"</li>" for item in recipe['ingredients'])
suggestions = "".join("<li>"+item[0]+"</li>" for item in scored)
HTML(out.format(title=recipe['title'], img_url=recipe['image_url'], ingredients=ingredients, suggestions=suggestions, src=recipe['source_url']))