# Recipe Search

The goal of this project is to build an API of recipes that can be queried by ingredient. To collect the data behind the API, I will scrape content from websites that host recipes.

In [130]:
import requests
from bs4 import BeautifulSoup
import json
import re

### AllRecipes

In [43]:
# Fetch the AllRecipes home page and pull the category URLs
# (brunch, lunch, dinner, etc) out of its embedded JSON-LD metadata.
base_url = 'https://www.allrecipes.com'
html = requests.get(base_url)
soup = BeautifulSoup(html.text, 'html.parser')

# First <script type="application/ld+json"> tag on the page, decoded from JSON.
ld_json_tag = soup.find('script', type='application/ld+json')
script_content = json.loads(ld_json_tag.text)

# The URL listing sits under the second JSON-LD entry; keep its first 15 items.
url_listing = script_content[1]['url'][0]
urls = url_listing[:15]
urls

['https://www.allrecipes.com/recipes/78/breakfast-and-brunch/',
 'https://www.allrecipes.com/recipes/17561/lunch/',
 'https://www.allrecipes.com/recipes/17562/dinner/',
 'https://www.allrecipes.com/recipes/76/appetizers-and-snacks/',
 'https://www.allrecipes.com/recipes/156/bread/',
 'https://www.allrecipes.com/recipes/79/desserts/',
 'https://www.allrecipes.com/recipes/77/drinks/',
 'https://www.allrecipes.com/recipes/77/drinks/',
 'https://www.allrecipes.com/recipes/96/salad/',
 'https://www.allrecipes.com/recipes/81/side-dish/',
 'https://www.allrecipes.com/recipes/94/soups-stews-and-chili/',
 'https://www.allrecipes.com/recipes/1642/everyday-cooking/',
 'https://www.allrecipes.com/recipes/84/healthy-recipes/',
 'https://www.allrecipes.com/recipes/85/holidays-and-events/',
 'https://www.allrecipes.com/recipes/86/world-cuisine/']

In [83]:
# going through each subcategory to get the sub-sub-categories

def get_subcategories_urls(url):
    """
    Function to get the urls of the subcategories

    Input
    -----
    url - url of a category (brunch, drinks, etc)

    Returns
    --------
    subcats_urls - a list of urls of the subcategories; an empty list when
                   the page contains no 'recipeCarousel' div

    Raises
    ------
    requests.HTTPError - if the server answers with a 4xx/5xx status
    requests.Timeout   - if the server does not respond within the timeout
    """

    # requests applies no timeout by default, so a stalled connection would
    # otherwise hang the notebook cell indefinitely.
    html = requests.get(url, timeout=30)
    # Fail loudly on error pages instead of trying to parse them.
    html.raise_for_status()
    soup = BeautifulSoup(html.text, 'html.parser')

    # find() returns None when no matching tag exists; guard against that
    # rather than crashing with AttributeError on pages without a carousel.
    carousel = soup.find('div', class_='recipeCarousel')
    if carousel is None:
        return []

    subcats = carousel.find_all('a', href=True)
    # '#' hrefs are placeholder links, not real subcategory pages.
    subcats_urls = [item['href'] for item in subcats if item['href'] != '#']

    return subcats_urls

In [86]:
# Subcategory URLs for the first entry of `urls`
# (the breakfast-and-brunch category in the recorded run).
brbr_subs = get_subcategories_urls(urls[0])

In [92]:
# Pick the first subcategory URL as the sample page to explore, and display it.
url = brbr_subs[0]
url

'https://www.allrecipes.com/recipes/151/breakfast-and-brunch/pancakes/'

In [90]:
# Download the page at `url` and parse it; `html` and `soup` are rebound to this page.
html = requests.get(url)
soup = BeautifulSoup(html.text, 'html.parser')

In [112]:
# Collect every listing container on the page, then inspect the first one.
content = soup.find_all('div', class_='category-page-list-content')
first = content[0].find_all('a')
# Print the href of every third anchor, starting with the first --
# presumably each recipe card contributes three <a> tags (TODO confirm).
for anchor in first[::3]:
    print(anchor['href'])

https://www.allrecipes.com/recipe/278180/greek-yogurt-blueberry-lemon-pancakes/
https://www.allrecipes.com/recipe/261190/matcha-pancakes/
https://www.allrecipes.com/recipe/20520/big-germans/
https://www.allrecipes.com/recipe/260554/hearty-country-hot-cakes/
https://www.allrecipes.com/recipe/45145/blue-cornmeal-pancakes/
https://www.allrecipes.com/recipe/234087/whole-wheat-pancakes-from-scratch/
https://www.allrecipes.com/recipe/282426/crispy-scallion-pancakes/
https://www.allrecipes.com/recipe/256738/low-carb-pancakes-or-crepes/
https://www.allrecipes.com/recipe/21014/good-old-fashioned-pancakes/
https://www.allrecipes.com/recipe/264561/scottish-oatcakes/
https://www.allrecipes.com/recipe/269272/sourdough-and-banana-pancakes/
https://www.allrecipes.com/recipe/255515/almond-flour-paleo-pancakes/


In [113]:
# Anchors of the second listing container; print the href of every third
# one, starting with the first.
after = content[1].find_all('a')
for anchor in after[::3]:
    print(anchor['href'])

https://www.allrecipes.com/recipe/260542/fluffy-flapjack-pancakes/
https://www.allrecipes.com/recipe/45396/easy-pancakes/
https://www.allrecipes.com/recipe/20334/banana-pancakes-i/
https://www.allrecipes.com/recipe/162760/fluffy-pancakes/
https://www.allrecipes.com/recipe/24530/buttermilk-pancakes-ii/
https://www.allrecipes.com/recipe/263880/bacon-okonomiyaki/
https://www.allrecipes.com/recipe/234702/quick-almond-flour-pancakes/
https://www.allrecipes.com/recipe/20177/todds-famous-blueberry-pancakes/
https://www.allrecipes.com/recipe/246952/authentic-swedish-pancakes/
https://www.allrecipes.com/recipe/191885/vegan-pancakes/
https://www.allrecipes.com/recipe/17036/pumpkin-pancakes/
https://www.allrecipes.com/recipe/220415/old-fashioned-pancakes/
https://www.allrecipes.com/recipe/36900/german-pancakes-ii/
https://www.allrecipes.com/recipe/216755/extra-yummy-fluffy-pancakes/
https://www.allrecipes.com/recipe/241553/buckwheat-pancakes/
https://www.allrecipes.com/recipe/23900/german-apple-p

In [120]:
# Look for the "Load More" anchor inside the second listing container.
load_more = content[1].find_all('a', class_='category-page-list-related-load-more-button')
if len(load_more) == 1:
    # go to next page: the anchor's href is the URL of the next results page
    next_page_url = load_more[0]['href']
else:
    # no (or an ambiguous number of) "Load More" links, so nothing to follow
    next_page_url = None
# Display the matched anchors for inspection.
load_more

[<a class="category-page-list-related-load-more-button manual-link-behavior" href="https://www.allrecipes.com/recipes/151/breakfast-and-brunch/pancakes/?page=2" id="category-page-list-related-load-more-button">
                     Load More
                   </a>]

In [198]:
# Prototype of the per-recipe scraper: builds one JSON-style dict from a single recipe page.
# It is not wrapped in a function yet; the resulting dicts are meant to be stored in a
# list containing all json objs.
page_url = 'https://www.allrecipes.com/recipe/278180/greek-yogurt-blueberry-lemon-pancakes/'

json_dict = dict()

html = requests.get(page_url)
soup = BeautifulSoup(html.text, 'html.parser')

# general info
json_dict['title'] = soup.find('h1', class_='headline').text
# Split the star-rating text on ':' and keep the second piece, stripped --
# assumes the text has a "label: value" shape (TODO confirm against the page).
json_dict['rating'] = re.split(':', soup.find('span', class_='review-star-text').text)[1].strip()
json_dict['author'] = soup.find('a', class_='author-name').text
json_dict['url'] = page_url
json_dict['source'] = 'allrecipes'

# time and serving info
# One entry per 'recipe-meta-item' div: key = header text lowercased and stripped, with its
# last character dropped (presumably a trailing ':' -- TODO confirm); value = stripped body text.
meta = dict()
for m in soup.find_all('div', class_='recipe-meta-item'):
    meta[m.find('div', class_='recipe-meta-item-header').text.lower().strip()[:-1]] = \
        m.find('div', class_='recipe-meta-item-body').text.strip()
json_dict['time_servings'] = meta

# nutritional info
nutrition = dict()
info = soup.find('div', class_='partial recipe-nutrition-section').find('div', class_='section-body').text.strip()
# Split the section text into lines, discard the last line, keep the first remaining
# line and break it into ';'-separated entries.
info = re.split('\n', info)[:-1][0].split(';')
# Drop the final character of the last entry (presumably trailing punctuation -- TODO confirm).
info[-1] = info[-1][:-1]
# The first whitespace-separated token of the first entry is stored as the calorie count.
nutrition['calories'] = info[0].split()[0]
# For each remaining entry: first token is the key, second token is the value;
# any further tokens are ignored.
for i in info[1:]:
    i = i.split()
    nutrition[i[0].strip()] = i[1]
json_dict['nutrition_per_serving'] = nutrition

# ingredients
# NOTE: `ingredients` is created but never filled in this cell, and nothing is added to
# json_dict here; the ingredient strings are only printed for inspection.
ingredients = dict()
for item in soup.find('fieldset', class_='ingredients-section__fieldset').find_all('span', class_='ingredients-item-name'):
    print(item.text.strip())

1 ½ cups all-purpose flour
2 tablespoons baking powder
1 tablespoon white sugar
¼ teaspoon salt
1 cup vanilla-flavored almond milk
½ cup low-fat vanilla Greek yogurt (such as Cabot®)
1 large egg, lightly beaten
1  lemon, zested and juiced
1 teaspoon almond extract
1 tablespoon unsalted butter, melted
1 cup blueberries


In [197]:
# TODO: parse each ingredient string into structured fields. This may be an annoying problem; if simple parsing doesn't work, I may use NLP.


[<span class="ingredients-item-name">
                                                 1 ½ cups all-purpose flour 
                                             </span>,
 <span class="ingredients-item-name">
                                                 2 tablespoons baking powder 
                                             </span>,
 <span class="ingredients-item-name">
                                                 1 tablespoon white sugar 
                                             </span>,
 <span class="ingredients-item-name">
                                                 ¼ teaspoon salt 
                                             </span>,
 <span class="ingredients-item-name">
                                                 1 cup vanilla-flavored almond milk 
                                             </span>,
 <span class="ingredients-item-name">
                                                 ½ cup low-fat vanilla Greek yogurt (such as Cabot®) 
                   