In [58]:
import hashlib
import os
import re
import string
import urllib
import warnings
import zlib

import lxml.html
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
# NOTE(review): this installs a blanket 'ignore' filter, so every warning raised
# later in the session is hidden -- consider narrowing it to a specific category.
warnings.filterwarnings('ignore')

In [9]:
def get_recipe_keys(page_numbers, timeout=30):
    """Collect hyphenated recipe keys from Epicurious' highest-rated search pages.

    For every page number the search results page is downloaded, the text of
    each ``<a>`` element is stripped, and the entries at positions
    1, 7, 13, ... (every sixth one, stopping before 105) are kept. Each kept
    title is lower-cased and its spaces are replaced by hyphens.

    NOTE(review): the fixed positions presumably match where recipe titles sat
    in the page layout when this was written -- confirm against the live page.

    Parameters
    ----------
    page_numbers : iterable of int
        Search result pages to scrape.
    timeout : float, optional
        Seconds to wait for each HTTP response before ``requests`` gives up.

    Returns
    -------
    list of str
        Keys from all requested pages, in page order.

    Raises
    ------
    requests.RequestException
        If a page cannot be downloaded (including timeouts).
    """
    recipe_keys = []
    for page_number in page_numbers:
        url = 'http://www.epicurious.com/search?content=recipe&page=%d&sort=highestRated' % page_number
        response = requests.get(url, timeout=timeout)
        # Name the parser explicitly so results do not depend on whichever
        # parser BeautifulSoup happens to auto-detect in the environment.
        soup = bs(response.content, "lxml")
        link_texts = [anchor.get_text().strip() for anchor in soup.find_all("a")]
        # A slice selects the same positions as indexing with
        # np.arange(1, 105, 6), but simply yields fewer titles (instead of
        # raising IndexError) when a page contains fewer links than expected.
        titles = link_texts[1:105:6]
        recipe_keys.extend("-".join(title.lower().split(" ")) for title in titles)
    return recipe_keys

In [8]:
def get_recipe_reviews(recipe, timeout=30):
    """Download the text of every review posted for one recipe.

    The recipe's review page is fetched once to read the total review count
    (element with class ``reviews-count``); the number of pages is derived
    from that count at 10 reviews per page, and each page is then requested
    in turn. From every ``<div class="review-text">`` the stripped text of
    its first ``<p>`` is collected.

    Parameters
    ----------
    recipe : str
        Recipe key as used in the Epicurious URL.
    timeout : float, optional
        Seconds to wait for each HTTP response before ``requests`` gives up.

    Returns
    -------
    list of str
        Review texts in page order.

    Raises
    ------
    AttributeError
        If the page has no ``reviews-count`` element.
    ValueError
        If the review count text is not an integer.
    requests.RequestException
        If a page cannot be downloaded (including timeouts).
    """
    base_url = 'http://www.epicurious.com/recipes/food/reviews/%s' % recipe
    response = requests.get(base_url, timeout=timeout)
    # Name the parser explicitly so results do not depend on whichever
    # parser BeautifulSoup happens to auto-detect in the environment.
    soup = bs(response.content, "lxml")
    tot_reviews = int(soup.find(class_="reviews-count").get_text())
    # One page per started block of 10 reviews, i.e. ceil(tot_reviews / 10).
    tot_review_pages = len(range(0, tot_reviews, 10))

    reviews = []
    for review_page in range(1, tot_review_pages + 1):
        url = 'http://www.epicurious.com/recipes/food/reviews/%s?page=%s' % (recipe, review_page)
        response = requests.get(url, timeout=timeout)
        soup = bs(response.content, "lxml")
        for review_block in soup.find_all("div", class_="review-text"):
            paragraph = review_block.find("p")
            # A review container without a <p> used to raise AttributeError
            # and discard every review of the recipe; skip just that one.
            if paragraph is not None:
                reviews.append(paragraph.get_text().strip())
    return reviews


In [37]:
def get_recipe_stats(recipe_name, timeout=30):
    """Scrape one Epicurious recipe page into a dictionary.

    Parameters
    ----------
    recipe_name : str
        Recipe key as used in the Epicurious URL.
    timeout : float, optional
        Seconds to wait for the recipe page response before ``requests``
        gives up.

    Returns
    -------
    dict or float
        ``np.nan`` when the recipe page request fails; otherwise a dict with:

        * ``recipe_id``     -- integer in ``[0, 10**7)`` derived from the name
        * ``name``          -- stripped text of the first ``<h1>``, or ``None``
          when the page has no ``<h1>``
        * ``cooking_stats`` -- leading integer of every ``<dd>``; the key is
          omitted when any ``<dd>`` does not start with digits
        * ``nutrition``     -- ``nutri-label`` texts mapped to ``nutri-data`` texts
        * ``ingredients``   -- texts of ``<li class="ingredient">`` elements
        * ``tags``          -- texts of the ``<dt>`` elements after the first three
        * ``reviews``       -- result of ``get_recipe_reviews``; the key is
          omitted when that call raises
    """
    recipe_dict = {}
    # The built-in hash() of a str is salted per interpreter process, so it
    # produced a different id after every kernel restart. CRC-32 of the UTF-8
    # bytes is deterministic across runs.
    recipe_dict['recipe_id'] = zlib.crc32(recipe_name.encode("utf-8")) % (10 ** 7)

    url = 'http://www.epicurious.com/recipes/food/views/%s' % recipe_name
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Only network-level failures are treated as "no data"; a bare
        # except would also swallow KeyboardInterrupt.
        return np.nan

    # Name the parser explicitly so results do not depend on whichever
    # parser BeautifulSoup happens to auto-detect in the environment.
    soup = bs(response.content, "lxml")

    title = soup.find("h1")
    recipe_dict['name'] = title.get_text().strip() if title is not None else None

    try:
        recipe_dict['cooking_stats'] = [
            int(re.findall(r"^\d+", entry.get_text())[0])
            for entry in soup.find_all("dd")
        ]
    except IndexError:
        # At least one <dd> has no leading digits: leave the key out.
        pass

    nutritional_labels = [x.get_text().strip() for x in soup.find_all(class_="nutri-label")]
    nutritional_data = [x.get_text().strip() for x in soup.find_all(class_="nutri-data")]
    recipe_dict['nutrition'] = dict(zip(nutritional_labels, nutritional_data))
    recipe_dict['ingredients'] = [x.get_text().strip() for x in soup.find_all("li", class_='ingredient')]
    recipe_dict['tags'] = [x.get_text().strip() for x in soup.find_all("dt")][3:]

    # Reviews are best-effort: any failure while fetching them leaves the key out.
    try:
        recipe_dict['reviews'] = get_recipe_reviews(recipe_name)
    except Exception:
        pass

    return recipe_dict

In [10]:
# Scrape recipe keys from search result page 1 only.
recipe_names = get_recipe_keys(page_numbers=range(1, 2))

In [38]:
# Fetch the stats for every scraped recipe key, preserving order.
recipes = list(map(get_recipe_stats, recipe_names))

In [39]:
# Display the third entry of `recipes` (index 2).
recipes[2]

{'cooking_stats': [4, 45, 45],
 'ingredients': ['8 small bone-in, skin-on chicken thighs (about 3 pounds)',
  '2 teaspoons kosher salt, divided',
  '1 teaspoon freshly ground black pepper, divided',
  '1 pound asparagus, trimmed',
  '1 pound baby new potatoes, halved if larger than 1/2"',
  '1 bunch radishes (about 1/2 pound), halved',
  '2 tablespoons plus 2 teaspoons vegetable oil, divided',
  '3 garlic cloves, finely chopped',
  '1 tablespoon anchovy paste, or 6 fillets, finely chopped',
  '1/2 cup (1 stick) cold butter, cut into 1-tablespoon pieces, divided',
  '1/2 cup dry white wine',
  '1 tablespoon fresh lemon juice',
  '1 tablespoon finely chopped parsley, plus more for serving'],
 'name': 'Crispy Chicken Thighs with Spring Vegetables',
 'nutrition': {'Calories': '1006',
  'Carbohydrates': '29 g(10%)',
  'Cholesterol': '344 mg(115%)',
  'Fat': '69 g(107%)',
  'Fiber': '6 g(24%)',
  'Monounsaturated Fat': '31 g',
  'Polyunsaturated Fat': '14 g',
  'Protein': '62 g(125%)',
  'Sa