In [9]:
import copy
import json
import pickle
import re
from collections import defaultdict
from urllib.parse import quote

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import requests_cache
from bs4 import BeautifulSoup as bs

%matplotlib inline
requests_cache.install_cache("cache5")

* Dietary concerns (special-consideration):
    * Quick and easy
    * Healthy
    * Organic
    * Vegan
    * Vegetarian
* Ingredients (ingredient):
    * Beef
    * Chicken
    * Fish
    * Seafood
    * Vegetable
* Cuisine:
    * African
    * American
    * Asian
    * Central / South American
    * European
    * British
    * Caribbean
    * Indian

    
    
    

In [1]:
# Setting up categories to scrape

base_url = 'https://www.epicurious.com'

categories = {'special-consideration': ['quick-and-easy', 'healthy', 'organic', 'vegan', 'vegetarian'], 
                'ingredient': ['beef', 'chicken', 'fish', 'seafood', 'vegetable'],
               'cuisine': ['african', 'american', 'asian', 'central-south-american', 'european', 
                           'british', 'caribbean', 'indian']
             }

more_categories = {'meal':['side', 'dinner','lunch','breakfast'], 
                   'type':['salad','sandwich'], 'ingredient':['egg','rice','mushroom'],
                  'technique':['barbecue','no-cook','fry','saute']}


#Categories into list of dictionary for ease of scraping
param_filter = list()
for k,v in more_categories.items():
    for e in v:
        param_filter.append(dict([(k,e)]))


In [162]:

def get_url(param_dict, page_start = 1, page_max = 30, storage = rec_urls):
    
    '''
    Requests and finds the url for for each recipe listed on a page (18 results per page)
    param_dict: dictionary with key, val as param name:filter term, for Epicurious.com
    page_start and page_max: starting and ending page to scrape for each category
    storage: dictionary, holds title:url 
    '''
    page = page_start
    
    while True:
        url = f'{base_url}/search/'
        base_params = {'content':'recipe', 'sort':'highestRated', 'page':page}
        base_params.update(param_dict)
        
        req = requests.get(url, params=base_params)
        if req.status_code == 404:    #in case page does not exist
            break
        
        soup = bs(req.text, 'lxml')
        
        for tag in soup.find_all('a', class_='view-complete-item', itemprop='url'):
            if tag['title'] not in storage and tag['title'] not in list(recipes.title):
                storage[tag['title']] = f"{base_url}{tag['href']}"
                print(tag['title'], base_url + tag['href'], param_dict, req.from_cache)
                
        page += 1
        if page >= page_max:
            break
        


In [194]:
'''
Scraping urls and pickling
'''

# rec_urls = {}
# for d in param_filter:
#     get_url(d, page_start=30, page_max=100, storage=more_rec_urls)

# with open('even_more_recs.p', 'wb') as file:
#     pickle.dump(more_rec_urls, file, protocol=pickle.HIGHEST_PROTOCOL)

# for d in param_filter:
#     get_url(d, page_start=30, page_max=80)

# with open('some_more_recs.p', 'wb') as file:
#     pickle.dump(rec_urls, file, protocol=pickle.HIGHEST_PROTOCOL)

In [175]:
# with open('some_more_recs.p', 'rb') as file:
#     dt = pickle.load(file)

#Converting into DataFrame then to list of dictionary for ease of looping
df_temp = pd.DataFrame(list(dt.items()), columns=['title', 'link'])
rec_to_scrape = df_temp.to_dict('records')



In [None]:
# TESTING out scraping each attribute of a recipe

# url='https://www.epicurious.com/recipes/food/views/chicken-soup-with-caramelized-ginger'
# req=requests.get(url)
# soup = bs(req.text, 'lxml')


# nutri = soup.find('div', class_='nutrition content')
# nutri

# categories = [e.text for e in soup.find_all('dt', itemprop='recipeCategory')]
# date = soup.find('meta', itemprop='datePublished')['content']
# desc = soup.find('div', itemprop='description').p.text
# directions = [step.text.strip() for step in 
#               soup.find('div', class_='instructions').find_all('li', class_='preparation-step')]
# ingr = [i.text for i in
#        soup.find('div', class_='ingredients-info').find_all('li',class_='ingredient')]
# rating = float(soup.find('span', class_='rating').text.split('/')[0]) * 5 / 4


In [177]:
def get_info(rec_dict):
    
    url = rec_dict['link']
    
    req = requests.get(url)
    soup = bs(req.text, 'lxml')
    
    nutri = soup.find('div', class_='nutrition content')
    
    try: 
        calories = float(nutri.find('span', itemprop='calories').text)
    except AttributeError:
        return None
    try:
        fat = float(nutri.find('span', itemprop='fatContent').text.split()[0])
    except AttributeError:
        fat = None
    try:
        protein = float(nutri.find('span', itemprop='proteinContent').text.split()[0])
    except AttributeError:
        protein = None
    try:
        carb = float(nutri.find('span', itemprop='carbohydrateContent').text.split()[0])
    except AttributeError:
        carb = None
    try:
        sodium = float(nutri.find('span', itemprop='sodiumContent').text.split()[0])
    except AttributeError:
        sodium = None
    try:
        categories = [cat.text for cat in soup.find_all('dt', itemprop='recipeCategory')]
    except AttributeError:
        categories = None
    try:
        date = soup.find('meta', itemprop='datePublished')['content']
    except AttributeError:
        date = None
    try:
        desc = soup.find('div', itemprop='description').p.text
    except AttributeError:
        desc = None
    try:
        directions = [step.text.strip() for step in 
                  soup.find('div', class_='instructions').find_all('li', class_='preparation-step')]
    except AttributeError:
        directions = None
    try:
        ingredients = [i.text for i in
                    soup.find('div', class_='ingredients-info').find_all('li',class_='ingredient')]
    except AttributeError:
        ingredients = None
    try:
        rating = float(soup.find('span', class_='rating').text.split('/')[0]) * 5 / 4 #Scale to out of 5 rating
    except AttributeError:
        rating = None
 
        
    return (calories, fat, protein, carb, sodium, categories, date, desc, directions, ingredients, rating)
    

In [213]:
''' 
Scraping recipe info, add new key,val pair for each attribute of a recipe by
unpacking resulting tuple from get_info function.
'''

# for rec in rec_to_scrape:
#     res = get_info(rec)
#     print(rec_to_scrape.index(rec)) 
#     if res:
#         cal, fat, pro, carb, so, cat, date, desc, di, ingr, rating = get_info(rec)

#         rec['calories'] = cal
#         rec['fat'] = fat
#         rec['protein'] = pro 
#         rec['carb'] = carb
#         rec['sodium'] = so
#         rec['categories'] = cat
#         rec['date'] = date
#         rec['desc'] = desc
#         rec['directions'] = di
#         rec['ingredients'] = ingr
#         rec['rating'] = rating
        
    


In [218]:
# Drop problematic infinite values and missing data
# new_recs_final = [e for e in rec_to_scrape if len(e) == 13 and e['calories'] != float('inf')]
# 
# df = pd.DataFrame(new_recs_final)
# with open('new_recs_final_df.pkl', 'wb') as file:
#     pickle.dump(df, file, protocol=pickle.HIGHEST_PROTOCOL)

# df.to_json('new_recs_final.json')



In [11]:
def get_carb(term):
    
    page = 1
    while True:
        params = {'content':'recipe', 'sort':'relevance', 'page':page}
        search_url = f"{base_url}/search/{term.replace(' ', '%20')}"

        req = requests.get(search_url, params=params)
        if req.status_code == 404:
            return (None, None)
        soup = bs(req.text, 'lxml')

        tag = soup.find('a', string=re.compile(term, re.I))
        
        if tag:
            link = tag['href']
            req2 = requests.get(f'{base_url}{link}')
            soup2 = bs(req2.text, 'lxml')
            date = soup2.find('meta' , itemprop='datePublished')['content']
            try:
                carb = float(soup2.find('span', itemprop='carbohydrateContent').text.split()[0])
            except AttributeError:
                carb = None
            return (date, carb)
        else:
            page += 1
        if page == 3:
            print(f'Unsuccessful:{term}', req.from_cache, f'page{page}')
            return (None, None)
        

In [26]:
# clean_recs = pd.read_json('clean_recs.json', convert_dates=False)
# cr = clean_recs.to_dict('records')

'''Chunking the recipes list into 3 parts,
    ran simultaneously in 3 different notebooks
'''
# carb1 = copy.deepcopy(cr[:6000])
# carb2 = copy.deepcopy(cr[6000:10000])
# carb3 = copy.deepcopy(cr[10000:])

In [46]:
def loop_carb(rec_list):
    for d in rec_list:
        rec = d['title'].replace("'", "")
        date, carb = get_carb(rec)
        if date == d['date']:
            d['carb'] = carb
            print(f'{rec}, {carb}, {carb1.index(d)}')


# loop_carb(carb1)
# loop_carb(carb2)
# loop_carb(carb3)


In [None]:
with open('carb1.p', 'rb') as file:
    carb1 = pickle.load(file)

with open('carb2.p', 'rb') as file:
    carb2 = pickle.load(file)

with open('carb3.p', 'rb') as file:
    carb3 = pickle.load(file)

In [69]:
# Concatenate the three loaded chunks (carb1, carb2, carb3), in that order, into `carb`.
carb = carb1 + carb2 + carb3

In [99]:
# pd.DataFrame(carb).to_json('old_recs_final.json')