## PART 1

#### *Extracting all >35.000 recipe URLs from Epicurious*

### Creating Base URL List

In [1]:
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen
import pickle

In [2]:
# Search-result pages are numbered 1 through 1989; build one URL per page number.
base_url = "https://www.epicurious.com/search?content=recipe&page="
urls = [base_url + str(page_no) for page_no in range(1, 1990)]

# Display the tail of the list as a quick sanity check.
urls[-5:]

['https://www.epicurious.com/search?content=recipe&page=1985',
 'https://www.epicurious.com/search?content=recipe&page=1986',
 'https://www.epicurious.com/search?content=recipe&page=1987',
 'https://www.epicurious.com/search?content=recipe&page=1988',
 'https://www.epicurious.com/search?content=recipe&page=1989']

In [3]:
# Persist the list of search-page URLs to disk as a pickle.
with open('pickle/recipe_base_urls.pkl', 'wb') as pkl_file:
    pickle.dump(urls, pkl_file)

### Creating Core URL List

In [4]:
# Fetch a single search-result page and parse it, to explore the markup interactively.
page_string = "https://www.epicurious.com/search?content=recipe&page=1985"
# The context manager closes the HTTP response once BeautifulSoup has consumed it.
with urlopen(page_string) as page:
    soup = bs(page, 'html.parser')

In [5]:
# Collect every <a> element carrying the "view-complete-item" class, then preview the first four.
recipe_links = soup.find_all('a', attrs={'class': 'view-complete-item'})
recipe_links[:4]

[<a class="view-complete-item" data-reactid="90" href="/recipes/food/views/shaved-fennel-and-apple-salad-104117" itemprop="url" title="Shaved Fennel and Apple Salad"><!-- react-text: 91 -->View “<!-- /react-text --><!-- react-text: 92 -->Shaved Fennel and Apple Salad<!-- /react-text --><!-- react-text: 93 -->”<!-- /react-text --></a>,
 <a class="view-complete-item" data-reactid="95" href="/recipes/food/views/shaved-fennel-and-apple-salad-104117">View Recipe</a>,
 <a class="view-complete-item" data-reactid="123" href="/recipes/food/views/cranberry-orange-drop-cookies-107524" itemprop="url" title="Cranberry-Orange Drop Cookies"><!-- react-text: 124 -->View “<!-- /react-text --><!-- react-text: 125 -->Cranberry-Orange Drop Cookies<!-- /react-text --><!-- react-text: 126 -->”<!-- /react-text --></a>,
 <a class="view-complete-item" data-reactid="128" href="/recipes/food/views/cranberry-orange-drop-cookies-107524">View Recipe</a>]

In [6]:
# Print the href of every second anchor (indices 0, 2, 4, ...).
for anchor in recipe_links[::2]:
    print(anchor["href"])

/recipes/food/views/shaved-fennel-and-apple-salad-104117
/recipes/food/views/cranberry-orange-drop-cookies-107524
/recipes/food/views/whole-fish-baked-in-salt-104118
/recipes/food/views/upside-down-caramelized-apricot-tart-105058
/recipes/food/views/imperial-peach-sundaes-107892
/recipes/food/views/black-olive-clafoutis-106771
/recipes/food/views/herbed-lima-bean-hummus-103043
/recipes/food/views/saffron-orzo-with-asparagus-and-prosciutto-107938
/recipes/food/views/roasted-salted-pumpkin-seeds-103701
/recipes/food/views/festive-tuna-salad-106704
/recipes/food/views/xiao-jianmings-spareribs-with-chiles-107983
/recipes/food/views/fettucine-with-smoked-salmon-and-asparagus-103187
/recipes/food/views/mushroom-salad-with-endive-and-roquefort-cheese-105070
/recipes/food/views/gratin-of-endive-and-ham-105062
/recipes/food/views/broiled-sea-trout-with-basil-sauce-103815
/recipes/food/views/black-sesame-rice-105033
/recipes/food/views/watercress-radish-and-endive-salad-with-mustard-seed-vinaigr

### Putting all together

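Note the multiprocessing constraint: worker processes cannot append to an ordinary shared Python list, so the results are collected in a `multiprocessing.Manager` list instead. See https://stackoverflow.com/questions/49418926/append-to-the-same-list-with-multiprocessing-python.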

In [None]:
import multiprocessing as mp

# Reload the pickled search-page URLs; the `with` block closes the file handle afterwards.
# NOTE(review): pickle.load can execute arbitrary code -- only load pickle files you created yourself.
with open("pickle/recipe_base_urls.pkl", "rb") as base_urls_file:
    base_urls = pickle.load(base_urls_file)
# Site root that is prepended to the hrefs extracted from the search pages.
base = "https://www.epicurious.com"

#Function
def crawl_pages(p, end_urls):
    """Fetch one search-result page and append the recipe URLs found on it to ``end_urls``.

    Args:
        p: URL of the search-result page to load.
        end_urls: Collector with an ``append`` method; each href found is prefixed with the
            module-level ``base`` and appended to it.
    """
    #load page
    print('Loading page: ', p)
    # Close the HTTP response as soon as the page has been parsed.
    with urlopen(p) as page:
        soup = bs(page, 'html.parser')

    #extract links
    recipe_links = soup.find_all('a', {'class': 'view-complete-item'})
    # Only every second anchor is kept -- presumably each recipe is linked by two consecutive
    # anchors; verify against the page markup.
    for link in recipe_links[::2]:
        end_urls.append(base+link["href"])

if __name__ == '__main__':

    pool = mp.Pool(processes=2)

    # A Manager list is a proxy object that the worker processes can append to.
    manager = mp.Manager()
    end_urls = manager.list()
    # Keep every AsyncResult next to its page URL so worker failures can be reported below.
    pending = [(page_url, pool.apply_async(crawl_pages, args=[page_url, end_urls]))
               for page_url in base_urls]

    pool.close()
    pool.join()

    # An exception raised inside a worker is only re-raised by AsyncResult.get(); report each
    # failed page instead of dropping it silently, and keep going so the other results are saved.
    for page_url, result in pending:
        try:
            result.get()
        except Exception as e:
            print('Failed page: ', page_url, e)

    with open('pickle/recipe_urls.pkl', 'wb') as f:
        pickle.dump(list(end_urls), f)

In [65]:
# Reload the crawled recipe URLs; the `with` block closes the file handle afterwards.
with open("pickle/recipe_urls.pkl", "rb") as recipe_urls_file:
    my_list = pickle.load(recipe_urls_file)
print(len(my_list))

# Position of one specific recipe URL in the list (list.index raises ValueError if it is absent).
my_list.index("https://www.epicurious.com/recipes/food/views/penne-with-roasted-tomatoes-chicken-and-mushrooms-107495")

35778


22665

## PART 2

#### *Crawling the URLs for data* 
Test script for the crawler, further adjusted in *01_epicurious_crawler.py*. <br>Implementation in 16 processes on a Google Cloud Ubuntu VM, runtime ~45min for all 35.776 recipes.   

In [62]:
############ IMPORTS ###############
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen
import pickle
import json
import multiprocessing as mp
from collections import defaultdict as dd
from datetime import datetime as dt


############ RECIPE CLASS ###############

class Recipe:
    """Data holder for one recipe, filled by scraping a parsed recipe page.

    Every attribute is computed once in ``__init__`` by the matching ``get_*`` method.
    ``get_title`` lets lookup errors propagate; the other single-value getters return ``None``
    when the lookup or conversion fails.
    """

    #build recipe object from page
    def __init__(self, page, url):
        """Populate all fields from ``page``.

        Args:
            page: Parsed HTML document exposing ``find`` and ``find_all`` (a BeautifulSoup
                object in this notebook).
            url: URL the page was loaded from; stored unchanged.
        """
        self.title = self.get_title(page)
        self.url = url
        self.categories = self.get_categories(page)
        self.desc = self.get_desc(page)
        self.date = self.get_date(page)
        self.image_link = self.get_image_link(page)
        self.rating = self.get_rating(page)
        self.recomm_perc = self.get_recomm_perc(page)
        self.ingredients = self.get_ingredients(page)
        self.nutrients = self.get_nutrients(page)
        self.preparation = self.get_preparation(page)
        self.preparation_note = self.get_chef_note(page)
        self.servings = self.get_servings(page)

    #getters
    def get_title(self, page):
        """Return the stripped text of the ``<h1 itemprop="name">`` element.

        Not guarded: a page without that element raises ``AttributeError``.
        """
        return page.find('h1', {'itemprop': 'name'}).text.strip()

    def get_date(self, page):
        """Return the ``content`` attribute of ``<meta itemprop="datePublished">``, or ``None``."""
        try:
            return page.find('meta', {'itemprop': 'datePublished'})['content']
        except Exception:
            return None

    def get_image_link(self, page):
        """Return the ``srcset`` attribute of the ``img`` with class ``photo loaded``, or ``None``."""
        try:
            return page.find('img', {'class': 'photo loaded'})['srcset']
        except Exception:
            return None

    def get_rating(self, page):
        """Return the part of the ``span.rating`` text before the first ``/`` as a float, or ``None``."""
        try:
            return float(page.find('span', {'class': 'rating'}).text.split('/')[0])
        except Exception:
            return None

    def get_recomm_perc(self, page):
        """Return the first ``span`` text inside ``div.prepare-again-rating`` as a float, or ``None``.

        The last character of the text is dropped before conversion (presumably a ``%`` sign --
        verify against the page markup).
        """
        try:
            return float(page.find('div', {'class': 'prepare-again-rating'}).find('span').text[:-1])
        except Exception:
            return None

    def get_desc(self, page):
        """Return the stripped text of the first ``<p>`` in ``div[itemprop=description]``, or ``None``."""
        try:
            return page.find('div', {'itemprop': 'description'}).find('p').text.strip()
        except Exception:
            return None

    def get_ingredients(self, page):
        """Return the grouped list items found under ``li.ingredient-group`` elements."""
        return self.get_grouped_data(page, 'ingredient-group')

    def get_preparation(self, page):
        """Return the grouped list items found under ``li.preparation-group`` elements."""
        return self.get_grouped_data(page, 'preparation-group')

    def get_chef_note(self, page):
        """Return the stripped text of ``div.chef-notes-content``, or ``None``."""
        try:
            return page.find('div', {'class': 'chef-notes-content'}).text.strip()
        except Exception:
            return None

    def get_servings(self,page):
        """Parse an integer from the first ``span.per-serving`` element, or return ``None``.

        Takes the third space-separated token of the text and drops its first character before
        converting to ``int``.
        """
        try:
            return int(page.find_all('span',{'class':'per-serving'})[0].text.split(" ")[2][1:])
        except Exception:
            return None

    def get_nutrients(self,page):
        """Return the nutrient mapping built by ``get_nutridata``."""
        return self.get_nutridata(page)

    def get_categories(self, page):
        """Return the category mapping built by ``get_category_data``."""
        return self.get_category_data(page)


    #helpers
    def get_grouped_data(self, page, group):
        """Collect the ``li`` texts nested under every ``<li class=group>`` element.

        Args:
            page: Parsed HTML document.
            group: CSS class of the grouping ``li`` elements, e.g. ``'ingredient-group'``.

        Returns:
            A flat list of stripped strings when exactly one group exists. Otherwise a list with
            one dict per group (empty when there is no group). Each dict maps ``group`` with
            dashes replaced by underscores to the group title, and that same key plus
            ``_content`` to the list of item texts.
        """
        groups = page.find_all('li', {'class': group})

        if len(groups) == 1:
            return [item.text.strip() for item in groups[0].find_all('li')]

        title_key = group.replace("-", "_")
        # Parenthesised so the dash replacement covers the whole key; without the parentheses only
        # the "-content" suffix was converted, yielding keys like "ingredient-group_content".
        content_key = (group + "-content").replace("-", "_")

        results = []
        for position, g in enumerate(groups, start=1):
            heading = g.find('strong')
            if heading is not None:
                group_title = heading.text.strip(":")
            else:
                # Groups without a <strong> heading get a positional placeholder title.
                group_title = "Group {}".format(position)
            group_content = [item.text.strip() for item in g.find_all('li')]
            results.append({title_key: group_title, content_key: group_content})
        return results


    def get_nutridata(self, page):
        """Map nutrient names to the numbers found in the ``span.nutri-data`` elements.

        Returns:
            ``None`` when the page has no such element. Otherwise a dict pairing the fixed
            nutrient names, in order, with the leading number of each element's text as a float
            (``None`` for texts of at most one character). ``zip`` stops at the shorter sequence.
        """
        nutri_data = page.find_all('span', {'class':'nutri-data'})

        if len(nutri_data) == 0:
            return None

        # NOTE(review): assumes the page lists the nutrients in exactly this order -- confirm.
        nutrients = ["calories", "carbohydrates", "fat", "protein", "saturated_fat",
                     "sodium", "poly_fat","fiber",  "mono_fat", "cholesterol"]
        values = [float(n.text.split(" ")[0]) if len(n.text) > 1 else None
                  for n in nutri_data]
        return dict(zip(nutrients, values))


    def get_category_data(self, page):
        """Group the texts of the children of ``<dl class="tags">`` by a segment of their href.

        The grouping key is the element at index 1 of the child's ``href`` split on ``/``.

        Returns:
            A ``defaultdict(list)`` of key -> list of texts, or ``None`` when the ``dl`` is
            missing or any child cannot be processed.
        """
        category_dict = dd(list)

        # NOTE(review): a single child without an href (e.g. a bare text node) makes the whole
        # lookup return None -- confirm this all-or-nothing behaviour is intended.
        try:
            for tag in page.find('dl', {'class': 'tags'}):
                category_dict[tag["href"].split("/")[1]].append(tag.text)
            return category_dict
        except Exception:
            return None


############ MAIN FUNCTIONS ###############

#logs entry
def log_entry(url,e):
    """Append one timestamped error line to ``logs/crawler_log.txt``.

    Args:
        url: URL that was being processed when the error occurred.
        e: Exception (or any other object) describing the failure; rendered via ``str.format``.
    """
    with open('logs/crawler_log.txt', 'a') as logs:
        # Timestamp, URL and error are three separate format arguments; passing all of them to
        # str() raised a TypeError. The trailing newline keeps one entry per line.
        logs.write("{} - Error at URL {}: {}\n".format(str(dt.now()), url, e))

#takes a list and outputs even chunks
def chunks(l, n):
    """Split ``l`` into consecutive slices of length ``n``; the final slice may be shorter."""
    pieces = []
    for start in range(0, len(l), n):
        pieces.append(l[start:start + n])
    return pieces

#builds recipes
def build_recipes(job_id, end_urls_slice):
    """Scrape every URL in ``end_urls_slice`` and append the results to ``data/recipe_urls.json``.

    A URL whose download or parsing raises is reported through ``log_entry`` and skipped.

    Args:
        job_id: Index of the job this slice belongs to; not used inside the function.
        end_urls_slice: Iterable of recipe page URLs.
    """
    slice_recipes = []
    for url in end_urls_slice:

        print('Working on recipe: {}'.format(url))
        try:
            #process html
            # Close the HTTP response as soon as the page has been parsed.
            with urlopen(url) as page_html:
                page = bs(page_html, 'html.parser')

            #build & append object
            recipe = Recipe(page, url)
            slice_recipes.append(recipe.__dict__)

        except Exception as e:
            log_entry(url, e)

    #dumps one list for every process
    # The file is opened in append mode, so repeated calls add to the same file. Each list is
    # terminated with a newline so the file holds one JSON array per line instead of
    # back-to-back arrays ("[...][...]") that no JSON parser accepts.
    with open('data/recipe_urls.json', 'a') as f:
        json.dump(slice_recipes, f)
        f.write("\n")
    

#handles multiprocessing 
def handle_jobs(data, job_number):
    """Split ``data`` into at most ``job_number`` slices and process each in its own process.

    Each slice is handed to ``build_recipes`` in a separate ``multiprocessing.Process``; the
    function returns once all of them have finished.

    Args:
        data: Sequence of recipe URLs.
        job_number: Maximum number of worker processes; must be at least 1.

    Raises:
        ValueError: If ``job_number`` is smaller than 1.
    """
    if job_number < 1:
        raise ValueError("job_number must be at least 1, got {}".format(job_number))

    total = len(data)
    if total == 0:
        # Nothing to do; also avoids a zero chunk size below.
        return

    # Ceiling division: the chunk size is never 0 (which made range() raise when there were
    # fewer items than jobs) and the data never splits into more than job_number slices.
    chunk_size = int(-(-total // job_number))
    slices = chunks(data, chunk_size)

    jobs = [mp.Process(target=build_recipes, args=(i, s)) for i, s in enumerate(slices)]
    for j in jobs:
        j.start()
    # Wait for every worker so the caller only continues once all output has been written.
    for j in jobs:
        j.join()
    

#main
def main():
    """Load the pickled test URL list and process its first 10 entries in 2 jobs."""
    # The `with` block closes the file handle once the list is loaded.
    # NOTE(review): pickle.load can execute arbitrary code -- only load files you created yourself.
    with open("pickle/recipe_urls_test.pkl", "rb") as urls_file:
        end_urls = pickle.load(urls_file)
    handle_jobs(end_urls[:10], 2)
    

################ MAIN ###################

# Start the crawl only when this code runs as the main module, not when it is imported.
if __name__ == '__main__':
    main()

Working on recipe: https://www.epicurious.com/recipes/food/views/crispy-scallop-salad
Working on recipe: https://www.epicurious.com/recipes/food/views/creamy-one-pot-pasta-with-sausage-and-squash
Working on recipe: https://www.epicurious.com/recipes/food/views/soy-glazed-chicken-with-broccoli
Working on recipe: https://www.epicurious.com/recipes/food/views/make-ahead-spanish-frittata
Working on recipe: https://www.epicurious.com/recipes/food/views/big-batch-rice
Working on recipe: https://www.epicurious.com/recipes/food/views/instant-pot-braised-lamb-with-white-beans-and-spinach
Working on recipe: https://www.epicurious.com/recipes/food/views/golden-noodles-with-chicken
Working on recipe: https://www.epicurious.com/recipes/food/views/winter-italian-chopped-salad
Working on recipe: https://www.epicurious.com/recipes/food/views/lentil-soup-with-wheat-berries-and-kale
Working on recipe: https://www.epicurious.com/recipes/food/views/almond-butter-and-banana-pancakes
