# Epicurious - Schema

This notebook scrapes the Epicurious website to collect, for each recipe:

- Author Name
- Recipe Ingredients
- Recipe Instructions

The following information was already collected while scraping the links of Epicurious's top-rated recipes:

- Recipe Title
- Recipe Link
- Average Rating of the Recipe
- Total Number of Reviews for the Recipe

It is read from the *epicuriousTopRatedRecipeLinks.csv* file and combined with the newly scraped data in the *get_allRecipeInformation(recipeIndex)* function.

In [None]:
import re
import time
import csv
import pandas as pd
import requests
from bs4 import BeautifulSoup

#### Author Name

In [None]:
def get_authorName(doc):
    """Return the recipe author's name scraped from a parsed recipe page.

    Takes the first header title block found in ``doc``, then the third
    ``div`` found inside that block, and reads the text of its first ``a``
    element.

    Args:
        doc: Parsed page exposing ``find_all`` (a ``BeautifulSoup`` object
            in this notebook).

    Returns:
        The text of the author link, or ``''`` if any lookup step fails.
    """
    header_class = "SplitScreenContentHeaderTitleBlock-sCMXE jActCi"
    try:
        header_block = doc.find_all("div", class_=header_class)[0]
        name_div = header_block.find_all("div")[2]
        return name_div.find("a").text
    except Exception:
        # Best-effort scrape: any failed lookup is reported as "no author".
        return ''

#### Recipe Ingredients

In [None]:
def get_recipeIngredients(doc):
    """Collect the servings line and the ingredient texts of a recipe page.

    Args:
        doc: Parsed page exposing ``find_all`` (a ``BeautifulSoup`` object
            in this notebook).

    Returns:
        A list whose first item is the stripped text of the first ``p``
        inside the ingredient section, followed by the stripped text of
        every child of the ingredient list container. If a lookup fails,
        whatever was gathered up to that point is returned (an empty list
        when nothing was found).
    """
    collected = []
    try:
        main_content = doc.find_all("div", class_="recipe__main-content")[0]
        list_section = main_content.find_all(
            "div", attrs={'data-testid': 'IngredientList'}
        )[0]
        servings_text = list_section.find("p").text.strip()
        items_container = list_section.find("div", class_="List-iSNGTT ljAYJm")

        # Nothing is stored until both lookups above have succeeded.
        collected.append(servings_text)
        # Explicit loop (not a comprehension) so that items gathered before
        # a failure are still returned.
        for child in items_container:
            collected.append(child.text.strip())
    except Exception:
        # Best-effort scrape: keep whatever was collected so far.
        pass

    return collected

#### Recipe Instructions

NOTE: `instructions` must be initialised before the `try` block; otherwise `return instructions` raises an error whenever scraping fails.

In [None]:
def get_recipeInstructions(doc):
    """Collect the instruction texts of a recipe page.

    Args:
        doc: Parsed page exposing ``find_all`` (a ``BeautifulSoup`` object
            in this notebook).

    Returns:
        A list with the text of every child of each instruction ``li``
        found in the first instructions wrapper. If a lookup fails,
        whatever was gathered up to that point is returned (an empty list
        when nothing was found).
    """
    steps = []
    try:
        # The result is not used, but indexing [0] raises when the main
        # recipe container is absent, which makes the function return [].
        _ = doc.find_all("div", class_="recipe__main-content")[0]

        wrapper = doc.find_all("div", class_="InstructionsWrapper-hZXqPx RmryN")[0]
        step_groups = wrapper.find_all(
            "li", class_="InstructionListWrapper-dcpygI kinFAs"
        )

        for step_group in step_groups:
            for piece in step_group:
                steps.append(piece.text)
    except Exception:
        # Best-effort scrape: keep whatever was collected so far.
        pass
    return steps

#### Gather all the information of a recipe

In [None]:
def get_allRecipeInformation(recipeIndex):
    """Assemble one output row for the recipe at ``recipeIndex``.

    The title, link, average rating and rating count are read from the
    module-level ``epicuriousTopRatedRecipeLinks_df`` (loaded from
    *epicuriousTopRatedRecipeLinks.csv*). The ingredients, steps and author
    are scraped from the module-level ``doc``, which the scraping loop sets
    to the parsed page of the current recipe before calling this function.

    Args:
        recipeIndex: Row label of the recipe in
            ``epicuriousTopRatedRecipeLinks_df`` (used with ``.loc``).

    Returns:
        A 7-tuple ``(title, link, average rating, rating count,
        ingredients, steps, author)``.
    """
    # Must be the frame that is actually loaded from the links CSV; any
    # other name raises NameError, which the scraping loop would swallow.
    links_df = epicuriousTopRatedRecipeLinks_df

    recipeTitle = links_df.loc[recipeIndex, 'Recipe Title']
    recipeLink = links_df.loc[recipeIndex, 'Recipe Link']
    recipeAverageRating = links_df.loc[recipeIndex, 'Recipe Average Rating']
    recipeRatingCount = links_df.loc[recipeIndex, 'Recipe Rating Count']

    # `doc` is a global here: it is the page most recently parsed by the
    # scraping loop.
    recipeIngredients = get_recipeIngredients(doc)
    recipeSteps = get_recipeInstructions(doc)
    recipeAuthor = get_authorName(doc)

    return (
        recipeTitle,
        recipeLink,
        recipeAverageRating,
        recipeRatingCount,
        recipeIngredients,
        recipeSteps,
        recipeAuthor,
    )

In [None]:
# Column headers of the final dataset, in the same order as the tuple
# returned by get_allRecipeInformation.
columns = [
    'Recipe Title',
    'Recipe Link',
    'Recipe Average Rating',
    'Recipe Rating Count',
    'Ingredient (s)',
    'Step (s)',
    'Author',
]

# Previously scraped top-rated recipe links, one row per recipe.
epicuriousTopRatedRecipeLinks_df = pd.read_csv(
    './epicuriousTopRatedRecipeLinks.csv'
)

In [None]:
# Scrape every recipe page listed in the links file, saving a checkpoint CSV
# after every 25 successfully scraped recipes.
start_time = time.time()

recipeDetails = []

count = 0

for recipeIndex, recipeLink in enumerate(epicuriousTopRatedRecipeLinks_df['Recipe Link']):

    url = recipeLink

    # NOTE: no delay is applied between requests; add a time.sleep(...) here
    # if the site starts rate-limiting.

    try:
        # Fetch and parse inside the try so that one unreachable page is
        # reported and skipped instead of aborting the whole run. The
        # timeout keeps a stalled connection from hanging the loop.
        response = requests.get(url, timeout=30)
        html_content = response.content
        # `doc` stays a module-level name: get_allRecipeInformation reads it
        # as a global.
        doc = BeautifulSoup(html_content, 'html.parser')

        recipeDetails.append(get_allRecipeInformation(recipeIndex))

        count += 1
        # recipeIndex is 0-based, so the number of links processed is index + 1.
        print(f'Number of Recipes Scraped = {count}, Number of Recipe Links processed = {recipeIndex + 1}')

        # Periodic checkpoint so a crash late in the run does not lose
        # everything scraped so far.
        if count % 25 == 0:
            epicuriousTopRatedRecipes_df = pd.DataFrame(recipeDetails, columns=columns)
            epicuriousTopRatedRecipes_df.to_csv('./epicuriousTopRatedRecipes.csv', encoding='utf-8', quoting=csv.QUOTE_ALL, index = None)

    except Exception as e:
        # Best-effort run: report the failure and move on to the next link.
        print('Exception is - ', e)

end_time = time.time()
total_time = end_time - start_time

print(f"Total Time taken = {total_time} seconds")

In [None]:
# Final save: the checkpoints in the scraping loop are only written at
# multiples of 25 recipes, so write the complete result once more here.
epicuriousTopRatedRecipes_df = pd.DataFrame(data=recipeDetails, columns=columns)
epicuriousTopRatedRecipes_df.to_csv(
    './epicuriousTopRatedRecipes.csv',
    encoding='utf-8',
    quoting=csv.QUOTE_ALL,
    index=None,
)