## Pull Individual Recipes from NYT Cooking

### Import Packages and Search Result Data

In [None]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import json
from pprint import pprint
import extruct
from w3lib.html import get_base_url
from fake_useragent import UserAgent
import time, os
import pickle
import random

In [None]:
# Load the pickled search-result links and keep a sorted copy of them.
with open('all_recipes_first.pickle', 'rb') as links_file:
    recipe_links = pickle.load(links_file)

recipe_links_sorted = list(recipe_links)
recipe_links_sorted.sort()

### Function to fetch each recipe page and extract the individual fields

In [None]:
def _lookup(metadata, *path):
    """Follow ``path`` through nested metadata and return the value found.

    Returns None when any step raises TypeError, KeyError or IndexError
    (for example a missing key, or a value that cannot be subscripted).
    """
    node = metadata
    try:
        for step in path:
            node = node[step]
    except (TypeError, KeyError, IndexError):
        return None
    return node


def _count(metadata, key):
    """Return ``len(metadata[key])``, or None when it is missing or unsized."""
    try:
        return len(metadata[key])
    except (TypeError, KeyError, IndexError):
        return None


def get_recipe_dict(link):
    """Fetch one NYT Cooking recipe page and summarise its JSON-LD metadata.

    Args:
        link: Path of the recipe, appended to ``https://cooking.nytimes.com``.

    Returns:
        A dict with the keys ``name``, ``number_of_steps``,
        ``number_of_ratings``, ``rating_value``, ``author``, ``recipe_time``,
        ``number_of_ingredients``, ``number_of_servings``,
        ``recipe_categories``, ``recipe_keywords`` and ``image_link``.
        Any field that cannot be read from the metadata is None.

    Raises:
        requests.RequestException: If the request times out, fails, or the
            server answers with an HTTP error status.
        IndexError: If the page contains no JSON-LD item.
    """
    ua = UserAgent()
    user_agent = {'User-agent': ua.random}

    base_url = 'https://cooking.nytimes.com'

    # Create full url to scrape
    url = base_url + link

    # Request HTML and parse. Without a timeout a stalled connection would
    # block forever, and without the status check an error page would only
    # surface later as a confusing "list index out of range".
    response = requests.get(url, headers=user_agent, timeout=30)
    response.raise_for_status()
    html = response.text

    json_ld_items = extruct.extract(
        html,
        base_url=get_base_url(html, url),
        syntaxes=['json-ld'],
    )['json-ld']
    if not json_ld_items:
        # Same exception type as indexing the empty list, but naming the page
        # so the failing link can be identified.
        raise IndexError('no JSON-LD metadata found at ' + url)
    metadata = json_ld_items[0]

    recipe_dict = {
        'name': _lookup(metadata, 'name'),
        'number_of_steps': _count(metadata, 'recipeInstructions'),
        'number_of_ratings': _lookup(metadata, 'aggregateRating', 'ratingCount'),
        'rating_value': _lookup(metadata, 'aggregateRating', 'ratingValue'),
        'author': _lookup(metadata, 'author', 'name'),
        'recipe_time': _lookup(metadata, 'totalTime'),
        'number_of_ingredients': _count(metadata, 'recipeIngredient'),
        'number_of_servings': _lookup(metadata, 'recipeYield'),
        'recipe_categories': _lookup(metadata, 'recipeCategory'),
        'recipe_keywords': _lookup(metadata, 'keywords'),
        'image_link': _lookup(metadata, 'image'),
    }

    # Pause 5-7 seconds before returning (presumably to throttle requests
    # between consecutive calls).
    time.sleep(5 + 2 * random.random())

    return recipe_dict


### Running the function over the list of recipe URLs

In [None]:
# The site kicked me out several times and I hit a couple of errors, so this
# cell was run more than once, with the recipe_links_subset data source
# adjusted accordingly each time.

recipe_info_list = []

recipe_links_subset = list(recipe_links_sorted)
for recipe_link in recipe_links_subset:
    # Append one record at a time so results gathered before a failure are kept.
    recipe_record = get_recipe_dict(recipe_link)
    recipe_info_list.append(recipe_record)

len(recipe_info_list)

### Exporting data and checking that the export worked correctly

In [None]:
# Save the scraped recipe records to disk.
with open('html_data5.pickle', 'wb') as pickle_out:
    pickle.dump(recipe_info_list, pickle_out)

In [None]:
# Read the pickle back in and check how many records it holds.
with open('html_data5.pickle', 'rb') as pickle_in:
    test = pickle.load(pickle_in)

len(test)