In [1]:
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
import requests
import time
import datetime

Scraping the relevant information from any given NYT recipe, simply taking in a url.

Largely pretty straightforward. Only a couple of possible exceptions. For one, some recipes have things split between multiple sections: a recipe that has two separate-seeming components might list ingredients for component one and then put component two in a second div.

Another is that certain recipes, particularly drink recipes, might not have a cooking time. I put a try/except breaker in the final function to deal with this.

I ran into another couple of exceptions in the form of 'no recipe recipes' which didn't have ingredients lists or steps. What a nuisance! I'm only looking for things with a bare-minimum of usable info here, so I chose to ignore those altogether.

In [2]:
# One shared HTTP session; later cells and recipe_compiler reuse it for their requests.
session = requests.Session()

# Download one sample recipe page and parse it with the lxml parser.
# The exploratory cells that follow query this `soup` object.
req = session.get('https://cooking.nytimes.com/recipes/1013062-cuban-black-beans?action=click&region=Sam%20Sifton%27s%20Suggestions&rank=1')
soup = BeautifulSoup(req.text, 'lxml')

In [42]:
# Recipe title: text of the 'title-container' div with surrounding whitespace stripped.
soup.find('div', class_= 'title-container').get_text().strip()

'Cuban Black Beans'

In [43]:
# Cooking time: take the second <li> of the 'recipe-time-yield' list and split its
# stripped text on newlines; for this sample page that gives a [label, value] pair.
soup.find('ul', class_= 'recipe-time-yield').find_all('li')[1].get_text().strip().split('\n')

['Time', '45 minutes']

In [51]:
# Star rating: text of the span marked itemprop="ratingValue", converted to an int.
int(soup.find('span', itemprop= "ratingValue").text)

5

In [66]:
# Ingredient names: stripped text of every 'ingredient-name' span found in the
# first 'recipe-ingredients' list on the page.
items = [
    name_span.text.strip()
    for name_span in soup.find('ul', class_='recipe-ingredients')
                         .find_all('span', class_='ingredient-name')
]

items


['green peppers, stemmed and seeded',
 'garlic cloves',
 'pound dried black beans, rinsed and picked over to remove any stones',
 'smoked ham hock',
 'bay leaves',
 'teaspoons salt, or to taste',
 'cup olive oil',
 'slices thick bacon, cut into 1/2-inch pieces',
 'Spanish onion, diced',
 'jalapeño, stemmed and finely chopped',
 'teaspoon dried oregano',
 'teaspoon ground cumin',
 'teaspoon freshly ground black pepper',
 'tablespoons distilled white vinegar',
 'tablespoon turbinado or other brown sugar']

In [68]:
# Ingredient quantities: stripped text of every 'quantity' span found in the
# first 'recipe-ingredients' list on the page.
counts = [
    quantity_span.text.strip()
    for quantity_span in soup.find('ul', class_='recipe-ingredients')
                             .find_all('span', class_='quantity')
]

counts

['1 ½', '10', '1', '1', '2', '5', '¼', '4', '1', '1', '1', '½', '½', '3', '1']

In [71]:
# Pair each ingredient name with the quantity at the same position,
# producing "<quantity> <name>" strings.
combined = [counts[idx] + ' ' + name for idx, name in enumerate(items)]
combined

['1 ½ green peppers, stemmed and seeded',
 '10 garlic cloves',
 '1 pound dried black beans, rinsed and picked over to remove any stones',
 '1 smoked ham hock',
 '2 bay leaves',
 '5 teaspoons salt, or to taste',
 '¼ cup olive oil',
 '4 slices thick bacon, cut into 1/2-inch pieces',
 '1 Spanish onion, diced',
 '1 jalapeño, stemmed and finely chopped',
 '1 teaspoon dried oregano',
 '½ teaspoon ground cumin',
 '½ teaspoon freshly ground black pepper',
 '3 tablespoons distilled white vinegar',
 '1 tablespoon turbinado or other brown sugar']

In [84]:
# Cooking steps: text of every <li> inside the 'recipe-steps' ordered list.
steps = [
    step_li.text
    for step_li in soup.find('ol', class_='recipe-steps').find_all('li')
]

steps

['Cut 1 green pepper into 1-inch squares. Smash and peel 4 of the garlic cloves. Put the green pepper and garlic into a large pot with the beans, ham hock, bay leaves and 1 tablespoon salt. Add 2 quarts water and bring to a boil. Cover the pot and simmer until the beans are tender, an hour or more.',
 'Meanwhile, make a sofrito. Cut the remaining ½ green pepper into ¼-inch dice. Peel and finely chop the remaining garlic. Heat the olive oil in a very large skillet over medium-high heat. Add the bacon and cook, stirring occasionally, until it starts to brown, about 5 minutes. Add the green pepper and onion and cook, stirring, until slightly softened, about 3 minutes. Add the garlic, jalapeño (leave out the seeds if you don’t want it too spicy), oregano, cumin, black pepper and 2 teaspoons salt and stir for another minute. Pour in the vinegar and scrape any browned bits from bottom of pan with a wooden spoon. This is your sofrito.',
 'When the beans are cooked, discard the bay leaf. Remov

In [3]:
def recipe_compiler(url, timeout=30):
    """Scrape one NYT Cooking recipe page into a dictionary.

    Parameters
    ----------
    url : str
        Address of the recipe page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        single stalled request cannot hang a long crawl.

    Returns
    -------
    dict
        Keys ``recipe`` (title), ``cooking_time`` (``'N/A'`` when it cannot be
        read from the page), ``item_list`` ("<quantity> <ingredient>" strings
        gathered from every ingredient section), ``instructions`` (list of
        step texts) and ``url`` (the address that was passed in).

    Raises
    ------
    requests.RequestException
        If the download fails, times out, or returns an HTTP error status.
    AttributeError
        If the page has no title container or no list of steps.
    IndexError
        If fewer quantities than ingredient names are found.
    """
    req = session.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    req.raise_for_status()
    soup = BeautifulSoup(req.text, 'lxml')

    title = soup.find('div', class_='title-container').get_text().strip()

    # Some recipes (drinks in particular) list no cooking time. Only the lookup
    # failures that situation produces are caught, rather than every exception.
    # The local is called cooking_time so the imported `time` module is not shadowed.
    try:
        time_entry = soup.find('ul', class_='recipe-time-yield').find_all('li')[1]
        cooking_time = time_entry.get_text().strip().split('\n')[1]
    except (AttributeError, IndexError):
        cooking_time = 'N/A'

    # A recipe may spread its ingredients over several lists, so walk all of
    # them, collecting names and quantities in a single pass.
    items = []
    counts = []
    for group in soup.find_all('ul', class_='recipe-ingredients'):
        items.extend(
            span.text.strip()
            for span in group.find_all('span', class_='ingredient-name')
        )
        counts.extend(
            span.text.strip()
            for span in group.find_all('span', class_='quantity')
        )

    # Quantities and names are matched purely by position.
    combined = [counts[n] + ' ' + items[n] for n in range(len(items))]

    steps = [
        step.text
        for step in soup.find('ol', class_='recipe-steps').find_all('li')
    ]

    return {'recipe': title, 'cooking_time': cooking_time, 'item_list': combined,
            'instructions': steps, 'url': url}

In [110]:
# Run the scraper on the same sample recipe that the exploratory cells used.
url = 'https://cooking.nytimes.com/recipes/1013062-cuban-black-beans?action=click&region=Sam%20Sifton%27s%20Suggestions&rank=1'
recipe_compiler(url)

{'recipe': 'Cuban Black Beans',
 'cooking_time': '45 minutes',
 'item_list': ['1 ½ green peppers, stemmed and seeded',
  '10 garlic cloves',
  '1 pound dried black beans, rinsed and picked over to remove any stones',
  '1 smoked ham hock',
  '2 bay leaves',
  '5 teaspoons salt, or to taste',
  '¼ cup olive oil',
  '4 slices thick bacon, cut into 1/2-inch pieces',
  '1 Spanish onion, diced',
  '1 jalapeño, stemmed and finely chopped',
  '1 teaspoon dried oregano',
  '½ teaspoon ground cumin',
  '½ teaspoon freshly ground black pepper',
  '3 tablespoons distilled white vinegar',
  '1 tablespoon turbinado or other brown sugar'],
 'instructions': ['Cut 1 green pepper into 1-inch squares. Smash and peel 4 of the garlic cloves. Put the green pepper and garlic into a large pot with the beans, ham hock, bay leaves and 1 tablespoon salt. Add 2 quarts water and bring to a boil. Cover the pot and simmer until the beans are tender, an hour or more.',
  'Meanwhile, make a sofrito. Cut the remaining

In [121]:
# Try a second recipe and wrap the returned dict in a one-row DataFrame.
url = 'https://cooking.nytimes.com/recipes/1020851-bittersweet-brownie-shortbread?action=click&module=Global%20Search%20Recipe%20Card&pgType=search&rank=74'
pd.DataFrame([recipe_compiler(url)])

Unnamed: 0,cooking_time,instructions,item_list,recipe,url
0,"1 1/2 hours, plus cooling",[Heat oven to 350 degrees. Grease a 9-by-13-in...,[1 ½ cups/340 grams cold unsalted butter (3 st...,Bittersweet Brownie Shortbread,https://cooking.nytimes.com/recipes/1020851-bi...


Now to compile a list of urls. I do believe that this will be reasonably easy. An empty search returns results, and it seems like the urls for page 2 onward of the results follow a simple pattern, but we'll see.



In [95]:
# Fetch and parse the first page of results for an empty search query.
url = 'https://cooking.nytimes.com/search?q=&page=1'
req = session.get(url)
soup = BeautifulSoup(req.text, 'lxml')


In [100]:
# The first <article class="card recipe-card"> on the page; its data-url
# attribute holds the recipe's site-relative path.
soup.find('article',class_='card recipe-card')['data-url']

'/recipes/1020933-mushroom-farro-soup-with-parmesan-broth'

In [103]:
# data-url is site-relative, so prefix it with https://cooking.nytimes.com
# to build the complete url of every recipe card on this results page.
recipe_urls = [
    'https://cooking.nytimes.com' + card['data-url']
    for card in soup.find_all('article', class_='card recipe-card')
]
recipe_urls

['https://cooking.nytimes.com/recipes/1020933-mushroom-farro-soup-with-parmesan-broth',
 'https://cooking.nytimes.com/recipes/1019943-easiest-lentil-soup',
 'https://cooking.nytimes.com/recipes/1019241-beans-and-garlic-toast-in-broth',
 'https://cooking.nytimes.com/recipes/1020934-parmesan-broth',
 'https://cooking.nytimes.com/recipes/1020928-potato-gratin-with-swiss-chard-and-sumac-onions',
 'https://cooking.nytimes.com/recipes/1020935-braised-fennel-with-white-bean-puree',
 'https://cooking.nytimes.com/recipes/1020905-cold-fashioned',
 'https://cooking.nytimes.com/recipes/1020936-nomad-espresso-martini',
 'https://cooking.nytimes.com/recipes/1020939-japanese-style-tuna-noodle-salad',
 'https://cooking.nytimes.com/recipes/1020907-toor-dal-split-yellow-pigeon-peas',
 'https://cooking.nytimes.com/recipes/1020914-meen-gassi-fish-curry',
 'https://cooking.nytimes.com/recipes/1020919-spatchcocked-chicken-with-herb-butter',
 'https://cooking.nytimes.com/recipes/1020921-scalloped-potato-grat

In [104]:
# Number of recipe links found on a single results page.
len(recipe_urls)

48

There are, strangely, 48 results per search page. There are in total more than 20,000 results overall, so we can cover all of them in around 420 pages. I'll just try going through 400 since that's a nice even number of pages.

Hopefully the results order remains reasonably stable as I go through. After I compile the master list of urls, I'll have to go through it and weed out any duplicates.

In [6]:
# Collect the recipe urls from every search-results page.
master_recipe_urls = []
# range(1, 400) requests result pages 1 through 399.
for n in range(1,400):
    url = f'https://cooking.nytimes.com/search?q=&page={n}'
    req = session.get(url)
    soup = BeautifulSoup(req.text, 'lxml')

    # data-url is site-relative, so prefix the site root to get a complete url.
    for item in soup.find_all('article',class_='card recipe-card'):
        master_recipe_urls.append('https://cooking.nytimes.com'+item['data-url'])

    # Sleep for a second between pages in case rapid requests trip something
    # on the NYT's end.
    time.sleep(1)



In [7]:
# Total number of recipe urls collected, duplicates included.
len(master_recipe_urls)

19152

In [108]:
# Number of distinct urls; compare with len(master_recipe_urls) to see how many duplicates were collected.
len(set(master_recipe_urls))

19150

At this point, I threw everything into a .py file to run at the command line.

I hate web scraping and digging through html, but the NYT cooking site made it surprisingly easy for me. Thank you NYT! Was that done on purpose, was the site designed to make it easy to collect urls and automatically collect recipe information? I don't know, but I appreciate it nonetheless.

In [None]:
# Scrape every collected url. Rows are gathered in a plain list and turned into
# a DataFrame once at the end; concatenating inside the loop would re-copy the
# whole frame on every iteration.
recipe_records = []
failed_urls = []
for url in master_recipe_urls:
    # 'No recipe recipes' with no ingredients or steps make recipe_compiler
    # raise. Skip them, but remember which urls failed rather than discarding
    # them silently. Catching Exception (not a bare except) keeps Ctrl-C able
    # to stop the crawl.
    try:
        recipe_records.append(recipe_compiler(url))
    except Exception:
        failed_urls.append(url)

# One row per successfully scraped recipe, with a unique 0..n-1 index.
all_recipes = pd.DataFrame(recipe_records)