In [47]:
import logging 
import os.path
import pickle
import time

from bs4 import BeautifulSoup
import pandas as pd
import requests
from selenium import webdriver 
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException



In [54]:
class Recipe():
    
    def __init__(self, name, url, ingredients, servings):
        self.name = name
        self.url = url
        self.ingredients = ingredients
        
    def __str__(self):
        return(self.name)
    
    
def get_recipes(driver, link):
        next_page_exists = True
        
        print(f'Getting recipes from {link}')
        logging.info(f'Getting recipes from {link}')
        driver.get(link)
    
        # set maximum time to load the web page in seconds
        driver.implicitly_wait(10)

        while next_page_exists:
            # collect data that are within the id 'recipes-page'
            recipes_page = driver.find_element(By.ID, 'recipes-page')
            # Get all the recipe links
            recipes = recipes_page.find_elements(By.CLASS_NAME, 'c-recipe-grid__item')

            for recipe in recipes:
                name = recipe.find_element(By.TAG_NAME, 'span').text
                url = recipe.get_attribute('href')
                all_recipes_dict[name] = {'url': url, 'ingredients': []}

            try: 
                # Get "load more" button to get to the next page
                button = recipes_page.find_element(By.CLASS_NAME, 'c-button')
                
            except NoSuchElementException:
                # We reached the last page, the "load more" button does not exist
                logging.info(f'Reached the last page of {link}')
                next_page_exists = False
                continue

            else:
                driver.execute_script("arguments[0].click();", button)
                time.sleep(5) 

In [55]:
# URL of the website to scrape
links = [
    'https://ottolenghi.co.uk//pages/mains-recipes',
    'https://ottolenghi.co.uk/pages/sides-recipes',
    'https://ottolenghi.co.uk/pages/soup-recipes',
    'https://ottolenghi.co.uk/pages/salad-recipes',
    ]

pkl_file = f'../data/interim/ottolenghi_recipes.pkl'

# If the pickle file exists, we do not get the data from the website again
if os.path.isfile(pkl_file):
    logging.info('Data has already been downloaded')
    logging.info('Downloading pickle file')
    with open(pkl_file, 'rb') as f:
        all_recipes_dict = pickle.load(f)
# If not, get the data from the website
else: 
    all_recipes_dict = {}

    # Installing webdriver
    logging.info('Installing Chrome webdriver')  
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    
    # load the web page
    for link in links: 
        get_recipes(driver, link)

    driver.quit()
    with open(pkl_file, 'wb') as fp:
        pickle.dump(all_recipes_dict, fp)

Getting recipes from https://ottolenghi.co.uk/pages/salad-recipes


In [61]:
first_recipe_url = all_recipes_dict[first_recipe := list(all_recipes_dict.keys())[0]]
last_recipe_url = all_recipes_dict[first_recipe := list(all_recipes_dict.keys())[-1]]
first_recipe_url, last_recipe_url

({'url': 'https://ottolenghi.co.uk/pages/recipes/jarred-artichoke-salad-almond-tarator',
  'ingredients': []},
 {'url': 'https://ottolenghi.co.uk/pages/recipes/bittersweet-salad',
  'ingredients': []})

In [28]:
page = requests.get(first_recipe_url)
soup = BeautifulSoup(page.text, 'html.parser')

ingredients = soup.select('tr')
print(ingredients)

[<tr>
<td class="c-recipe-ingredients__measurement">3 </td>
<td> onions (540g), peeled, 1 roughly chopped and the other 2 each cut into 6 wedges</td>
</tr>, <tr>
<td class="c-recipe-ingredients__measurement">
6 </td>
<td> garlic cloves, peeled and roughly chopped</td>
</tr>, <tr>
<td class="c-recipe-ingredients__measurement">
30g </td>
<td> fresh ginger, peeled and roughly chopped</td>
</tr>, <tr>
<td class="c-recipe-ingredients__measurement">
1½ tsp </td>
<td> ground cinnamon</td>
</tr>, <tr>
<td class="c-recipe-ingredients__measurement">
2 tsp </td>
<td> ground coriander</td>
</tr>, <tr>
<td class="c-recipe-ingredients__measurement">
½ tsp </td>
<td> ground turmeric</td>
</tr>, <tr>
<td class="c-recipe-ingredients__measurement">
10g </td>
<td> dill, roughly chopped</td>
</tr>, <tr>
<td class="c-recipe-ingredients__measurement">
10g </td>
<td> parsley, roughly chopped</td>
</tr>, <tr>
<td class="c-recipe-ingredients__measurement">
45ml </td>
<td> olive oil</td>
</tr>, <tr>
<td class="