In [1]:
import os
from pathlib import Path
import pandas as pd
import requests
from bs4 import BeautifulSoup, SoupStrainer

from selenium.webdriver import Chrome
from selenium.webdriver.common.keys import Keys

import time

In [2]:
DATA_PATH = Path.cwd().parent / "runtime" / "data" / "train_features" / "train_features"
url = "https://www.veganrecipeclub.org.uk/recipes/"
SCROLL_PAUSE_TIME = 3

In [3]:
def parent(path_):
    return os.path.abspath(os.path.join(path_, os.pardir))

## Load all urls to recipes from the website

In [4]:
def get_html(url):  
    #get chrome driver path
    driver_path = os.path.join(parent(os.getcwd()), 'chromedriver', 'chromedriver.exe.')
    #create instance of Chrome driver
    driver = Chrome(executable_path=driver_path)
    driver.get(url)

    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait to load page
        time.sleep(SCROLL_PAUSE_TIME)

        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    #load page source
    html = driver.page_source
    #close driver
    driver.close()
    return html

In [5]:
def get_all_urls(html):
    urls = []
    for link in BeautifulSoup(html, parse_only=SoupStrainer('a')):
        if link.has_attr('href'):
            urls.append(link['href'])
    return urls

def get_recipes_urls(urls):
    #remove duplicate
    urls = list(set(urls))
    
    #remove url that are not recipes
    for url in urls:
        if(url.startswith("https://www.veganrecipeclub.org.uk/recipes/") == False):
           urls.remove(url)
    return urls

In [6]:
html = get_html(url)
all_urls = get_all_urls(html)
recipes_urls = get_recipes_urls(all_urls)

In [7]:
len(recipes_urls)

697

As of the 4th of october 2021 we have 691 recipes

## Display html of one page to see where to ge the info

In [13]:
page = requests.get(recipes_urls[1])
soup = BeautifulSoup(page.content, 'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html class="no-js" lang="en-GB">
 <head>
  <meta charset="utf-8"/>
  <title>
   Fig, Rosemary &amp; Olive Pizza (Gluten-free) - Vegan Recipe Club
  </title>
  <meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
  <meta content="yes" name="apple-mobile-web-app-capable"/>
  <meta content="black" name="apple-mobile-web-app-status-bar-style"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <meta content="This vegan-friendly recipe is ideal for your lunch-time, go-to meal: homemade, nutritious, works anywhere from at your desk, in the park, on the beach or at your table at home." name="description"/>
  <script type="text/javascript">
   function theChampLoadEvent(e){var t=window.onload;if(typeof window.onload!="function"){window.onload=e}else{window.onload=function(){t();e()}}}
  </script>
  <script type="text/javascript">
   var theChampDefaultLang = 'en_GB', theChampCloseIconPath = 'https://www.veganrecipeclub.org.uk/wp-content/plu

In [15]:
#Title
soup.find_all("h1")

[<h1 style="text-align: center; margin-top: 50px">
 	Fig, Rosemary &amp; Olive Pizza (Gluten-free)</h1>]

In [37]:
#Title
soup.find_all("main", {"role": "main"})[0].find_all("div", {"style": "width: 100%; display: block;"})[2]

<div style="width: 100%; display: block;">
<meta content="" itemprop="prepTime"/>
<p style="text-align: center; font-weight: bold;">Preparation time</p>
<p style="text-align: center;"> </p>
</div>

## Scrape every urls

In [8]:
recipes_urls[:5]

['https://www.veganrecipeclub.org.uk/recipes/greek-salad-dressing/',
 'https://www.veganrecipeclub.org.uk/recipes/fig-rosemary-olive-pizza-gluten-free/',
 'https://www.veganrecipeclub.org.uk/recipes/scones-savoury-or-sweet/',
 'https://www.veganrecipeclub.org.uk/recipes/fast-healthy-burgers/',
 'https://www.veganrecipeclub.org.uk/recipes/beetroot-quinoa-burgers/']