# Script to scrape RHS 'Find a Plant' data

In [1]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from datetime import date
import json
import pandas as pd

In [2]:
# Configure and start the Chrome browser session used by every later cell.
options = Options()
# Run Chrome in headless mode.
options.headless = True
# Path to the chromedriver binary, relative to the notebook's working directory.
# NOTE(review): this points at a Windows build (chromedriver_win32/*.exe) — adjust on other platforms.
DRIVER_PATH = './chromedriver_win32/chromedriver.exe'
driver = webdriver.Chrome(executable_path=DRIVER_PATH, options=options)

In [3]:
def check_exists_by_xpath(driver, xpath):
    """Report whether an element matching `xpath` can be found via `driver`.

    Args:
        driver: Selenium WebDriver (or any object exposing ``find_element``).
        xpath: XPath expression to look up.

    Returns:
        bool: False if the lookup raises NoSuchElementException, True otherwise.
    """
    try:
        # Use the generic locator API: it is available in both Selenium 3 and 4,
        # whereas the find_element_by_xpath shortcut was removed in Selenium 4.3.
        driver.find_element(By.XPATH, xpath)
    except NoSuchElementException:
        return False
    return True

In [4]:
# Query RHS 'Find a Plant'

In [5]:
def run_query(query='camellia&isAgm=true'):
    """Open the RHS 'Find a Plant' beta search results page for `query`.

    Args:
        query: Raw text appended after ``?query=`` in the search URL. It is
            inserted as-is (not URL-encoded), so it may carry additional
            parameters such as ``&isAgm=true``.

    Uses the notebook-level ``driver`` and prints the title and URL of the
    page the browser ends up on.
    """
    # Build the URL from the argument instead of a fixed search string, so the
    # parameter actually controls what is searched for.
    driver.get('https://www.rhs.org.uk/plants/search-results-beta?query=' + query)
    print('Title of webpage is: ' + str(driver.title))
    print('URL of webpage is: ' + str(driver.current_url))
    

In [6]:
# Load the default search (AGM camellias) and print the page title and URL
# the browser ended up on.
run_query()

Title of webpage is: Help, advice & tips from the RHS on all kinds of plants / RHS Gardening
URL of webpage is: https://www.rhs.org.uk/plants/beta-optin?return-url=plants%2Fsearch-results-beta&query=camellia&isAgm=true


In [7]:
# May need to opt-in to beta

In [8]:
# The site may redirect to a beta opt-in page instead of the search results;
# when it does, accept the opt-in so the browser continues to the results.
if 'beta-optin' in driver.current_url:
    # Wait up to 20 seconds for the "Try the new version" button to become clickable.
    optin_button = WebDriverWait(driver, 20).until(
        EC.element_to_be_clickable((
            By.XPATH,
            '//button[@class="button button--ghost button--small button--w-100 button--w-auto-sm u-m-y-0"]'
            '/span[@class="button__text"][text()="Try the new version"]',
        ))
    )
    # May also need to close popup. find_elements returns an empty list when
    # nothing matches, so a single lookup both tests for the popup's close
    # button and fetches it.
    close_buttons = driver.find_elements(By.XPATH, '//span[@id="popupCloseTH"]')
    if close_buttons:
        close_button = close_buttons[0]
        if close_button.is_displayed():
            close_button.click()
    # Select to opt-in
    optin_button.click()
print('Title of webpage is: ' + str(driver.title))
print('URL of webpage is: ' + str(driver.current_url))

Title of webpage is: Help, advice & tips from the RHS on all kinds of plants / RHS Gardening
URL of webpage is: https://www.rhs.org.uk/plants/search-results-beta?query=camellia&isAgm=true


In [9]:
# Simulate scrolling down to bottom of the page to display all results

In [10]:
# Pause briefly before scrolling starts.
time.sleep(2)
scroll_pause_time = 1
# Screen height as reported by the browser; used as the step for each scroll.
screen_height = driver.execute_script("return window.screen.height;")
i = 1

reached_bottom = False
while not reached_bottom:
    # Jump to the i-th multiple of the screen height.
    scroll_script = "window.scrollTo(0, {0}*{1});".format(screen_height, i)
    driver.execute_script(scroll_script)
    i += 1
    time.sleep(scroll_pause_time)
    # Re-read the document height after every scroll, since it can change
    # once the page has been scrolled.
    scroll_height = driver.execute_script("return document.body.scrollHeight;")
    # Stop once the next scroll target would lie beyond the end of the document.
    reached_bottom = screen_height * i > scroll_height

In [11]:
# Now extract the list of plants returned by the search

In [12]:
# Parse the browser's current page source with Python's built-in HTML parser
# so the search results can be navigated with BeautifulSoup.
soup = BeautifulSoup(driver.page_source, "html.parser")

In [13]:
# Locate the <ul> that holds the search results, then collect one element per
# plant. The container has its own name so that `plant_list` only ever refers
# to the list of result items, and re-running this cell always starts from `soup`.
results_container = soup.find("ul", {"class": "gl-view js-gl-view"})
if results_container is None:
    # Fail with a clear message rather than an AttributeError on None.
    raise RuntimeError(
        'Search results list (ul.gl-view.js-gl-view) not found in the page source'
    )
plant_list = results_container.find_all("app-plants-search-list-item", {"class": "gl-view__item"})

In [15]:
# Report how many result items were collected from the page.
print('The length of the list of search results is: {}'.format(len(plant_list)))

The length of the list of search results is: 99


In [286]:
# Now extract the specific items of information about each plant

In [23]:
from collections import defaultdict

In [24]:
# Today's date formatted as day-abbreviated month-year (e.g. "%d-%b-%Y").
# NOTE(review): the extraction cell that follows reassigns `today` to a plain
# date.today() before it is stored, so this formatted string is not what ends
# up in `query_date` — confirm which format is intended.
today = date.today().strftime("%d-%b-%Y")

In [25]:
# Build one record per search result, keyed by its position in plant_list.
plants = defaultdict(dict)
today = date.today()

for i, p in enumerate(plant_list):
    plant_title_elements = p.find("div", {"class": "gl-view__content__item-1"})
    image_src = p.find("img", {"class": "gl-view__image"})['src']

    # First access creates the empty per-plant dict in the defaultdict.
    record = plants[i]
    record['img_src'] = image_src
    record['botanical_name'] = plant_title_elements.find("h4", {"class": "gl-view__title u-m-b-0"}).text
    record['common_name'] = plant_title_elements.find("h4", {"class": "gl-view__title text-normal"}).text
    record['brief_desc'] = p.find("div", {"class": "gl-view__content__item-2"}).find("p").text

    detail_href = p.find("a", {"class": "u-faux-block-link__overlay"})['href']
    record['detail_page'] = detail_href
    # The id is the third '/'-separated piece of the detail link.
    record['rhs_id'] = detail_href.split('/')[2]
    record['query_date'] = today
    # 1 when the item carries an "AGM plant" icon, otherwise 0.
    record['agm_plant'] = 0 if p.find("i", {"title": "AGM plant"}) is None else 1

    # The third content block holds zero, one or two links: with one link it is
    # the supplier search; with two, the first is the rhsplants link (its span
    # text carries a £ price) and the second is the supplier search.
    supplier_search_elements = p.find("div", {"class": "gl-view__content__item-3"}).findChildren('a')
    link_count = len(supplier_search_elements)
    if link_count == 1:
        shop_link, suppliers_link = None, supplier_search_elements[0]
    elif link_count == 2:
        shop_link, suppliers_link = supplier_search_elements
    else:
        shop_link = suppliers_link = None

    if suppliers_link is None:
        record['num_suppliers'] = '0'
        record['supplier_search'] = ''
    else:
        # Leading token of the link's span text is the supplier count.
        record['num_suppliers'] = suppliers_link.find("span").text.split()[0]
        record['supplier_search'] = suppliers_link['href']

    if shop_link is None:
        record['rhsplants_url'] = ''
        record['rhsplants_price_gbp'] = ''
    else:
        record['rhsplants_url'] = shop_link['href']
        # Keep only what follows the pound sign.
        record['rhsplants_price_gbp'] = shop_link.find("span").text.split('£')[1]

In [26]:
# Turn the per-plant dicts into a DataFrame: each outer key of `plants` becomes
# a row label (orient='index'), and dtype='str' requests string-typed columns.
dfplants = pd.DataFrame.from_dict(plants, orient='index', dtype='str')

In [291]:
# Save the results to plants.json (relative to the working directory) using
# pandas' 'table' orient, leaving the row index out of the file.
dfplants.to_json(path_or_buf='plants.json',orient='table',index=False)

In [292]:
# example of how to read json back to pandas
# text_read = pd.read_json('plants.json',orient='table')
# text_read

In [293]:
# Scraping is finished: quit the WebDriver session.
driver.quit()

In [28]:
# Show the detail-page link of the row labelled 0 as a sanity check.
dfplants.loc[0, 'detail_page']

'/plants/96782/camellia-sasanqua-jean-may-/details-beta'