# Script to scrape additional details from each individual plant page on the RHS 'Find a Plant' website

In [350]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from datetime import date
import json
import pandas as pd
from collections import defaultdict
import re

In [351]:
# Browser configuration shared by every driver instance created in this script.
options = Options()
# Pass the headless flag as a plain Chrome argument: this works across
# Selenium versions, whereas the `Options.headless` property was deprecated
# and later removed in Selenium 4.
options.add_argument('--headless')
# Location of the chromedriver binary, relative to the working directory.
DRIVER_PATH = './chromedriver_win32/chromedriver.exe'

In [363]:
def main():
    """Scrape extra details for every plant in the input list and save them.

    Reads 'plants_short.txt' (JSON with a top-level 'data' list whose
    entries each carry a 'detail_page' URL path), opens every detail page
    on www.rhs.org.uk in Chrome, merges the fields returned by
    get_plant_details() into the entry, and writes all entries to
    'testaddcols.txt' as table-oriented JSON.

    The browser is always shut down, even when a page fails to load or
    parse; in that case the exception propagates and no output is written.
    """
    input_path = 'plants_short.txt'
    output_path = 'testaddcols.txt'
    optin_xpath = '//button[@class="button button--ghost button--small button--w-100 button--w-auto-sm u-m-y-0"]/span[@class="button__text"][text()="Try the new version"]'
    popup_close_xpath = '//span[@id="popupCloseTH"]'

    # Load the input before launching the browser so that a missing or
    # malformed file cannot leave a Chrome process behind.
    with open(input_path, 'r', encoding='utf-8') as f:
        plantlist = json.load(f)
    plants = plantlist['data']
    n = len(plants)
    updatedplantlist = []

    driver = webdriver.Chrome(executable_path=DRIVER_PATH, options=options)
    try:
        for i, d in enumerate(plants, start=1):
            driver.get('https://www.rhs.org.uk' + d['detail_page'])

            # Handle modal pop-up and beta sign-in: when redirected to the
            # opt-in page, dismiss the pop-up (if shown) and then opt in.
            if 'beta-optin' in driver.current_url:
                print('Opt-in')
                optin_button = WebDriverWait(driver, 20).until(
                    EC.element_to_be_clickable((By.XPATH, optin_xpath))
                )
                if check_exists_by_xpath(driver, popup_close_xpath):
                    close_button = driver.find_element(By.XPATH, popup_close_xpath)
                    if close_button.is_displayed():
                        close_button.click()
                optin_button.click()

            print(f'Processing plant {i} of {n}. URL of webpage is: {driver.current_url}')
            # Fixed pause before reading the page source -- presumably to let
            # the dynamically rendered content finish loading.
            time.sleep(3)
            soup = BeautifulSoup(driver.page_source, "html.parser")
            d.update(get_plant_details(soup))
            updatedplantlist.append(d)
    finally:
        # Release the browser whether or not scraping succeeded.
        driver.quit()

    dfplants = pd.DataFrame(updatedplantlist)
    dfplants.to_json(path_or_buf=output_path, orient='table', index=False)
    

In [364]:
def check_exists_by_xpath(driver, xpath):
    """Report whether the current page contains an element matching *xpath*.

    Args:
        driver: Selenium WebDriver whose current page is searched.
        xpath: XPath expression identifying the element.

    Returns:
        True if the lookup finds an element, False if Selenium raises
        NoSuchElementException. Any other exception propagates.
    """
    try:
        # Use the generic locator API (already used elsewhere in this file
        # via By.XPATH); the find_element_by_* helpers were removed in
        # Selenium 4.3.
        driver.find_element(By.XPATH, xpath)
    except NoSuchElementException:
        return False
    return True

In [365]:
def find_size(soup, string):
    """Return the value shown next to the label *string* on a plant page.

    Looks up the text node that equals *string*, takes the text of that
    node's grandparent element, and returns whatever follows the first
    occurrence of the label (up to the next occurrence, if any) with
    surrounding whitespace removed.

    Args:
        soup: Parsed page (or any object offering a compatible ``find``).
        string: Exact label text to search for, e.g. "Ultimate height".

    Returns:
        The stripped value text, or '' when the label is not on the page.
    """
    label_node = soup.find(string=string)
    if label_node is None:
        return ''

    # The grandparent's text holds both the label and its value.
    container_text = label_node.parent.parent.text
    return container_text.split(string)[1].strip()

In [366]:
def get_plant_details(soup):
    """Collect the additional attributes of one plant from its parsed page.

    Args:
        soup: Parsed detail page of a single plant.

    Returns:
        dict with, in this order: 'details_query_date' (today's date as
        DD-Mon-YYYY), the three size fields from find_size(), three 0/1
        sunlight flags, 'rhs_hardiness_rating' ('' if absent), 'foliage'
        ('' if absent), three 0/1 moisture flags and three 0/1 acidity flags.
    """
    details = {"details_query_date": date.today().strftime("%d-%b-%Y")}

    # Size fields: output key -> label text on the page.
    for field, label in (
        ("ultimate_height", "Ultimate height"),
        ("time_to_ultimate_height", "Time to ultimate height"),
        ("ultimate_spread", "Ultimate spread"),
    ):
        details[field] = find_size(soup=soup, string=label)

    # Sunlight flags: 1 when the exact label text occurs anywhere on the page.
    for field, label in (
        ("sunlight_full_sun", "Full sun"),
        ("sunlight_full_shade", "Full shade"),
        ("sunlight_partial_shade", "Partial shade"),
    ):
        details[field] = 0 if soup.find(string=label) is None else 1

    # Hardiness rating: a <span> whose whole text is "H" plus one digit.
    rating_span = soup.find("span", string=re.compile("^H[0-9]$"))
    details["rhs_hardiness_rating"] = '' if rating_span is None else rating_span.string

    # Foliage: first child of the second child of the <dt>'s parent.
    foliage_dt = soup.find("dt", string="Foliage")
    if foliage_dt is None:
        details["foliage"] = ''
    else:
        details["foliage"] = foliage_dt.parent.contents[1].contents[0].text

    candidate_spans = soup.find_all("span", {"class": "ng-star-inserted"})

    # Letters-only span text -> flag to raise. Moisture flags come first,
    # then acidity, which fixes the key order of the returned dict.
    flag_for_label = {
        "Moistbutwelldrained": "moisture_moist_but_well_drained",
        "Welldrained": "moisture_well_drained",
        "Poorlydrained": "moisture_poorly_drained",
        "Acid": "acidity_acid",
        "Neutral": "acidity_neutral",
        "Alkaline": "acidity_alkaline",
    }
    for field in flag_for_label.values():
        details[field] = 0

    for span in candidate_spans:
        span_text = span.string
        if span_text is None:
            continue
        # Drop everything but letters so spacing/punctuation cannot matter.
        field = flag_for_label.get(re.sub("[^a-zA-Z]+", "", span_text))
        if field is not None:
            details[field] = 1

    return details

In [367]:
# Run the full scrape; main() prints one progress line per plant processed.
main()

Opt-in
Processing plant 1 of 4. URL of webpage is: https://www.rhs.org.uk/plants/96782/camellia-sasanqua-jean-may-/details-beta
Processing plant 2 of 4. URL of webpage is: https://www.rhs.org.uk/plants/87195/camellia-times-williamsii-les-jury-/details-beta
Processing plant 3 of 4. URL of webpage is: https://www.rhs.org.uk/plants/97484/camellia-times-williamsii-clarrie-fawcett-/details-beta
Processing plant 4 of 4. URL of webpage is: https://www.rhs.org.uk/plants/89149/camellia-sasanqua-hugh-evans-/details-beta
